trying to merge

2019-01-07 12:30:05 +01:00 · 2019-01-07 12:30:05 +01:00 · 8e9895e0ce
parent 04a0ef1ee4
commit 8e9895e0ce
2 changed files with 19 additions and 5 deletions
--- a/src/data/features.py
+++ b/src/data/features.py
@ -342,7 +342,7 @@ class FeatureExtractor:
                        self.split_policy.__name__))
            print('number of training (full) documents: {}'.format(n_original_docs))
            print('X shape (#documents,#features): {}'.format(X.shape))
-            print('y prevalence: {:.2f}%'.format(y.mean() * 100))
+            print('y prevalence: {}/{} {:.2f}%'.format(y.sum(),len(y),y.mean() * 100))
            print()

        return X, y
--- a/src/model.py
+++ b/src/model.py
@ -1,7 +1,7 @@
 from util import disable_sklearn_warnings
 from sklearn.metrics import f1_score
 from sklearn.metrics import make_scorer
-from sklearn.model_selection import GridSearchCV
+from sklearn.model_selection import GridSearchCV, LeaveOneOut
 from sklearn.linear_model import LogisticRegression
 from sklearn.svm import *
 from data.features import *
@ -13,6 +13,19 @@ class RandomVerificator:
    def predict(self,test):
        return np.random.rand()

+def f1(true_labels, predicted_labels):
+    """Binary F1 over 0/1 labels; 1.0 when no positive is present nor predicted."""
+    true_labels, predicted_labels = np.asarray(true_labels), np.asarray(predicted_labels)
+    if len(true_labels) != len(predicted_labels):  # explicit raise: asserts vanish under -O
+        raise ValueError("Format not consistent between true and predicted labels.")
+    tp = np.sum(predicted_labels[true_labels == 1])
+    fp = np.sum(predicted_labels[true_labels == 0])
+    fn = np.sum(true_labels[predicted_labels == 0])
+    den = 2.0 * tp + fp + fn
+    # den == 0 means tp = fp = fn = 0 (every instance correctly negative): score it 1.0
+    return 2.0 * tp / den if den > 0 else 1.0
+
+
 class AuthorshipVerificator:

    def __init__(self, nfolds=10,
@ -34,9 +47,10 @@ class AuthorshipVerificator:
    def fit(self,X,y):
        if not isinstance(y,np.ndarray): y=np.array(y)
        positive_examples = y.sum()
-        if positive_examples >= self.nfolds:
+        if True or positive_examples >= self.nfolds:
            print('optimizing {}'.format(self.svm.__class__.__name__))
-            self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=self.nfolds, scoring=make_scorer(f1_score), n_jobs=-1)
+            self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=LeaveOneOut(), scoring=make_scorer(f1), n_jobs=-1, verbose=10)
+            # self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=self.nfolds, scoring=make_scorer(f1), n_jobs=-1, verbose=10)
        else:
            self.estimator = self.svm

@ -47,7 +61,7 @@ class AuthorshipVerificator:
            print('computing the cross-val score')
            f1scores = self.estimator.best_score_
            f1_mean, f1_std = f1scores.mean(), f1scores.std()
-            print('F1-measure={:.3f} (+-{:.3f})\n'.format(f1_mean, f1_std))
+            print('F1-measure={:.3f} (+-{:.3f} cv={})\n'.format(f1_mean, f1_std, f1scores))

        return self