From 8e9895e0ce6ee51c0ac948ef9d1d0d66b4256a57 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo Fernández
Date: Mon, 7 Jan 2019 12:30:05 +0100
Subject: [PATCH] trying to merge

---
 src/data/features.py |  2 +-
 src/model.py         | 22 ++++++++++++++++++----
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/src/data/features.py b/src/data/features.py
index 966e2c1..8c69924 100644
--- a/src/data/features.py
+++ b/src/data/features.py
@@ -342,7 +342,7 @@ class FeatureExtractor:
             self.split_policy.__name__))
         print('number of training (full) documents: {}'.format(n_original_docs))
         print('X shape (#documents,#features): {}'.format(X.shape))
-        print('y prevalence: {:.2f}%'.format(y.mean() * 100))
+        print('y prevalence: {}/{} {:.2f}%'.format(y.sum(),len(y),y.mean() * 100))
         print()
 
         return X, y
diff --git a/src/model.py b/src/model.py
index a80bd85..a7e4c59 100644
--- a/src/model.py
+++ b/src/model.py
@@ -1,7 +1,7 @@
 from util import disable_sklearn_warnings
 from sklearn.metrics import f1_score
 from sklearn.metrics import make_scorer
-from sklearn.model_selection import GridSearchCV
+from sklearn.model_selection import GridSearchCV, LeaveOneOut
 from sklearn.linear_model import LogisticRegression
 from sklearn.svm import *
 from data.features import *
@@ -13,6 +13,19 @@ class RandomVerificator:
     def predict(self,test):
         return np.random.rand()
 
+def f1(true_labels, predicted_labels):
+    assert len(true_labels)==len(predicted_labels), "Format not consistent between true and predicted labels."
+    nd = len(true_labels)
+    tp = np.sum(predicted_labels[true_labels==1])
+    fp = np.sum(predicted_labels[true_labels == 0])
+    fn = np.sum(true_labels[predicted_labels == 0])
+    num = 2.0 * tp
+    den = 2.0 * tp + fp + fn
+    if den > 0: return num / den
+    # we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative
+    return 1.0
+
+
 class AuthorshipVerificator:
 
     def __init__(self, nfolds=10,
@@ -34,9 +47,10 @@ class AuthorshipVerificator:
     def fit(self,X,y):
         if not isinstance(y,np.ndarray): y=np.array(y)
         positive_examples = y.sum()
-        if positive_examples >= self.nfolds:
+        if True or positive_examples >= self.nfolds:
             print('optimizing {}'.format(self.svm.__class__.__name__))
-            self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=self.nfolds, scoring=make_scorer(f1_score), n_jobs=-1)
+            self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=LeaveOneOut(), scoring=make_scorer(f1), n_jobs=-1, verbose=10)
+            # self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=self.nfolds, scoring=make_scorer(f1), n_jobs=-1, verbose=10)
         else:
             self.estimator = self.svm
 
@@ -47,7 +61,7 @@ class AuthorshipVerificator:
 
         print('computing the cross-val score')
         f1scores = self.estimator.best_score_
         f1_mean, f1_std = f1scores.mean(), f1scores.std()
-        print('F1-measure={:.3f} (+-{:.3f})\n'.format(f1_mean, f1_std))
+        print('F1-measure={:.3f} (+-{:.3f} cv={})\n'.format(f1_mean, f1_std, f1scores))
         return self
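
A note on the custom f1 added above: unlike sklearn's f1_score, it returns 1.0
when the denominator 2*tp + fp + fn is zero, i.e. when every instance is a
correctly rejected negative, whereas sklearn's implementation returns 0.0 there
(with an UndefinedMetricWarning). A minimal sketch of the difference on made-up
toy labels (the import assumes src/ is on the path):

    import numpy as np
    from model import f1  # the function added by this patch

    y_true = np.array([1, 0, 0, 1, 0])
    y_pred = np.array([1, 0, 1, 0, 0])
    # tp=1, fp=1, fn=1  ->  2*1 / (2*1 + 1 + 1) = 0.5
    print(f1(y_true, y_pred))  # 0.5

    # edge case: everything correctly classified as negative (den == 0)
    y_true = np.array([0, 0, 0])
    y_pred = np.array([0, 0, 0])
    print(f1(y_true, y_pred))  # 1.0 under this patch's convention; f1_score gives 0.0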
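
A note on the cv=LeaveOneOut() switch: with leave-one-out every test fold holds
a single document, so the per-fold score from f1 is either 1.0 (prediction
correct, counting the den==0 all-negative case) or 0.0 (prediction wrong), and
the score GridSearchCV averages reduces to plain accuracy over the training
documents; sklearn's f1_score would score every correctly rejected singleton
negative as 0.0, which is why the custom scorer matters here. Below is a
self-contained sketch of the same pattern; the classifier, data and parameter
grid are invented for illustration, where the patch uses self.svm and
self.params:

    import numpy as np
    from sklearn.model_selection import GridSearchCV, LeaveOneOut
    from sklearn.metrics import make_scorer
    from sklearn.svm import LinearSVC
    from model import f1  # assumes src/ is on the path

    # toy verification task: 20 documents, only 5 positives
    # (fewer positives than a 10-fold split allows, the case the old guard rejected)
    rng = np.random.RandomState(0)
    X = rng.rand(20, 30)
    y = np.array([1] * 5 + [0] * 15)

    grid = GridSearchCV(LinearSVC(),
                        param_grid={'C': [0.1, 1, 10]},  # hypothetical grid
                        cv=LeaveOneOut(),                # one fold per training document
                        scoring=make_scorer(f1),
                        n_jobs=-1)
    grid.fit(X, y)

    # best_score_ is already the mean over the LOO folds (a single float), so the
    # .mean()/.std() calls in fit() above do not aggregate anything further; the
    # per-candidate means live in grid.cv_results_['mean_test_score']
    print(grid.best_params_, grid.best_score_)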