trying to merge

This commit is contained in:
Alejandro Moreo Fernandez 2019-01-07 12:30:05 +01:00
parent 04a0ef1ee4
commit 8e9895e0ce
2 changed files with 19 additions and 5 deletions

View File

@ -342,7 +342,7 @@ class FeatureExtractor:
self.split_policy.__name__))
print('number of training (full) documents: {}'.format(n_original_docs))
print('X shape (#documents,#features): {}'.format(X.shape))
print('y prevalence: {:.2f}%'.format(y.mean() * 100))
print('y prevalence: {}/{} {:.2f}%'.format(y.sum(),len(y),y.mean() * 100))
print()
return X, y

View File

@ -1,7 +1,7 @@
from util import disable_sklearn_warnings
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.svm import *
from data.features import *
@ -13,6 +13,19 @@ class RandomVerificator:
# Baseline predictor: the `test` argument is not used; the returned value is
# whatever a no-argument np.random.rand() call yields (a single random float
# drawn uniformly from [0, 1)).
def predict(self,test):
return np.random.rand()
def f1(true_labels, predicted_labels):
    """Compute the F1 score for binary labels, returning 1.0 when it is undefined.

    Args:
        true_labels: sequence or array of ground-truth labels.
        predicted_labels: sequence or array of predicted labels, same length.

    Returns:
        2*tp / (2*tp + fp + fn), or 1.0 when that denominator is 0.

    Raises:
        AssertionError: if the two inputs differ in length.

    NOTE(review): the counts are obtained by summing label values, so they are
    only true counts when labels are encoded as 0/1 -- confirm against callers.
    """
    # Coerce to arrays: with plain lists, ``labels == 1`` is a single bool and
    # ``other[False]`` would silently pick element 0 instead of masking.
    true_labels = np.asarray(true_labels)
    predicted_labels = np.asarray(predicted_labels)
    assert len(true_labels) == len(predicted_labels), "Format not consistent between true and predicted labels."

    tp = np.sum(predicted_labels[true_labels == 1])
    fp = np.sum(predicted_labels[true_labels == 0])
    fn = np.sum(true_labels[predicted_labels == 0])

    num = 2.0 * tp
    den = 2.0 * tp + fp + fn
    if den > 0:
        return num / den
    # we define f1 to be 1 if den==0 since the classifier has correctly
    # classified all instances as negative
    return 1.0
class AuthorshipVerificator:
def __init__(self, nfolds=10,
@ -34,9 +47,10 @@ class AuthorshipVerificator:
def fit(self,X,y):
if not isinstance(y,np.ndarray): y=np.array(y)
positive_examples = y.sum()
if positive_examples >= self.nfolds:
if True or positive_examples >= self.nfolds:
print('optimizing {}'.format(self.svm.__class__.__name__))
self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=self.nfolds, scoring=make_scorer(f1_score), n_jobs=-1)
self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=LeaveOneOut(), scoring=make_scorer(f1), n_jobs=-1, verbose=10)
# self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=self.nfolds, scoring=make_scorer(f1), n_jobs=-1, verbose=10)
else:
self.estimator = self.svm
@ -47,7 +61,7 @@ class AuthorshipVerificator:
print('computing the cross-val score')
f1scores = self.estimator.best_score_
f1_mean, f1_std = f1scores.mean(), f1scores.std()
print('F1-measure={:.3f} (+-{:.3f})\n'.format(f1_mean, f1_std))
print('F1-measure={:.3f} (+-{:.3f} cv={})\n'.format(f1_mean, f1_std, f1scores))
return self