commit 8e9895e0ce (parent 04a0ef1ee4)

    trying to merge
@@ -342,7 +342,7 @@ class FeatureExtractor:
                                             self.split_policy.__name__))
         print('number of training (full) documents: {}'.format(n_original_docs))
         print('X shape (#documents,#features): {}'.format(X.shape))
-        print('y prevalence: {:.2f}%'.format(y.mean() * 100))
+        print('y prevalence: {}/{} {:.2f}%'.format(y.sum(),len(y),y.mean() * 100))
         print()
 
         return X, y
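Review note: the prevalence line now reports the raw counts alongside the percentage. A quick check of the new format, with a hypothetical label vector:

import numpy as np

y = np.array([1, 0, 0, 1, 0])  # hypothetical 0/1 label vector
print('y prevalence: {}/{} {:.2f}%'.format(y.sum(), len(y), y.mean() * 100))
# prints: y prevalence: 2/5 40.00%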
src/model.py (22 changed lines)
@@ -1,7 +1,7 @@
 from util import disable_sklearn_warnings
 from sklearn.metrics import f1_score
 from sklearn.metrics import make_scorer
-from sklearn.model_selection import GridSearchCV
+from sklearn.model_selection import GridSearchCV, LeaveOneOut
 from sklearn.linear_model import LogisticRegression
 from sklearn.svm import *
 from data.features import *
@@ -13,6 +13,19 @@ class RandomVerificator:
     def predict(self,test):
         return np.random.rand()
 
+def f1(true_labels, predicted_labels):
+    assert len(true_labels)==len(predicted_labels), "Format not consistent between true and predicted labels."
+    nd = len(true_labels)
+    tp = np.sum(predicted_labels[true_labels==1])
+    fp = np.sum(predicted_labels[true_labels == 0])
+    fn = np.sum(true_labels[predicted_labels == 0])
+    num = 2.0 * tp
+    den = 2.0 * tp + fp + fn
+    if den > 0: return num / den
+    # we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative
+    return 1.0
+
+
 class AuthorshipVerificator:
 
     def __init__(self, nfolds=10,
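Review note: unlike sklearn.metrics.f1_score, which yields 0.0 (with an UndefinedMetricWarning) when there are no positive instances, the f1 added here returns 1.0 when the denominator is zero, i.e. when every document in the fold is a negative that was correctly rejected. A minimal sketch of that edge case (the arrays are hypothetical; the import assumes src/ is on the path):

import numpy as np
from sklearn.metrics import f1_score
from model import f1  # the function added in this hunk

y_true = np.array([0, 0, 0])  # a fold with no positive instances
y_pred = np.array([0, 0, 0])  # all correctly rejected
print(f1_score(y_true, y_pred, zero_division=0))  # sklearn: 0.0 (zero_division needs sklearn >= 0.22)
print(f1(y_true, y_pred))                         # custom:  1.0

This convention matters below, where each cross-validation fold contains a single test document.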
@@ -34,9 +47,10 @@ class AuthorshipVerificator:
     def fit(self,X,y):
         if not isinstance(y,np.ndarray): y=np.array(y)
         positive_examples = y.sum()
-        if positive_examples >= self.nfolds:
+        if True or positive_examples >= self.nfolds:
             print('optimizing {}'.format(self.svm.__class__.__name__))
-            self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=self.nfolds, scoring=make_scorer(f1_score), n_jobs=-1)
+            self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=LeaveOneOut(), scoring=make_scorer(f1), n_jobs=-1, verbose=10)
+            # self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=self.nfolds, scoring=make_scorer(f1), n_jobs=-1, verbose=10)
         else:
             self.estimator = self.svm
 
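Review note: two things change in fit(). The `if True or ...` forces the grid-search branch regardless of how many positive examples are available (presumably a temporary debugging override), and model selection now runs leave-one-out CV with the custom f1 scorer, keeping the nfolds-based call commented out. A self-contained sketch of the new setup, with a hypothetical LinearSVC and parameter grid standing in for self.svm and self.params:

import numpy as np
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.metrics import make_scorer
from sklearn.svm import LinearSVC
from model import f1  # the custom scorer defined above

X = np.random.rand(20, 5)         # hypothetical feature matrix
y = np.array([1] * 5 + [0] * 15)  # few positives, as in verification tasks

grid = GridSearchCV(LinearSVC(), param_grid={'C': [0.1, 1.0, 10.0]},
                    cv=LeaveOneOut(), scoring=make_scorer(f1),
                    n_jobs=-1, verbose=10)
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)

Since each LOO fold tests exactly one document, the per-fold score is always 0.0 or 1.0, and LeaveOneOut() makes self.nfolds irrelevant to the split; with many documents and a large grid this gets expensive.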
@@ -47,7 +61,7 @@ class AuthorshipVerificator:
         print('computing the cross-val score')
         f1scores = self.estimator.best_score_
         f1_mean, f1_std = f1scores.mean(), f1scores.std()
-        print('F1-measure={:.3f} (+-{:.3f})\n'.format(f1_mean, f1_std))
+        print('F1-measure={:.3f} (+-{:.3f} cv={})\n'.format(f1_mean, f1_std, f1scores))
 
         return self
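Review note: GridSearchCV.best_score_ is a single scalar (the mean CV score of the best parameter setting), so f1_std here comes out as 0 and the cv={} field just echoes that scalar. If a real spread over folds is wanted, a sketch along these lines (hypothetical, reusing the names from this diff after fit() has run) would give one:

from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.metrics import make_scorer
from model import f1

# per-fold scores for the selected model; each LOO fold yields 0.0 or 1.0
f1scores = cross_val_score(self.estimator.best_estimator_, X, y,
                           cv=LeaveOneOut(), scoring=make_scorer(f1))
print('F1-measure={:.3f} (+-{:.3f})'.format(f1scores.mean(), f1scores.std()))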