trying to merge

parent 04a0ef1ee4
commit 8e9895e0ce
@@ -342,7 +342,7 @@ class FeatureExtractor:
             self.split_policy.__name__))
         print('number of training (full) documents: {}'.format(n_original_docs))
         print('X shape (#documents,#features): {}'.format(X.shape))
-        print('y prevalence: {:.2f}%'.format(y.mean() * 100))
+        print('y prevalence: {}/{} {:.2f}%'.format(y.sum(),len(y),y.mean() * 100))
         print()
 
         return X, y
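For reference, a quick sketch (not part of the commit) of what the new prevalence message prints on a toy label vector with 3 positives out of 100:

import numpy as np

# 3 positive fragments out of 100: the new format shows the raw counts
# next to the percentage, e.g. "y prevalence: 3/100 3.00%".
y = np.array([1, 1, 1] + [0] * 97)
print('y prevalence: {}/{} {:.2f}%'.format(y.sum(), len(y), y.mean() * 100))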
src/model.py (22 changed lines)

@@ -1,7 +1,7 @@
 from util import disable_sklearn_warnings
 from sklearn.metrics import f1_score
 from sklearn.metrics import make_scorer
-from sklearn.model_selection import GridSearchCV
+from sklearn.model_selection import GridSearchCV, LeaveOneOut
 from sklearn.linear_model import LogisticRegression
 from sklearn.svm import *
 from data.features import *
@@ -13,6 +13,19 @@ class RandomVerificator:
     def predict(self,test):
         return np.random.rand()
 
+def f1(true_labels, predicted_labels):
+    assert len(true_labels)==len(predicted_labels), "Format not consistent between true and predicted labels."
+    nd = len(true_labels)
+    tp = np.sum(predicted_labels[true_labels==1])
+    fp = np.sum(predicted_labels[true_labels == 0])
+    fn = np.sum(true_labels[predicted_labels == 0])
+    num = 2.0 * tp
+    den = 2.0 * tp + fp + fn
+    if den > 0: return num / den
+    # we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative
+    return 1.0
+
+
 class AuthorshipVerificator:
 
     def __init__(self, nfolds=10,
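A minimal sketch (outside the diff) of why this hand-rolled f1 matters for verification with very few positives; it assumes src/ is on the path so `from model import f1` resolves. When every instance is a correctly rejected negative the denominator is 0 and the new function returns 1.0, whereas sklearn's f1_score treats that case as undefined and falls back to 0.0 (with an UndefinedMetricWarning).

import numpy as np
from sklearn.metrics import f1_score
from model import f1  # the function added in this commit (assumes src/ is on sys.path)

# All-negative case: the verifier correctly predicts "not the author" everywhere.
true_labels = np.array([0, 0, 0, 0])
predicted_labels = np.array([0, 0, 0, 0])

print(f1_score(true_labels, predicted_labels))  # 0.0, plus an UndefinedMetricWarning
print(f1(true_labels, predicted_labels))        # 1.0: every instance correctly classified as negative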
@@ -34,9 +47,10 @@ class AuthorshipVerificator:
     def fit(self,X,y):
         if not isinstance(y,np.ndarray): y=np.array(y)
         positive_examples = y.sum()
-        if positive_examples >= self.nfolds:
+        if True or positive_examples >= self.nfolds:
             print('optimizing {}'.format(self.svm.__class__.__name__))
-            self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=self.nfolds, scoring=make_scorer(f1_score), n_jobs=-1)
+            self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=LeaveOneOut(), scoring=make_scorer(f1), n_jobs=-1, verbose=10)
+            # self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=self.nfolds, scoring=make_scorer(f1), n_jobs=-1, verbose=10)
         else:
             self.estimator = self.svm
 
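To make the cv=LeaveOneOut() switch concrete, here is a small self-contained sketch under assumed names (LinearSVC and a synthetic X, y stand in for self.svm and the real features); the commit itself passes the custom f1 above rather than sklearn's f1_score. With leave-one-out there is exactly one test document per fold, so the standard f1_score is 0 for every fold whose held-out document is a correctly rejected negative, which is presumably what motivated the custom scorer.

import numpy as np
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.metrics import make_scorer, f1_score
from sklearn.svm import LinearSVC

# Synthetic stand-in for the verification task: 4 positive and 16 negative documents.
rng = np.random.RandomState(0)
X = rng.rand(20, 5)
y = np.array([1] * 4 + [0] * 16)

# One fold per document: len(param_grid['C']) * 20 fits in total.
grid = GridSearchCV(LinearSVC(), param_grid={'C': [0.1, 1.0, 10.0]},
                    cv=LeaveOneOut(), scoring=make_scorer(f1_score), n_jobs=-1)
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)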
@@ -47,7 +61,7 @@ class AuthorshipVerificator:
             print('computing the cross-val score')
             f1scores = self.estimator.best_score_
             f1_mean, f1_std = f1scores.mean(), f1scores.std()
-            print('F1-measure={:.3f} (+-{:.3f})\n'.format(f1_mean, f1_std))
+            print('F1-measure={:.3f} (+-{:.3f} cv={})\n'.format(f1_mean, f1_std, f1scores))
 
         return self
 
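One note on the printed statistics, with a hedged sketch continuing the grid above: best_score_ is already the mean test score of the best parameter setting (a single number), so f1scores.std() is always 0 here; per-fold scores for that setting can be read from cv_results_ if a real spread is wanted.

import numpy as np

# Per-fold scores of the best parameter setting, read from cv_results_;
# `grid` is the fitted GridSearchCV from the previous sketch.
best = grid.best_index_
fold_scores = np.array([grid.cv_results_['split{}_test_score'.format(k)][best]
                        for k in range(grid.n_splits_)])
print('F1-measure={:.3f} (+-{:.3f})'.format(fold_scores.mean(), fold_scores.std()))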