dante-verification/src/model.py

88 lines
3.6 KiB
Python
Executable File

from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, LeaveOneOut, LeaveOneGroupOut, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import *
from data.features import *
from util.evaluation import f1, get_counters
class AuthorshipVerificator:
    """Binary authorship verifier.

    Wraps a logistic-regression classifier with optional grid-search
    hyper-parameter optimization (when enough positive examples exist to
    stratify) and leave-one-out / leave-one-group-out evaluation.
    """

    def __init__(self, nfolds=10, params=None, author_name=None):
        """
        :param nfolds: number of stratified folds used for grid search
        :param params: parameter grid for GridSearchCV; when None, a default
            grid over C (log-spaced 1e-4..1e4) and class_weight is built
            per instance. (A dict default argument would be a shared mutable
            default — it is deliberately avoided here.)
        :param author_name: display name for the positive-class author;
            falls back to 'this author'
        """
        self.nfolds = nfolds
        if params is None:
            params = {'C': np.logspace(-4, +4, 9), 'class_weight': ['balanced', None]}
        self.params = params
        self.author_name = author_name if author_name else 'this author'
        self.classifier = LogisticRegression()

    def fit(self, X, y):
        """Fit the verifier on (X, y).

        When the number of positive examples is at least self.nfolds, the
        hyper-parameters are optimized with a stratified grid search scored
        by the project F1 function; otherwise the base classifier is fit
        directly (stratification would be impossible).

        :return: self, for chaining
        """
        y = np.asarray(y)
        positive_examples = y.sum()
        if positive_examples >= self.nfolds:
            print('optimizing {}'.format(self.classifier.__class__.__name__))
            folds = list(StratifiedKFold(n_splits=self.nfolds).split(X, y))
            self.estimator = GridSearchCV(
                self.classifier, param_grid=self.params, cv=folds, scoring=make_scorer(f1), n_jobs=-1
            )
        else:
            self.estimator = self.classifier
        self.estimator.fit(X, y)
        if isinstance(self.estimator, GridSearchCV):
            # best_score_ is already the mean cross-validation score of the
            # best candidate (a scalar); calling .mean() on it, as the code
            # previously did, fails on a plain float.
            f1_mean = self.estimator.best_score_
            print(f'Best params: {self.estimator.best_params_} (cross-validation F1={f1_mean:.3f})')
            # keep only the refit best estimator for subsequent predictions
            self.estimator = self.estimator.best_estimator_
        return self

    def leave_one_out(self, X, y, files, groups=None, test_lowest_index_only=True, counters=False):
        """Evaluate with leave-one-out (or leave-one-group-out) CV.

        :param X: feature matrix
        :param y: binary labels
        :param files: array of file names parallel to X, used to report
            misclassified texts
        :param groups: optional group labels; when given, LOO is computed
            over groups instead of single samples
        :param test_lowest_index_only: keep only the lowest test index of
            each fold (i.e. score the full document, ignoring its fragments)
        :param counters: when True (and test_lowest_index_only), also return
            the tp/fp/fn/tn counters
        :return: (mean, std) of per-fold scores, plus (tp, fp, fn, tn) when
            counters is requested
        """
        # ensure boolean-mask indexing below works even if lists are passed in
        y = np.asarray(y)
        files = np.asarray(files)
        if groups is None:
            print('Computing LOO without groups')
            folds = list(LeaveOneOut().split(X, y))
        else:
            print('Computing LOO with groups')
            logo = LeaveOneGroupOut()
            folds = list(logo.split(X, y, groups))
        if test_lowest_index_only:
            print('ignoring fragments')
            # the lowest index in each test fold is the full document;
            # higher indices are its fragments
            folds = [(train, np.min(test, keepdims=True)) for train, test in folds]
        scores = cross_val_score(self.estimator, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
        missclassified = '\n'.join(files[scores == 0].tolist())
        print('missclassified texts:')
        print(missclassified)
        if counters and test_lowest_index_only:
            # with single-element test folds, a fold score of 1 means the
            # prediction matched the true label; anything else means it flipped
            yfull_true = y[:len(folds)]
            yfull_predict = np.zeros_like(yfull_true)
            yfull_predict[scores == 1] = yfull_true[scores == 1]
            yfull_predict[scores != 1] = 1 - yfull_true[scores != 1]
            tp, fp, fn, tn = get_counters(yfull_true, yfull_predict)
            return scores.mean(), scores.std(), tp, fp, fn, tn
        else:
            return scores.mean(), scores.std()

    def predict(self, test):
        """Predict labels for test; the first row is assumed to be the full
        document and the remaining rows its fragments.

        :return: the full-document prediction, plus the fragment predictions
            when fragments are present
        """
        pred = self.estimator.predict(test)
        full_doc_prediction = pred[0]
        if len(pred) > 1:
            fragment_predictions = pred[1:]
            print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
            return full_doc_prediction, fragment_predictions
        return full_doc_prediction

    def predict_proba(self, test):
        """Predict positive-class probabilities for test; the first row is
        the full document, the remaining rows its fragments.

        :return: (full-document probability, fragment probabilities or [])
        :raises AssertionError: when the fitted estimator cannot produce
            probabilities
        """
        # the check must target the fitted estimator: checking self (as the
        # code previously did) was tautologically true, since this very
        # method is named predict_proba
        assert hasattr(self.estimator, 'predict_proba'), 'the classifier is not calibrated'
        pred = self.estimator.predict_proba(test)
        full_doc_prediction = pred[0, 1]
        if len(pred) > 1:
            fragment_predictions = pred[1:, 1]
            print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
            return full_doc_prediction, fragment_predictions
        return full_doc_prediction, []