# dante-verification/src/model.py

import numpy as np
from util import disable_sklearn_warnings
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, LeaveOneOut, LeaveOneGroupOut, cross_val_score, GroupKFold, KFold, \
    StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import *
from data.features import *


class RandomVerificator:
    # dummy baseline that assigns a uniformly random score to any test document
    def __init__(self): pass

    def fit(self, positives, negatives):
        pass

    def predict(self, test):
        return np.random.rand()


def get_counters(true_labels, predicted_labels):
    # returns the confusion-matrix counts (tp, fp, fn, tn) for binary {0,1} label arrays
    assert len(true_labels) == len(predicted_labels), "Format not consistent between true and predicted labels."
    nd = len(true_labels)
    tp = np.sum(predicted_labels[true_labels == 1])  # positives predicted as positive
    fp = np.sum(predicted_labels[true_labels == 0])  # negatives predicted as positive
    fn = np.sum(true_labels[predicted_labels == 0])  # positives predicted as negative
    tn = nd - (tp + fp + fn)
    return tp, fp, fn, tn


def f1_from_counters(tp, fp, fn, tn):
    num = 2.0 * tp
    den = 2.0 * tp + fp + fn
    if den > 0:
        return num / den
    # we define F1 to be 1 if den==0, since the classifier has correctly classified all instances as negative
    return 1.0


def f1(true_labels, predicted_labels):
    tp, fp, fn, tn = get_counters(true_labels, predicted_labels)
    return f1_from_counters(tp, fp, fn, tn)
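
# A quick sanity check of the counter-based F1 above (illustrative values, not part of
# the original module): one false negative over four instances gives tp=2, fp=0, fn=1,
# tn=1, hence F1 = (2*2) / (2*2 + 0 + 1) = 0.8:
#   f1(np.array([1, 1, 0, 1]), np.array([1, 1, 0, 0]))  # -> 0.8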


class AuthorshipVerificator:
    def __init__(self, nfolds=10, params=None, estimator=SVC, author_name=None):
        self.nfolds = nfolds
        # build the grid per instance (a mutable default dict would be shared across
        # instances and polluted by the SVC-specific 'kernel' entry added below)
        if params is None:
            params = {'C': np.logspace(-4, +4, 9), 'class_weight': ['balanced', None]}
        self.params = params
        self.author_name = author_name if author_name else 'this author'
        if estimator is SVC:
            self.params['kernel'] = ['linear', 'rbf']
            self.probability = True
            self.classifier = estimator(probability=self.probability)
        elif estimator is LinearSVC:
            self.probability = False  # LinearSVC does not expose predict_proba
            self.classifier = estimator()
        elif estimator is LogisticRegression:
            self.probability = True
            self.classifier = LogisticRegression()
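
    # With the default grid and an SVC estimator, the search space built above expands to
    # 9 values of C (1e-4 ... 1e+4) x 2 class weights x 2 kernels = 36 candidate
    # configurations, each evaluated on the stratified folds created in fit() below.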

    def fit(self, X, y, groups=None):
        if not isinstance(y, np.ndarray):
            y = np.array(y)
        positive_examples = y.sum()
        if positive_examples >= self.nfolds:
            print('optimizing {}'.format(self.classifier.__class__.__name__))
            # if groups is None or len(np.unique(groups[y==1])):
            folds = list(StratifiedKFold(n_splits=self.nfolds).split(X, y))
            # folds = list(GroupKFold(n_splits=self.nfolds).split(X, y, groups))
            self.estimator = GridSearchCV(self.classifier, param_grid=self.params, cv=folds,
                                          scoring=make_scorer(f1), n_jobs=-1)
        else:
            # too few positive examples to stratify into nfolds: skip model selection
            self.estimator = self.classifier
        self.estimator.fit(X, y)
        if isinstance(self.estimator, GridSearchCV):
            print('Best params: {}'.format(self.estimator.best_params_))
            print('computing the cross-val score')
            # best_score_ is a scalar (the mean cross-validated F1 of the best configuration);
            # the per-fold scores needed for the standard deviation come from cv_results_
            best = self.estimator.best_index_
            f1scores = np.array([self.estimator.cv_results_['split{}_test_score'.format(k)][best]
                                 for k in range(self.nfolds)])
            f1_mean, f1_std = f1scores.mean(), f1scores.std()
            print('F1-measure={:.3f} (+-{:.3f} cv={})\n'.format(f1_mean, f1_std, f1scores))
            self.estimator = self.estimator.best_estimator_
        return self

    def leave_one_out(self, X, y, groups=None, test_lowest_index_only=True, counters=False):
        if groups is None:
            print('Computing LOO without groups')
            folds = list(LeaveOneOut().split(X, y))
        else:
            print('Computing LOO with groups')
            logo = LeaveOneGroupOut()
            folds = list(logo.split(X, y, groups))
            if test_lowest_index_only:
                # test only the lowest index of each group, i.e., the full document,
                # and ignore the fragments that follow it
                print('ignoring fragments')
                folds = [(train, np.min(test, keepdims=True)) for train, test in folds]
        scores = cross_val_score(self.estimator, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
        print(scores)
        if counters and test_lowest_index_only:
            # each fold tests exactly one instance, so a fold score of 1 means a correct
            # prediction; reconstruct the predicted labels from the scores to get the counters
            yfull_true = y[:len(folds)]
            yfull_predict = np.zeros_like(yfull_true)
            yfull_predict[scores == 1] = yfull_true[scores == 1]
            yfull_predict[scores != 1] = 1 - yfull_true[scores != 1]
            tp, fp, fn, tn = get_counters(yfull_true, yfull_predict)
            return scores.mean(), scores.std(), tp, fp, fn, tn
        else:
            return scores.mean(), scores.std()
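
    # A sketch of the fragment-skipping logic above, under the assumed data layout in
    # which each full document precedes its own fragments: with groups=[0, 0, 0, 1, 1],
    # LeaveOneGroupOut yields the test sets [0, 1, 2] and [3, 4]; keeping only the lowest
    # index per fold reduces them to [0] and [3], so only full documents are ever tested.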

    def predict(self, test, epistola_name=''):
        pred = self.estimator.predict(test)
        full_doc_prediction = pred[0]  # the first row of test is the full document
        print('{} is from the same author: {}'.format(epistola_name, 'Yes' if full_doc_prediction == 1 else 'No'))
        if len(pred) > 1:
            fragment_predictions = pred[1:]  # the remaining rows are the document's fragments
            print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
            return full_doc_prediction, fragment_predictions
        return full_doc_prediction, None

    def predict_proba(self, test, epistola_name=''):
        assert self.probability, 'the classifier is not calibrated to output probabilities'
        pred = self.estimator.predict_proba(test)
        full_doc_prediction = pred[0, 1]  # posterior of the positive class for the full document
        print(f'{epistola_name} is from {self.author_name} with probability {full_doc_prediction:.3f}')
        if len(pred) > 1:
            fragment_predictions = pred[1:, 1]
            print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
            return full_doc_prediction, fragment_predictions
        return full_doc_prediction, None
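

# Illustrative usage sketch, not part of the original module: the feature matrix, the
# fold count, and the epistola name below are made-up stand-ins for the real pipeline
# (which builds X from data.features).
if __name__ == '__main__':
    rng = np.random.RandomState(42)
    X = rng.rand(40, 20)               # 40 training texts x 20 stylometric features
    y = np.array([1] * 12 + [0] * 28)  # 12 positive examples, enough for 3-fold CV
    av = AuthorshipVerificator(nfolds=3, author_name='Dante')
    av.fit(X, y)
    test = rng.rand(3, 20)             # row 0: full document; rows 1-2: its fragments
    av.predict(test, epistola_name='Epistola XIII')
    av.predict_proba(test, epistola_name='Epistola XIII')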