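"""
Authorship verification models: a random baseline (RandomVerificator) and an AuthorshipVerificator
that wraps an SVM or logistic regression, optimizes its hyperparameters by cross-validated grid
search, and evaluates the attribution of full documents and of their fragments via (group-aware)
leave-one-out.
"""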
import numpy as np  # np is used directly below; imported explicitly rather than relying on the wildcard imports

from util import disable_sklearn_warnings
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV, LeaveOneOut, LeaveOneGroupOut, cross_val_score, GroupKFold, KFold, \
    StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from data.features import *


class RandomVerificator:
    """Baseline verificator that ignores the training data and returns a random score."""

    def __init__(self):
        pass

    def fit(self, positives, negatives):
        pass

    def predict(self, test):
        return np.random.rand()


def get_counters(true_labels, predicted_labels):
    """Computes the confusion-matrix counters (tp, fp, fn, tn) from binary 0/1 numpy label arrays."""
    assert len(true_labels) == len(predicted_labels), "Format not consistent between true and predicted labels."
    nd = len(true_labels)
    tp = np.sum(predicted_labels[true_labels == 1])
    fp = np.sum(predicted_labels[true_labels == 0])
    fn = np.sum(true_labels[predicted_labels == 0])
    tn = nd - (tp + fp + fn)
    return tp, fp, fn, tn


def f1_from_counters(tp, fp, fn, tn):
    """Computes the F1 measure from the confusion-matrix counters."""
    num = 2.0 * tp
    den = 2.0 * tp + fp + fn
    if den > 0:
        return num / den
    # we define F1 to be 1 if den==0, since the classifier has correctly classified all instances as negative
    return 1.0


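# Worked example (illustrative values): with tp=3, fp=1, fn=2, tn=4, f1_from_counters returns
# 2*3 / (2*3 + 1 + 2) = 6/9 ≈ 0.667; with tp=fp=fn=0 (every instance is a correctly rejected
# negative) it returns 1.0, whereas sklearn's f1_score would return 0 in that degenerate case.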
def f1(true_labels, predicted_labels):
    """F1 measure for binary 0/1 label arrays, computed via get_counters."""
    tp, fp, fn, tn = get_counters(true_labels, predicted_labels)
    return f1_from_counters(tp, fp, fn, tn)


class AuthorshipVerificator:
    """Authorship verificator: a binary classifier trained on positive (same-author) and negative
    examples, with hyperparameters optimized by grid search whenever enough positives are available."""

    def __init__(self, nfolds=10,
                 params=None,
                 estimator=SVC,
                 author_name=None):
        self.nfolds = nfolds
        # copy the parameter grid so that the default dictionary is never mutated across instances
        if params is None:
            params = {'C': np.logspace(-4, +4, 9), 'class_weight': ['balanced', None]}
        self.params = dict(params)
        self.author_name = author_name if author_name else 'this author'
        if estimator is SVC:
            self.params['kernel'] = ['linear', 'rbf']
            self.probability = True
            self.classifier = estimator(probability=self.probability)
        elif estimator is LinearSVC:
            self.probability = False
            self.classifier = estimator()
        elif estimator is LogisticRegression:
            self.probability = True
            self.classifier = LogisticRegression()
        else:
            raise ValueError('unsupported estimator: expected SVC, LinearSVC, or LogisticRegression')

    def fit(self, X, y, groups=None):
        """Fits the classifier on X, y; if there are at least nfolds positive examples, the
        hyperparameters are first optimized via a stratified cross-validated grid search."""
        if not isinstance(y, np.ndarray):
            y = np.array(y)
        positive_examples = y.sum()
        if positive_examples >= self.nfolds:
            print('optimizing {}'.format(self.classifier.__class__.__name__))
            # if groups is None or len(np.unique(groups[y==1])):
            folds = list(StratifiedKFold(n_splits=self.nfolds).split(X, y))
            # folds = list(GroupKFold(n_splits=self.nfolds).split(X, y, groups))
            self.estimator = GridSearchCV(self.classifier, param_grid=self.params, cv=folds,
                                          scoring=make_scorer(f1), n_jobs=-1)
        else:
            self.estimator = self.classifier

        self.estimator.fit(X, y)

        if isinstance(self.estimator, GridSearchCV):
            print('Best params: {}'.format(self.estimator.best_params_))
            print('computing the cross-val score')
            # best_score_ is only the mean test score of the best configuration; the per-fold scores
            # are recovered from cv_results_ so that the standard deviation can be reported as well
            best_index = self.estimator.best_index_
            f1scores = np.array([self.estimator.cv_results_['split{}_test_score'.format(k)][best_index]
                                 for k in range(self.nfolds)])
            f1_mean, f1_std = f1scores.mean(), f1scores.std()
            print('F1-measure={:.3f} (+-{:.3f} cv={})\n'.format(f1_mean, f1_std, f1scores))
            self.estimator = self.estimator.best_estimator_

        return self

    def leave_one_out(self, X, y, groups=None, test_lowest_index_only=True, counters=False):
        """Estimates the verification performance by leave-one-out (or leave-one-group-out when groups are given)."""
        if groups is None:
            print('Computing LOO without groups')
            folds = list(LeaveOneOut().split(X, y))
        else:
            print('Computing LOO with groups')
            logo = LeaveOneGroupOut()
            folds = list(logo.split(X, y, groups))
            if test_lowest_index_only:
                # keep only the lowest index of each group as test, i.e., the full document; its fragments are ignored
                print('ignoring fragments')
                folds = [(train, np.min(test, keepdims=True)) for train, test in folds]

        scores = cross_val_score(self.estimator, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
        print(scores)
        if counters and test_lowest_index_only:
            # each fold tests a single document, so a score of 1 means a correct prediction;
            # this assumes the full documents occupy the first len(folds) positions of y
            yfull_true = y[:len(folds)]
            yfull_predict = np.zeros_like(yfull_true)
            yfull_predict[scores == 1] = yfull_true[scores == 1]
            yfull_predict[scores != 1] = 1 - yfull_true[scores != 1]
            tp, fp, fn, tn = get_counters(yfull_true, yfull_predict)
            return scores.mean(), scores.std(), tp, fp, fn, tn
        else:
            return scores.mean(), scores.std()

    def predict(self, test, epistola_name=''):
        """Predicts the class of the full document (first row of test) and of its fragments (remaining rows), if any."""
        pred = self.estimator.predict(test)
        full_doc_prediction = pred[0]
        print('{} is from the same author: {}'.format(epistola_name, 'Yes' if full_doc_prediction == 1 else 'No'))
        if len(pred) > 1:
            fragment_predictions = pred[1:]
            print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
            return full_doc_prediction, fragment_predictions
        return full_doc_prediction, None

    def predict_proba(self, test, epistola_name=''):
        """Like predict, but returns the posterior probability of the positive (same-author) class."""
        assert self.probability, 'the classifier does not support probability estimates'
        pred = self.estimator.predict_proba(test)
        full_doc_prediction = pred[0, 1]
        print(f'{epistola_name} is from {self.author_name} with probability {full_doc_prediction:.3f}')
        if len(pred) > 1:
            fragment_predictions = pred[1:, 1]
            print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
            return full_doc_prediction, fragment_predictions
        return full_doc_prediction, None

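
# ----------------------------------------------------------------------------------------------
# Minimal usage sketch (illustrative only): the real pipeline loads documents and features via
# data.features; here a random feature matrix is fabricated just to show how the class is driven.
# All data below (50 "documents", 100 features, binary labels) is synthetic.
if __name__ == '__main__':
    rng = np.random.RandomState(42)
    X = rng.rand(50, 100)                 # synthetic document-feature matrix
    y = np.array([1] * 20 + [0] * 30)     # 20 positive (same-author) and 30 negative examples

    av = AuthorshipVerificator(nfolds=3, estimator=SVC, author_name='the candidate author')
    av.fit(X, y)

    # leave-one-out estimate of the verification performance (F1 over single-document folds)
    mean_f1, std_f1 = av.leave_one_out(X, y)
    print('LOO F1 = {:.3f} (+-{:.3f})'.format(mean_f1, std_f1))

    # verify an unseen "epistola": first row is the full document, the remaining rows its fragments
    test = rng.rand(3, 100)
    av.predict(test, epistola_name='Epistola (synthetic)')
    av.predict_proba(test, epistola_name='Epistola (synthetic)')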