# dante-verification/src/model.py

import numpy as np
from util import disable_sklearn_warnings
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, LeaveOneOut, LeaveOneGroupOut, cross_val_score, GroupKFold, KFold, \
    StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import *
from data.features import *


class RandomVerificator:
    # dummy baseline that assigns a uniformly random score to any test document
    def __init__(self): pass

    def fit(self, positives, negatives):
        pass

    def predict(self, test):
        return np.random.rand()


def get_counters(true_labels, predicted_labels):
    # returns the confusion-matrix counts (tp, fp, fn, tn) for binary {0,1} label arrays
    assert len(true_labels) == len(predicted_labels), "Format not consistent between true and predicted labels."
    nd = len(true_labels)
    tp = np.sum(predicted_labels[true_labels == 1])  # positives predicted as positive
    fp = np.sum(predicted_labels[true_labels == 0])  # negatives predicted as positive
    fn = np.sum(true_labels[predicted_labels == 0])  # positives predicted as negative
    tn = nd - (tp + fp + fn)
    return tp, fp, fn, tn


def f1_from_counters(tp, fp, fn, tn):
    num = 2.0 * tp
    den = 2.0 * tp + fp + fn
    if den > 0:
        return num / den
    # we define F1 to be 1 if den==0, since the classifier has correctly classified all instances as negative
    return 1.0


def f1(true_labels, predicted_labels):
    tp, fp, fn, tn = get_counters(true_labels, predicted_labels)
    return f1_from_counters(tp, fp, fn, tn)
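
# A quick sanity check of the counter-based F1 above (illustrative values, not part of
# the original module): one false negative over four instances gives tp=2, fp=0, fn=1,
# tn=1, hence F1 = (2*2) / (2*2 + 0 + 1) = 0.8:
#   f1(np.array([1, 1, 0, 1]), np.array([1, 1, 0, 0]))  # -> 0.8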


class AuthorshipVerificator:
    def __init__(self, nfolds=10, params=None, estimator=SVC, author_name=None):
        self.nfolds = nfolds
        # build the grid per instance (a mutable default dict would be shared across
        # instances and polluted by the SVC-specific 'kernel' entry added below)
        if params is None:
            params = {'C': np.logspace(-4, +4, 9), 'class_weight': ['balanced', None]}
        self.params = params
        self.author_name = author_name if author_name else 'this author'
        if estimator is SVC:
            self.params['kernel'] = ['linear', 'rbf']
            self.probability = True
            self.classifier = estimator(probability=self.probability)
        elif estimator is LinearSVC:
            self.probability = False  # LinearSVC does not expose predict_proba
            self.classifier = estimator()
        elif estimator is LogisticRegression:
            self.probability = True
            self.classifier = LogisticRegression()
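
    # With the default grid and an SVC estimator, the search space built above expands to
    # 9 values of C (1e-4 ... 1e+4) x 2 class weights x 2 kernels = 36 candidate
    # configurations, each evaluated on the stratified folds created in fit() below.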

    def fit(self, X, y, groups=None):
        if not isinstance(y, np.ndarray):
            y = np.array(y)
        positive_examples = y.sum()
        if positive_examples >= self.nfolds:
            print('optimizing {}'.format(self.classifier.__class__.__name__))
            # if groups is None or len(np.unique(groups[y==1])):
            folds = list(StratifiedKFold(n_splits=self.nfolds).split(X, y))
            # folds = list(GroupKFold(n_splits=self.nfolds).split(X, y, groups))
            self.estimator = GridSearchCV(self.classifier, param_grid=self.params, cv=folds,
                                          scoring=make_scorer(f1), n_jobs=-1)
        else:
            # too few positive examples to stratify into nfolds: skip model selection
            self.estimator = self.classifier
        self.estimator.fit(X, y)
        if isinstance(self.estimator, GridSearchCV):
            print('Best params: {}'.format(self.estimator.best_params_))
            print('computing the cross-val score')
            # best_score_ is a scalar (the mean cross-validated F1 of the best configuration);
            # the per-fold scores needed for the standard deviation come from cv_results_
            best = self.estimator.best_index_
            f1scores = np.array([self.estimator.cv_results_['split{}_test_score'.format(k)][best]
                                 for k in range(self.nfolds)])
            f1_mean, f1_std = f1scores.mean(), f1scores.std()
            print('F1-measure={:.3f} (+-{:.3f} cv={})\n'.format(f1_mean, f1_std, f1scores))
            self.estimator = self.estimator.best_estimator_
        return self

    def leave_one_out(self, X, y, groups=None, test_lowest_index_only=True, counters=False):
        if groups is None:
            print('Computing LOO without groups')
            folds = list(LeaveOneOut().split(X, y))
        else:
            print('Computing LOO with groups')
            logo = LeaveOneGroupOut()
            folds = list(logo.split(X, y, groups))
            if test_lowest_index_only:
                # test only the lowest index of each group, i.e., the full document,
                # and ignore the fragments that follow it
                print('ignoring fragments')
                folds = [(train, np.min(test, keepdims=True)) for train, test in folds]
        scores = cross_val_score(self.estimator, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
        print(scores)
        if counters and test_lowest_index_only:
            # each fold tests exactly one instance, so a fold score of 1 means a correct
            # prediction; reconstruct the predicted labels from the scores to get the counters
            yfull_true = y[:len(folds)]
            yfull_predict = np.zeros_like(yfull_true)
            yfull_predict[scores == 1] = yfull_true[scores == 1]
            yfull_predict[scores != 1] = 1 - yfull_true[scores != 1]
            tp, fp, fn, tn = get_counters(yfull_true, yfull_predict)
            return scores.mean(), scores.std(), tp, fp, fn, tn
        else:
            return scores.mean(), scores.std()
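
    # A sketch of the fragment-skipping logic above, under the assumed data layout in
    # which each full document precedes its own fragments: with groups=[0, 0, 0, 1, 1],
    # LeaveOneGroupOut yields the test sets [0, 1, 2] and [3, 4]; keeping only the lowest
    # index per fold reduces them to [0] and [3], so only full documents are ever tested.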

    def predict(self, test, epistola_name=''):
        pred = self.estimator.predict(test)
        full_doc_prediction = pred[0]  # the first row of test is the full document
        print('{} is from the same author: {}'.format(epistola_name, 'Yes' if full_doc_prediction == 1 else 'No'))
        if len(pred) > 1:
            fragment_predictions = pred[1:]  # the remaining rows are the document's fragments
            print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
            return full_doc_prediction, fragment_predictions
        return full_doc_prediction, None

    def predict_proba(self, test, epistola_name=''):
        assert self.probability, 'the classifier is not calibrated to output probabilities'
        pred = self.estimator.predict_proba(test)
        full_doc_prediction = pred[0, 1]  # posterior of the positive class for the full document
        print(f'{epistola_name} is from {self.author_name} with probability {full_doc_prediction:.3f}')
        if len(pred) > 1:
            fragment_predictions = pred[1:, 1]
            print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
            return full_doc_prediction, fragment_predictions
        return full_doc_prediction, None
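

# Illustrative usage sketch, not part of the original module: the feature matrix, the
# fold count, and the epistola name below are made-up stand-ins for the real pipeline
# (which builds X from data.features).
if __name__ == '__main__':
    rng = np.random.RandomState(42)
    X = rng.rand(40, 20)               # 40 training texts x 20 stylometric features
    y = np.array([1] * 12 + [0] * 28)  # 12 positive examples, enough for 3-fold CV
    av = AuthorshipVerificator(nfolds=3, author_name='Dante')
    av.fit(X, y)
    test = rng.rand(3, 20)             # row 0: full document; rows 1-2: its fragments
    av.predict(test, epistola_name='Epistola XIII')
    av.predict_proba(test, epistola_name='Epistola XIII')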