import numpy as np
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, LeaveOneOut, LeaveOneGroupOut, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from data.features import *
from util.evaluation import f1, get_counters


class AuthorshipVerificator:
    """Binary authorship verificator: decides whether a given author wrote a text."""

    def __init__(self, nfolds=10, params=None, author_name=None):
        self.nfolds = nfolds
        # avoid a mutable default argument; the default grid spans C in [1e-4, 1e4]
        self.params = params if params is not None else {
            'C': np.logspace(-4, +4, 9),
            'class_weight': ['balanced', None]
        }
        self.author_name = author_name if author_name else 'this author'
        self.classifier = LogisticRegression()

    def fit(self, X, y):
        y = np.asarray(y)
        positive_examples = y.sum()
        if positive_examples >= self.nfolds:
            # enough positive examples to stratify: grid-search the hyperparameters
            print('optimizing {}'.format(self.classifier.__class__.__name__))
            folds = list(StratifiedKFold(n_splits=self.nfolds).split(X, y))
            self.estimator = GridSearchCV(
                self.classifier, param_grid=self.params, cv=folds,
                scoring=make_scorer(f1), n_jobs=-1
            )
        else:
            # too few positives for stratified CV: use the default classifier as-is
            self.estimator = self.classifier

        self.estimator.fit(X, y)

        if isinstance(self.estimator, GridSearchCV):
            # best_score_ is already the mean cross-validated F1 of the best configuration
            f1_mean = self.estimator.best_score_
            print(f'Best params: {self.estimator.best_params_} (cross-validation F1={f1_mean:.3f})')
            self.estimator = self.estimator.best_estimator_

        return self

    def leave_one_out(self, X, y, files, groups=None, test_lowest_index_only=True, counters=False):
        y = np.asarray(y)
        if groups is None:
            print('Computing LOO without groups')
            folds = list(LeaveOneOut().split(X, y))
        else:
            print('Computing LOO with groups')
            folds = list(LeaveOneGroupOut().split(X, y, groups))

        if test_lowest_index_only:
            # each group contains a full document (lowest row index) followed by its
            # fragments; keep only the lowest index so that only full documents are tested
            print('ignoring fragments')
            folds = [(train, np.min(test, keepdims=True)) for train, test in folds]

        scores = cross_val_score(self.estimator, X, y, cv=folds,
                                 scoring=make_scorer(f1), n_jobs=-1)

        # map each fold's score back to the file it tested (files is aligned with the rows of X)
        files = np.asarray(files)
        test_indices = np.array([test[0] for _, test in folds])
        misclassified = '\n'.join(files[test_indices[scores == 0]].tolist())
        print('misclassified texts:')
        print(misclassified)

        if counters and test_lowest_index_only:
            # full documents occupy the first len(folds) rows; on a singleton test set
            # F1 is 1 iff the prediction was correct, so the per-fold scores determine
            # the predicted labels
            yfull_true = y[:len(folds)]
            yfull_predict = np.zeros_like(yfull_true)
            yfull_predict[scores == 1] = yfull_true[scores == 1]
            yfull_predict[scores != 1] = 1 - yfull_true[scores != 1]
            tp, fp, fn, tn = get_counters(yfull_true, yfull_predict)
            return scores.mean(), scores.std(), tp, fp, fn, tn
        else:
            return scores.mean(), scores.std()

    def predict(self, test):
        pred = self.estimator.predict(test)
        full_doc_prediction = pred[0]
        if len(pred) > 1:
            # any remaining rows are fragments of the same document
            fragment_predictions = pred[1:]
            print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
            return full_doc_prediction, fragment_predictions
        return full_doc_prediction

    def predict_proba(self, test):
        assert hasattr(self.estimator, 'predict_proba'), 'the classifier is not calibrated'
        pred = self.estimator.predict_proba(test)
        full_doc_prediction = pred[0, 1]  # posterior probability of the positive class
        if len(pred) > 1:
            fragment_predictions = pred[1:, 1]
            print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
            return full_doc_prediction, fragment_predictions
        return full_doc_prediction, []
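

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). The synthetic matrices, labels and
# file names below are assumptions for demonstration; the real pipeline builds
# its feature matrices in data.features. With the default LogisticRegression,
# predict_proba is available without extra calibration.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    rng = np.random.RandomState(42)

    # 40 synthetic documents with 100 features each; half by the target author
    X = rng.rand(40, 100)
    y = np.array([1] * 20 + [0] * 20)
    files = np.array(['doc_{}.txt'.format(i) for i in range(40)])

    av = AuthorshipVerificator(nfolds=10, author_name='Dante')
    av.fit(X, y)

    # plain leave-one-out over whole documents (no fragment groups)
    f1_mean, f1_std = av.leave_one_out(X, y, files, groups=None)
    print('LOO F1: {:.3f} +- {:.3f}'.format(f1_mean, f1_std))

    # grouped variant: full documents occupy the first rows, their fragments are
    # appended after, and rows sharing a group id belong to the same document
    n_docs = 20
    y_docs = np.array([1] * 10 + [0] * 10)
    X_grouped = np.vstack([rng.rand(n_docs, 100),        # full documents
                           rng.rand(2 * n_docs, 100)])   # two fragments per document
    y_grouped = np.concatenate([y_docs, np.repeat(y_docs, 2)])
    groups = np.concatenate([np.arange(n_docs), np.repeat(np.arange(n_docs), 2)])
    files_grouped = np.array(
        ['doc_{}.txt'.format(i) for i in range(n_docs)] +
        ['doc_{}_frag{}.txt'.format(i, j) for i in range(n_docs) for j in (1, 2)]
    )
    av.leave_one_out(X_grouped, y_grouped, files_grouped, groups=groups)

    # score one unseen document
    prob, _ = av.predict_proba(rng.rand(1, 100))
    print('probability that {} wrote the test document: {:.3f}'.format(av.author_name, prob))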