import numpy as np
from scipy.sparse import csr_matrix, hstack
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer
from sklearn.model_selection import (GridSearchCV, LeaveOneOut, LeaveOneGroupOut,
                                     cross_val_score, StratifiedKFold)

from data.features import *
from util.evaluation import f1, get_counters


class AuthorshipVerificator(BaseEstimator):
    """Binary verifier deciding whether a document was written by the target author.

    Wraps a class-balanced LogisticRegression; when there are enough positive
    examples, the regularization strength C is tuned by grid search on
    stratified folds, optimizing F1."""

    def __init__(self, nfolds=10, param_grid={'C': np.logspace(-4, +3, 8)}, C=1., author_name=None):
        self.nfolds = nfolds
        self.param_grid = param_grid
        self.C = C
        self.author_name = author_name

    def fit(self, X, y):
        self.classifier = LogisticRegression(C=self.C, class_weight='balanced')
        y = np.asarray(y)
        positive_examples = y.sum()
        if positive_examples >= self.nfolds and self.param_grid is not None:
            print('optimizing {}'.format(self.classifier.__class__.__name__))
            folds = list(StratifiedKFold(n_splits=self.nfolds, shuffle=True, random_state=42).split(X, y))
            self.estimator = GridSearchCV(
                self.classifier, param_grid=self.param_grid, cv=folds, scoring=make_scorer(f1), n_jobs=-1
            )
        else:
            self.estimator = self.classifier

        self.estimator.fit(X, y)

        if isinstance(self.estimator, GridSearchCV):
            # best_score_ is already the mean cross-validated F1 of the best configuration
            f1_mean = self.estimator.best_score_
            print(f'Best params: {self.estimator.best_params_} (cross-validation F1={f1_mean:.3f})')
            self.estimator = self.estimator.best_estimator_

        return self

    def predict_with_fragments(self, test):
        # row 0 of `test` is the full document; any remaining rows are its fragments
        pred = self.estimator.predict(test)
        full_doc_prediction = pred[0]
        if len(pred) > 1:
            fragment_predictions = pred[1:]
            print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
            return full_doc_prediction, fragment_predictions
        return full_doc_prediction

    def predict(self, test):
        return self.estimator.predict(test)

    def predict_proba_with_fragments(self, test):
        assert hasattr(self.estimator, 'predict_proba'), 'the classifier is not calibrated'
        pred = self.estimator.predict_proba(test)
        full_doc_prediction = pred[0, 1]
        if len(pred) > 1:
            fragment_predictions = pred[1:, 1]
            print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
            return full_doc_prediction, fragment_predictions
        return full_doc_prediction, []

    def predict_proba(self, test):
        assert hasattr(self.estimator, 'predict_proba'), 'the classifier is not calibrated'
        return self.estimator.predict_proba(test)


def leave_one_out(model, X, y, files, groups=None, test_lowest_index_only=True, counters=False):
    if groups is None:
        print(f'Computing LOO without groups over {X.shape[0]} documents')
        folds = list(LeaveOneOut().split(X, y))
    else:
        print(f'Computing LOO with groups over {X.shape[0]} documents')
        logo = LeaveOneGroupOut()
        folds = list(logo.split(X, y, groups))
        if test_lowest_index_only:
            # keep only the lowest test index of each group, i.e., the full document, dropping its fragments
            print('ignoring fragments')
            folds = [(train, np.min(test, keepdims=True)) for train, test in folds]

    print(f'optimizing via grid search each of the {len(folds)} prediction problems')
    scores = cross_val_score(model, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1, verbose=10)
    misclassified = files[scores == 0].tolist()
    # if hasattr(model.estimator, 'predict_proba') and len(misclassified) > 0:
    #     misclassified_prob = model.estimator.predict_proba(csr_matrix(X)[scores == 0])[:, 1]
    #     misclassified_prob = misclassified_prob.flatten().tolist()
    #     misclassified = [f'{file} Pr={prob:.3f}' for file, prob in zip(misclassified, misclassified_prob)]
    print('misclassified texts:')
    print('\n'.join(misclassified))

    if counters and test_lowest_index_only:
        # each fold tests exactly one full document, so the first len(folds) labels line up with
        # the per-fold scores: a score of 1 means the label was matched, 0 means it was flipped
        yfull_true = y[:len(folds)]
        yfull_predict = np.zeros_like(yfull_true)
        yfull_predict[scores == 1] = yfull_true[scores == 1]
        yfull_predict[scores != 1] = 1 - yfull_true[scores != 1]
        tp, fp, fn, tn = get_counters(yfull_true, yfull_predict)
        return scores.mean(), scores.std(), tp, fp, fn, tn
    else:
        return scores.mean(), scores.std()
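
# Usage sketch for the verifier and the LOO evaluation above, on synthetic data.
# The matrix, labels, file names, and author name are illustrative assumptions;
# the real pipeline builds its feature matrices via data.features.
def _demo_verification():
    rng = np.random.RandomState(0)
    X = csr_matrix(rng.rand(40, 500))                      # 40 documents, 500 non-negative features
    y = rng.randint(0, 2, 40)                              # 1 = attributed to the candidate author
    files = np.array([f'doc_{i}.txt' for i in range(40)])

    av = AuthorshipVerificator(nfolds=3, author_name='Dante')
    av.fit(X, y)
    print('full-document prediction:', av.predict(X[:1]))

    # leave-one-out over single documents (no fragment groups)
    mean_score, std_score = leave_one_out(AuthorshipVerificator(nfolds=3), X, y, files)
    print(f'LOO score: {mean_score:.3f} +/- {std_score:.3f}')
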
class RangeFeatureSelector(BaseEstimator, TransformerMixin):
    """Applies chi-square feature selection only to the columns within `range`,
    keeping a `feat_sel_ratio` proportion of them and leaving all other columns untouched."""

    def __init__(self, range: slice, feat_sel_ratio: float):
        self.range = range
        self.feat_sel_ratio = feat_sel_ratio

    def fit(self, X, y):
        nF = self.range.stop - self.range.start
        num_feats = int(self.feat_sel_ratio * nF)
        self.selector = SelectKBest(chi2, k=num_feats)
        self.selector.fit(X[:, self.range], y)
        return self

    def transform(self, X):
        Z = self.selector.transform(X[:, self.range])
        # reassemble: columns before the range, the selected columns, columns after the range
        return csr_matrix(hstack([X[:, :self.range.start], Z, X[:, self.range.stop:]]))
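
# Usage sketch for RangeFeatureSelector: chi-square selection restricted to the first
# 300 columns (the slice and the selection ratio below are arbitrary example values).
def _demo_range_selection():
    rng = np.random.RandomState(0)
    X = csr_matrix(rng.rand(40, 500))
    y = rng.randint(0, 2, 40)
    selector = RangeFeatureSelector(range=slice(0, 300), feat_sel_ratio=0.1)
    Xsel = selector.fit(X, y).transform(X)
    print('shape after range-restricted selection:', Xsel.shape)  # (40, 230): 30 of 300 kept + 200 untouched


if __name__ == '__main__':
    _demo_verification()
    _demo_range_selection()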