From 4c6c92f6d4eb0641cbc39e7744f514f371714504 Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Fri, 20 Nov 2020 21:41:47 +0100 Subject: [PATCH] improving experimental protocol --- src/author_identification.py | 21 ++-- src/author_identification_loo.py | 134 ++++++++++++++++++++++++ src/author_identification_unknown.py | 146 +++++++++++++++++++++++++++ src/data/features.py | 21 +++- src/model.py | 113 ++++++++++++--------- 5 files changed, 378 insertions(+), 57 deletions(-) create mode 100755 src/author_identification_loo.py create mode 100755 src/author_identification_unknown.py diff --git a/src/author_identification.py b/src/author_identification.py index 6c984cb..ff1c894 100755 --- a/src/author_identification.py +++ b/src/author_identification.py @@ -1,10 +1,8 @@ import util._hide_sklearn_warnings -from sklearn.linear_model import LogisticRegression from data.dante_loader import load_latin_corpus, list_authors from data.features import * from model import AuthorshipVerificator from util.evaluation import f1_from_counters -from sklearn.calibration import CalibratedClassifierCV import argparse AUTHORS_CORPUS_I = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna'] @@ -15,6 +13,8 @@ AUTHORS_CORPUS_II = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'Boncompagn 'RyccardusDeSanctoGermano', 'ZonoDeMagnalis'] +DEBUG_MODE=True + def main(): log = open(args.log, 'wt') discarded = 0 @@ -44,7 +44,7 @@ def main(): conjugations_freq='latin', features_Mendenhall=True, features_sentenceLengths=True, - feature_selection_ratio=0.1, + feature_selection_ratio=0.1 if DEBUG_MODE else 1, wordngrams=True, n_wordngrams=(1, 2), charngrams=True, n_charngrams=(3, 4, 5), preserve_punctuation=False, @@ -58,22 +58,23 @@ def main(): print('Fitting the Verificator') if args.C is None: - params = {'C': np.logspace(-3, +3, 7)} + params = {'C': np.logspace(0, 1, 2)} if DEBUG_MODE else {'C': np.logspace(-3, +3, 7)} C = 1. else: params = None C = args.C - av = AuthorshipVerificator(C=C, params=params) - av.fit(Xtr, ytr) - if args.unknown: + av = AuthorshipVerificator(C=C, param_grid=params) + av.fit(Xtr, ytr) + print(f'Checking for the hypothesis that {author} was the author of {args.unknown}') ep, ep_fragments = feature_extractor.transform(ep_text, return_fragments=True, window_size=3) - pred, _ = av.predict_proba(ep) + pred, _ = av.predict_proba_with_fragments(ep) tee(f'{args.unknown}: Posterior probability for {author} is {pred:.3f}', log) if args.loo: + av = AuthorshipVerificator(C=C, param_grid=params) print('Validating the Verificator (Leave-One-Out)') score_ave, score_std, tp, fp, fn, tn = av.leave_one_out( Xtr, ytr, files, groups, test_lowest_index_only=True, counters=True @@ -97,6 +98,9 @@ def main(): log.close() + if DEBUG_MODE: + print('DEBUG_MODE ON') + def tee(msg, log): print(msg) @@ -139,3 +143,4 @@ if __name__ == '__main__': assert args.unknown is None or os.path.exists(args.unknown), '"unknown file" does not exist' main() + diff --git a/src/author_identification_loo.py b/src/author_identification_loo.py new file mode 100755 index 0000000..023315a --- /dev/null +++ b/src/author_identification_loo.py @@ -0,0 +1,134 @@ +#import util._hide_sklearn_warnings +from data.dante_loader import load_latin_corpus, list_authors +from data.features import * +from model import AuthorshipVerificator, RangeFeatureSelector, leave_one_out +from util.evaluation import f1_from_counters +import argparse +from sklearn.pipeline import Pipeline + +AUTHORS_CORPUS_I = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna'] +AUTHORS_CORPUS_II = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna', 'ClaraAssisiensis', + 'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio', 'GrazioloBambaglioli', 'GuidoDaPisa', + 'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia', 'IohannesDePlanoCarpini', + 'IulianusDeSpira', 'NicolaTrevet', 'PierDellaVigna', 'PietroAlighieri', 'RaimundusLullus', + 'RyccardusDeSanctoGermano', 'ZonoDeMagnalis'] + + +DEBUG_MODE = False + + +def main(): + log = open(args.log, 'wt') + discarded = 0 + f1_scores = [] + counters = [] + for i, author in enumerate(args.authors): + path = args.corpuspath + print('='*80) + print(f'Authorship Identification for {author} (complete {i}/{len(args.authors)})') + print(f'Corpus {path}') + print('-'*80) + + positive, negative, pos_files, neg_files, ep_text = load_latin_corpus(path, positive_author=author) + files = np.asarray(pos_files + neg_files) + if len(positive) < 2: + discarded += 1 + print(f'discarding analysis for {author} which has only {len(positive)} documents') + continue + + n_full_docs = len(positive) + len(negative) + print(f'read {n_full_docs} documents from {path}') + + feature_extractor = FeatureExtractor( + function_words_freq='latin', + conjugations_freq='latin', + features_Mendenhall=True, + features_sentenceLengths=True, + feature_selection_ratio=0.05 if DEBUG_MODE else 1, + wordngrams=True, n_wordngrams=(1, 2), + charngrams=True, n_charngrams=(3, 4, 5), + preserve_punctuation=False, + split_documents=True, + split_policy=split_by_sentences, + window_size=3, + normalize_features=True + ) + + Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative) + + print('Fitting the Verificator') + params = {'C': np.logspace(0, 1, 2)} if DEBUG_MODE else {'C': np.logspace(-3, +3, 7)} + + slice_charngrams = feature_extractor.feature_range['_cngrams_task'] + slice_wordngrams = feature_extractor.feature_range['_wngrams_task'] + if slice_charngrams.start < slice_wordngrams.start: + slice_first, slice_second = slice_charngrams, slice_wordngrams + else: + slice_first, slice_second = slice_wordngrams, slice_charngrams + av = Pipeline([ + ('featsel_cngrams', RangeFeatureSelector(slice_second, 0.1)), + ('featsel_wngrams', RangeFeatureSelector(slice_first, 0.1)), + ('av', AuthorshipVerificator(C=1, param_grid=params)) + ]) + + print('Validating the Verificator (Leave-One-Out)') + score_ave, score_std, tp, fp, fn, tn = leave_one_out( + av, Xtr, ytr, files, groups, test_lowest_index_only=True, counters=True + ) + f1_scores.append(f1_from_counters(tp, fp, fn, tn)) + counters.append((tp, fp, fn, tn)) + tee(f'F1 for {author} = {f1_scores[-1]:.3f}', log) + print(f'TP={tp} FP={fp} FN={fn} TN={tn}') + + print(f'Computing macro- and micro-averages (discarded {discarded}/{len(args.authors)})') + f1_scores = np.array(f1_scores) + counters = np.array(counters) + + macro_f1 = f1_scores.mean() + micro_f1 = f1_from_counters(*counters.sum(axis=0).tolist()) + + tee(f'LOO Macro-F1 = {macro_f1:.3f}', log) + tee(f'LOO Micro-F1 = {micro_f1:.3f}', log) + print() + + log.close() + + if DEBUG_MODE: + print('DEBUG_MODE ON') + + +def tee(msg, log): + print(msg) + log.write(f'{msg}\n') + log.flush() + + +if __name__ == '__main__': + import os + + # Training settings + parser = argparse.ArgumentParser(description='Authorship verification for MedLatin ' + 'submit each binary classifier to leave-one-out validation') + parser.add_argument('corpuspath', type=str, metavar='CORPUSPATH', + help=f'Path to the directory containing the corpus (documents must be named ' + f'_.txt)') + parser.add_argument('positive', type=str, default="Dante", metavar='AUTHOR', + help= f'Positive author for the hypothesis (default "Dante"); set to "ALL" to check ' + f'every author') + parser.add_argument('--log', type=str, metavar='PATH', default='./results.txt', + help='path to the log file where to write the results (default ./results.txt)') + + args = parser.parse_args() + + if args.positive == 'ALL': + args.authors = list_authors(args.corpuspath, skip_prefix='Epistola') + else: + if (args.positive not in AUTHORS_CORPUS_I) and (args.positive in AUTHORS_CORPUS_II): + print(f'warning: author {args.positive} is not in the known list of authors for CORPUS I nor CORPUS II') + assert args.positive in list_authors(args.corpuspath, skip_prefix='Epistola'), 'unexpected author' + args.authors = [args.positive] + + assert os.path.exists(args.corpuspath), f'corpus path {args.corpuspath} does not exist' + + main() + diff --git a/src/author_identification_unknown.py b/src/author_identification_unknown.py new file mode 100755 index 0000000..ff1c894 --- /dev/null +++ b/src/author_identification_unknown.py @@ -0,0 +1,146 @@ +import util._hide_sklearn_warnings +from data.dante_loader import load_latin_corpus, list_authors +from data.features import * +from model import AuthorshipVerificator +from util.evaluation import f1_from_counters +import argparse + +AUTHORS_CORPUS_I = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna'] +AUTHORS_CORPUS_II = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna', 'ClaraAssisiensis', + 'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio', 'GrazioloBambaglioli', 'GuidoDaPisa', + 'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia', 'IohannesDePlanoCarpini', + 'IulianusDeSpira', 'NicolaTrevet', 'PierDellaVigna', 'PietroAlighieri', 'RaimundusLullus', + 'RyccardusDeSanctoGermano', 'ZonoDeMagnalis'] + + +DEBUG_MODE=True + +def main(): + log = open(args.log, 'wt') + discarded = 0 + f1_scores = [] + counters = [] + for i, author in enumerate(args.authors): + path = args.corpuspath + print('='*80) + print(f'Authorship Identification for {author} (complete {i}/{len(args.authors)})') + print(f'Corpus {path}') + print('-'*80) + + positive, negative, pos_files, neg_files, ep_text = load_latin_corpus( + path, positive_author=author, unknown_target=args.unknown + ) + files = np.asarray(pos_files + neg_files) + if len(positive) < 2: + discarded += 1 + print(f'discarding analysis for {author} which has only {len(positive)} documents') + continue + + n_full_docs = len(positive) + len(negative) + print(f'read {n_full_docs} documents from {path}') + + feature_extractor = FeatureExtractor( + function_words_freq='latin', + conjugations_freq='latin', + features_Mendenhall=True, + features_sentenceLengths=True, + feature_selection_ratio=0.1 if DEBUG_MODE else 1, + wordngrams=True, n_wordngrams=(1, 2), + charngrams=True, n_charngrams=(3, 4, 5), + preserve_punctuation=False, + split_documents=True, + split_policy=split_by_sentences, + window_size=3, + normalize_features=True + ) + + Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative) + + print('Fitting the Verificator') + if args.C is None: + params = {'C': np.logspace(0, 1, 2)} if DEBUG_MODE else {'C': np.logspace(-3, +3, 7)} + C = 1. + else: + params = None + C = args.C + + if args.unknown: + av = AuthorshipVerificator(C=C, param_grid=params) + av.fit(Xtr, ytr) + + print(f'Checking for the hypothesis that {author} was the author of {args.unknown}') + ep, ep_fragments = feature_extractor.transform(ep_text, return_fragments=True, window_size=3) + pred, _ = av.predict_proba_with_fragments(ep) + tee(f'{args.unknown}: Posterior probability for {author} is {pred:.3f}', log) + + if args.loo: + av = AuthorshipVerificator(C=C, param_grid=params) + print('Validating the Verificator (Leave-One-Out)') + score_ave, score_std, tp, fp, fn, tn = av.leave_one_out( + Xtr, ytr, files, groups, test_lowest_index_only=True, counters=True + ) + f1_scores.append(f1_from_counters(tp, fp, fn, tn)) + counters.append((tp, fp, fn, tn)) + tee(f'F1 for {author} = {f1_scores[-1]:.3f}', log) + print(f'TP={tp} FP={fp} FN={fn} TN={tn}') + + if args.loo: + print(f'Computing macro- and micro-averages (discarded {discarded}/{len(args.authors)})') + f1_scores = np.array(f1_scores) + counters = np.array(counters) + + macro_f1 = f1_scores.mean() + micro_f1 = f1_from_counters(*counters.sum(axis=0).tolist()) + + tee(f'LOO Macro-F1 = {macro_f1:.3f}', log) + tee(f'LOO Micro-F1 = {micro_f1:.3f}', log) + print() + + log.close() + + if DEBUG_MODE: + print('DEBUG_MODE ON') + + +def tee(msg, log): + print(msg) + log.write(f'{msg}\n') + log.flush() + + +if __name__ == '__main__': + import os + + # Training settings + parser = argparse.ArgumentParser(description='Authorship verification for Epistola XIII') + parser.add_argument('corpuspath', type=str, metavar='CORPUSPATH', + help=f'Path to the directory containing the corpus (documents must be named ' + f'_.txt)') + parser.add_argument('positive', type=str, default="Dante", metavar='AUTHOR', + help= f'Positive author for the hypothesis (default "Dante"); set to "ALL" to check ' + f'every author') + parser.add_argument('--loo', default=False, action='store_true', + help='submit each binary classifier to leave-one-out validation') + parser.add_argument('--unknown', type=str, metavar='PATH', default=None, + help='path to the file of unknown paternity (default None)') + parser.add_argument('--log', type=str, metavar='PATH', default='./results.txt', + help='path to the log file where to write the results (default ./results.txt)') + parser.add_argument('--C', type=float, metavar='C', default=None, + help='set the parameter C (trade off between error and margin) or leave as None to optimize') + + args = parser.parse_args() + + if args.positive == 'ALL': + args.authors = list_authors(args.corpuspath, skip_prefix='Epistola') + else: + if (args.positive not in AUTHORS_CORPUS_I) and (args.positive in AUTHORS_CORPUS_II): + print(f'warning: author {args.positive} is not in the known list of authors for CORPUS I nor CORPUS II') + assert args.positive in list_authors(args.corpuspath, skip_prefix='Epistola'), 'unexpected author' + args.authors = [args.positive] + + assert args.unknown or args.loo, 'error: nor an unknown document, nor LOO have been requested. Nothing to do.' + assert os.path.exists(args.corpuspath), f'corpus path {args.corpuspath} does not exist' + assert args.unknown is None or os.path.exists(args.unknown), '"unknown file" does not exist' + + main() + diff --git a/src/data/features.py b/src/data/features.py index 8c9c4c6..f1bef2f 100755 --- a/src/data/features.py +++ b/src/data/features.py @@ -367,6 +367,7 @@ class FeatureExtractor: self.feature_names = None self.wngrams_vectorizer = self.wngrams_selector = None self.cngrams_vectorizer = self.cngrams_selector = None + self.feature_range = {} def fit_transform(self, positives, negatives): documents = positives + negatives @@ -423,11 +424,15 @@ class FeatureExtractor: else: return TEST - def _addfeatures(self, X, F, feat_names=None): + def _addfeatures(self, X, F, feat_set_name, feat_names=None): if self.normalize_features: normalize(F, axis=1, copy=False) self._register_feature_names(feat_names) + last_col, n_cols = X.shape[1], F.shape[1] + self.feature_range[feat_set_name] = slice(last_col, last_col+n_cols) + print('adding feat-set slice ', feat_set_name, self.feature_range[feat_set_name]) + if issparse(F): return hstack((X, F)) # sparse else: @@ -445,6 +450,16 @@ class FeatureExtractor: self.feature_names = [] self.feature_names.extend(feat_names) + def get_feature_set(self, X, name): + assert name in self.feature_range, 'unknown feature set name' + return X[:,self.feature_range[name]] + + def get_feature_set_names(self): + return list(self.feature_range.keys()) + + def get_feature_names(self): + return self.feature_names + def _transform_parallel(self, documents, y=None, fit=False, n_jobs=-1): # initialize the document-by-feature vector X = np.empty((len(documents), 0)) @@ -501,9 +516,9 @@ class FeatureExtractor: for out in outs: taskname = out['task'] if taskname not in {'_wngrams_task', '_cngrams_task'}: - X = self._addfeatures(X, out['features'], out['f_names'] if fit else None) + X = self._addfeatures(X, out['features'], taskname, out['f_names'] if fit else None) else: - X = self._addfeatures(_tocsr(X), out['features'], out['f_names'] if fit else None) + X = self._addfeatures(_tocsr(X), out['features'], taskname, out['f_names'] if fit else None) if fit: vectorizer, selector = out['vectorizer'], out['selector'] if taskname == '_wngrams_task' and self.wngrams_vectorizer is None: diff --git a/src/model.py b/src/model.py index 0fc17e4..45b82f0 100755 --- a/src/model.py +++ b/src/model.py @@ -1,30 +1,32 @@ +from sklearn.base import BaseEstimator, TransformerMixin from sklearn.metrics import make_scorer from sklearn.model_selection import GridSearchCV, LeaveOneOut, LeaveOneGroupOut, cross_val_score, StratifiedKFold from sklearn.linear_model import LogisticRegression from data.features import * -from util.calibration import CalibratedClassifierCV from util.evaluation import f1, get_counters -class AuthorshipVerificator: +class AuthorshipVerificator(BaseEstimator): - def __init__(self, nfolds=10, - params={'C': np.logspace(-4, +3, 8)}, + def __init__(self, + nfolds=10, + param_grid={'C': np.logspace(-4, +3, 8)}, C=1., author_name=None): self.nfolds = nfolds - self.params = params - self.author_name = author_name if author_name else 'this author' - self.classifier = LogisticRegression(C=C, class_weight='balanced') + self.param_grid = param_grid + self.C = C + self.author_name = author_name def fit(self, X, y): + self.classifier = LogisticRegression(C=self.C, class_weight='balanced') y = np.asarray(y) positive_examples = y.sum() - if positive_examples >= self.nfolds and self.params is not None: + if positive_examples >= self.nfolds and self.param_grid is not None: print('optimizing {}'.format(self.classifier.__class__.__name__)) folds = list(StratifiedKFold(n_splits=self.nfolds, shuffle=True, random_state=42).split(X, y)) self.estimator = GridSearchCV( - self.classifier, param_grid=self.params, cv=folds, scoring=make_scorer(f1), n_jobs=-1 + self.classifier, param_grid=self.param_grid, cv=folds, scoring=make_scorer(f1), n_jobs=-1 ) else: self.estimator = self.classifier @@ -36,44 +38,9 @@ class AuthorshipVerificator: print(f'Best params: {self.estimator.best_params_} (cross-validation F1={f1_mean:.3f})') self.estimator = self.estimator.best_estimator_ - #self.estimator = CalibratedClassifierCV(base_estimator=self.estimator, cv=self.nfolds, ensemble=False) - #self.estimator.fit(X, y) - return self - def leave_one_out(self, X, y, files, groups=None, test_lowest_index_only=True, counters=False): - if groups is None: - print('Computing LOO without groups') - folds = list(LeaveOneOut().split(X, y)) - else: - print('Computing LOO with groups') - logo = LeaveOneGroupOut() - folds = list(logo.split(X, y, groups)) - if test_lowest_index_only: - print('ignoring fragments') - folds = [(train, np.min(test, keepdims=True)) for train, test in folds] - - scores = cross_val_score(self.estimator, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1) - missclassified = files[scores == 0].tolist() - #if hasattr(self.estimator, 'predict_proba') and len(missclassified) > 0: - # missclassified_prob = self.estimator.predict_proba(csr_matrix(X)[scores == 0])[:, 1] - # missclassified_prob = missclassified_prob.flatten().tolist() - # missclassified = [f'{file} Pr={prob:.3f}' for file, prob in zip(missclassified,missclassified_prob)] - print('missclassified texts:') - print('\n'.join(missclassified)) - - - if counters and test_lowest_index_only: - yfull_true = y[:len(folds)] - yfull_predict = np.zeros_like(yfull_true) - yfull_predict[scores == 1] = yfull_true[scores == 1] - yfull_predict[scores != 1] = 1-yfull_true[scores != 1] - tp, fp, fn, tn = get_counters(yfull_true, yfull_predict) - return scores.mean(), scores.std(), tp, fp, fn, tn - else: - return scores.mean(), scores.std() - - def predict(self, test): + def predict_with_fragments(self, test): pred = self.estimator.predict(test) full_doc_prediction = pred[0] if len(pred) > 1: @@ -82,7 +49,10 @@ class AuthorshipVerificator: return full_doc_prediction, fragment_predictions return full_doc_prediction - def predict_proba(self, test): + def predict(self, test): + return self.estimator.predict(test) + + def predict_proba_with_fragments(self, test): assert hasattr(self, 'predict_proba'), 'the classifier is not calibrated' pred = self.estimator.predict_proba(test) full_doc_prediction = pred[0,1] @@ -92,5 +62,56 @@ class AuthorshipVerificator: return full_doc_prediction, fragment_predictions return full_doc_prediction, [] + def predict_proba(self, test): + assert hasattr(self, 'predict_proba'), 'the classifier is not calibrated' + return self.estimator.predict_proba(test) +def leave_one_out(model, X, y, files, groups=None, test_lowest_index_only=True, counters=False): + if groups is None: + print(f'Computing LOO without groups over {X.shape[0]} documents') + folds = list(LeaveOneOut().split(X, y)) + else: + print(f'Computing LOO with groups over {X.shape[0]} documents') + logo = LeaveOneGroupOut() + folds = list(logo.split(X, y, groups)) + if test_lowest_index_only: + print('ignoring fragments') + folds = [(train, np.min(test, keepdims=True)) for train, test in folds] + + print(f'optimizing via grid search each o the {len(folds)} prediction problems') + scores = cross_val_score(model, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1, verbose=10) + missclassified = files[scores == 0].tolist() + #if hasattr(self.estimator, 'predict_proba') and len(missclassified) > 0: + # missclassified_prob = self.estimator.predict_proba(csr_matrix(X)[scores == 0])[:, 1] + # missclassified_prob = missclassified_prob.flatten().tolist() + # missclassified = [f'{file} Pr={prob:.3f}' for file, prob in zip(missclassified,missclassified_prob)] + print('missclassified texts:') + print('\n'.join(missclassified)) + + if counters and test_lowest_index_only: + yfull_true = y[:len(folds)] + yfull_predict = np.zeros_like(yfull_true) + yfull_predict[scores == 1] = yfull_true[scores == 1] + yfull_predict[scores != 1] = 1-yfull_true[scores != 1] + tp, fp, fn, tn = get_counters(yfull_true, yfull_predict) + return scores.mean(), scores.std(), tp, fp, fn, tn + else: + return scores.mean(), scores.std() + + +class RangeFeatureSelector(BaseEstimator, TransformerMixin): + def __init__(self, range: slice, feat_sel_ratio: float): + self.range = range + self.feat_sel_ratio = feat_sel_ratio + + def fit(self, X, y): + nF = self.range.stop-self.range.start + num_feats = int(self.feat_sel_ratio * nF) + self.selector = SelectKBest(chi2, k=num_feats) + self.selector.fit(X[:,self.range], y) + return self + + def transform(self, X): + Z = self.selector.transform(X[:,self.range]) + return csr_matrix(hstack([X[:,:self.range.start], Z, X[:,self.range.stop:]]))