improving experimental protocol

2020-11-20 21:41:47 +01:00 · 2020-11-20 21:41:47 +01:00 · 4c6c92f6d4
parent 121ecadfcb
commit 4c6c92f6d4
5 changed files with 378 additions and 57 deletions
--- a/src/author_identification.py
+++ b/src/author_identification.py
@ -1,10 +1,8 @@
 import util._hide_sklearn_warnings
 from sklearn.linear_model import LogisticRegression
 from data.dante_loader import load_latin_corpus, list_authors
 from data.features import *
 from model import AuthorshipVerificator
 from util.evaluation import f1_from_counters
 from sklearn.calibration import CalibratedClassifierCV
 import argparse
 AUTHORS_CORPUS_I = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
@ -15,6 +13,8 @@ AUTHORS_CORPUS_II = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'Boncompagn
                           'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
 DEBUG_MODE=True
 def main():
    log = open(args.log, 'wt')
    discarded = 0
@ -44,7 +44,7 @@ def main():
            conjugations_freq='latin',
            features_Mendenhall=True,
            features_sentenceLengths=True,
-            feature_selection_ratio=0.1,
+            feature_selection_ratio=0.1 if DEBUG_MODE else 1,
            wordngrams=True, n_wordngrams=(1, 2),
            charngrams=True, n_charngrams=(3, 4, 5),
            preserve_punctuation=False,
@ -58,22 +58,23 @@ def main():
        print('Fitting the Verificator')
        if args.C is None:
-            params = {'C': np.logspace(-3, +3, 7)}
+            params = {'C': np.logspace(0, 1, 2)} if DEBUG_MODE else {'C': np.logspace(-3, +3, 7)}
            C = 1.
        else:
            params = None
            C = args.C
        av = AuthorshipVerificator(C=C, params=params)
        av.fit(Xtr, ytr)
        if args.unknown:
            av = AuthorshipVerificator(C=C, param_grid=params)
            av.fit(Xtr, ytr)
            print(f'Checking for the hypothesis that {author} was the author of {args.unknown}')
            ep, ep_fragments = feature_extractor.transform(ep_text, return_fragments=True, window_size=3)
-            pred, _ = av.predict_proba(ep)
+            pred, _ = av.predict_proba_with_fragments(ep)
            tee(f'{args.unknown}: Posterior probability for {author} is {pred:.3f}', log)
        if args.loo:
            av = AuthorshipVerificator(C=C, param_grid=params)
            print('Validating the Verificator (Leave-One-Out)')
            score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(
                Xtr, ytr, files, groups, test_lowest_index_only=True, counters=True
@ -97,6 +98,9 @@ def main():
    log.close()
    if DEBUG_MODE:
        print('DEBUG_MODE ON')
 def tee(msg, log):
    print(msg)
@ -139,3 +143,4 @@ if __name__ == '__main__':
    assert args.unknown is None or os.path.exists(args.unknown), '"unknown file" does not exist'
    main()
--- a/src/author_identification_loo.py
+++ b/src/author_identification_loo.py
@ -0,0 +1,134 @@
 #import util._hide_sklearn_warnings
 from data.dante_loader import load_latin_corpus, list_authors
 from data.features import *
 from model import AuthorshipVerificator, RangeFeatureSelector, leave_one_out
 from util.evaluation import f1_from_counters
 import argparse
 from sklearn.pipeline import Pipeline
 AUTHORS_CORPUS_I = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
 AUTHORS_CORPUS_II = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna', 'ClaraAssisiensis',
                           'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio', 'GrazioloBambaglioli', 'GuidoDaPisa',
                           'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia', 'IohannesDePlanoCarpini',
                           'IulianusDeSpira', 'NicolaTrevet', 'PierDellaVigna', 'PietroAlighieri', 'RaimundusLullus',
                           'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
 DEBUG_MODE = False
 def main():
    """Run leave-one-out validation of one authorship verifier per author.

    Configuration is read from the module-level ``args`` namespace
    (``args.log``, ``args.authors``, ``args.corpuspath``). For every author
    with at least two positive documents, features are extracted and a
    pipeline made of two ``RangeFeatureSelector`` steps followed by an
    ``AuthorshipVerificator`` is evaluated with ``leave_one_out``. The F1 of
    each author, and the macro-/micro-averaged F1, are written to the log.
    """
    discarded = 0
    f1_scores = []
    counters = []
    # The context manager closes the log even if processing an author raises.
    with open(args.log, 'wt') as log:
        for i, author in enumerate(args.authors):
            path = args.corpuspath
            print('='*80)
            print(f'Authorship Identification for {author} (complete {i}/{len(args.authors)})')
            print(f'Corpus {path}')
            print('-'*80)
            positive, negative, pos_files, neg_files, _ = load_latin_corpus(path, positive_author=author)
            files = np.asarray(pos_files + neg_files)
            if len(positive) < 2:
                discarded += 1
                print(f'discarding analysis for {author} which has only {len(positive)} documents')
                continue
            n_full_docs = len(positive) + len(negative)
            print(f'read {n_full_docs} documents from {path}')
            feature_extractor = FeatureExtractor(
                function_words_freq='latin',
                conjugations_freq='latin',
                features_Mendenhall=True,
                features_sentenceLengths=True,
                feature_selection_ratio=0.05 if DEBUG_MODE else 1,
                wordngrams=True, n_wordngrams=(1, 2),
                charngrams=True, n_charngrams=(3, 4, 5),
                preserve_punctuation=False,
                split_documents=True,
                split_policy=split_by_sentences,
                window_size=3,
                normalize_features=True
            )
            Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
            print('Fitting the Verificator')
            params = {'C': np.logspace(0, 1, 2)} if DEBUG_MODE else {'C': np.logspace(-3, +3, 7)}
            slice_charngrams = feature_extractor.feature_range['_cngrams_task']
            slice_wordngrams = feature_extractor.feature_range['_wngrams_task']
            if slice_charngrams.start < slice_wordngrams.start:
                slice_first, slice_second = slice_charngrams, slice_wordngrams
            else:
                slice_first, slice_second = slice_wordngrams, slice_charngrams
            # The slice with the higher column indices is reduced first: each
            # selector shrinks its own range in place, so reducing the later
            # range first keeps the column indices of the earlier range valid.
            # NOTE(review): the step names are fixed labels and do not
            # necessarily match the feature set each step ends up selecting.
            av = Pipeline([
                ('featsel_cngrams', RangeFeatureSelector(slice_second, 0.1)),
                ('featsel_wngrams', RangeFeatureSelector(slice_first, 0.1)),
                ('av', AuthorshipVerificator(C=1, param_grid=params))
            ])
            print('Validating the Verificator (Leave-One-Out)')
            _, _, tp, fp, fn, tn = leave_one_out(
                av, Xtr, ytr, files, groups, test_lowest_index_only=True, counters=True
            )
            f1_scores.append(f1_from_counters(tp, fp, fn, tn))
            counters.append((tp, fp, fn, tn))
            tee(f'F1 for {author} = {f1_scores[-1]:.3f}', log)
            print(f'TP={tp} FP={fp} FN={fn} TN={tn}')
        print(f'Computing macro- and micro-averages (discarded {discarded}/{len(args.authors)})')
        if f1_scores:
            macro_f1 = np.array(f1_scores).mean()
            micro_f1 = f1_from_counters(*np.array(counters).sum(axis=0).tolist())
            tee(f'LOO Macro-F1 = {macro_f1:.3f}', log)
            tee(f'LOO Micro-F1 = {micro_f1:.3f}', log)
        else:
            # Averaging an empty result set would produce NaN and then fail
            # when unpacking the (scalar) summed counters.
            tee('No author could be evaluated: macro- and micro-averages are undefined', log)
        print()
    if DEBUG_MODE:
        print('DEBUG_MODE ON')
 def tee(msg, log):
    """Echo `msg` on stdout and append it, newline-terminated, to `log`.

    :param msg: message to report
    :param log: writable file-like object; flushed after the write
    """
    print(msg)
    entry = f'{msg}\n'
    log.write(entry)
    log.flush()
 if __name__ == '__main__':
    import os
    # Command-line interface: corpus location, positive author (or "ALL"), log path.
    parser = argparse.ArgumentParser(description='Authorship verification for MedLatin '
                                                 'submit each binary classifier to leave-one-out validation')
    parser.add_argument('corpuspath', type=str, metavar='CORPUSPATH',
                        help=f'Path to the directory containing the corpus (documents must be named '
                             f'<author>_<texname>.txt)')
    # nargs='?' makes the documented default effective: argparse ignores
    # `default` on a positional argument that is required.
    parser.add_argument('positive', type=str, nargs='?', default="Dante", metavar='AUTHOR',
                        help= f'Positive author for the hypothesis (default "Dante"); set to "ALL" to check '
                              f'every author')
    parser.add_argument('--log', type=str, metavar='PATH', default='./results.txt',
                        help='path to the log file where to write the results (default ./results.txt)')
    args = parser.parse_args()
    # Check the corpus path before it is handed to list_authors().
    assert os.path.exists(args.corpuspath), f'corpus path {args.corpuspath} does not exist'
    if args.positive == 'ALL':
        args.authors = list_authors(args.corpuspath, skip_prefix='Epistola')
    else:
        # Warn only when the author is absent from BOTH known lists, as the message states.
        if (args.positive not in AUTHORS_CORPUS_I) and (args.positive not in AUTHORS_CORPUS_II):
            print(f'warning: author {args.positive} is not in the known list of authors for CORPUS I nor CORPUS II')
        assert args.positive in list_authors(args.corpuspath, skip_prefix='Epistola'), 'unexpected author'
        args.authors = [args.positive]
    main()
--- a/src/author_identification_unknown.py
+++ b/src/author_identification_unknown.py
@ -0,0 +1,146 @@
 import util._hide_sklearn_warnings
 from data.dante_loader import load_latin_corpus, list_authors
 from data.features import *
 from model import AuthorshipVerificator
 from util.evaluation import f1_from_counters
 import argparse
 AUTHORS_CORPUS_I = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
 AUTHORS_CORPUS_II = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna', 'ClaraAssisiensis',
                           'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio', 'GrazioloBambaglioli', 'GuidoDaPisa',
                           'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia', 'IohannesDePlanoCarpini',
                           'IulianusDeSpira', 'NicolaTrevet', 'PierDellaVigna', 'PietroAlighieri', 'RaimundusLullus',
                           'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
 DEBUG_MODE=True
 def main():
    """Verify the authorship of an unknown document and/or run leave-one-out validation.

    Configuration is read from the module-level ``args`` namespace
    (``args.log``, ``args.authors``, ``args.corpuspath``, ``args.unknown``,
    ``args.loo``, ``args.C``). For every author with at least two positive
    documents:

    * if ``args.unknown`` is set, an ``AuthorshipVerificator`` is fitted and
      the posterior probability for the unknown document is logged;
    * if ``args.loo`` is set, a fresh verificator is evaluated with
      leave-one-out and its F1 is logged.

    When ``args.loo`` is set, macro- and micro-averaged F1 are logged at the end.
    """
    # leave_one_out is a module-level function of `model` (it takes the model
    # as first argument); imported here because the file-level import only
    # brings in AuthorshipVerificator.
    from model import leave_one_out

    discarded = 0
    f1_scores = []
    counters = []
    # The context manager closes the log even if processing an author raises.
    with open(args.log, 'wt') as log:
        for i, author in enumerate(args.authors):
            path = args.corpuspath
            print('='*80)
            print(f'Authorship Identification for {author} (complete {i}/{len(args.authors)})')
            print(f'Corpus {path}')
            print('-'*80)
            positive, negative, pos_files, neg_files, ep_text = load_latin_corpus(
                path, positive_author=author, unknown_target=args.unknown
            )
            files = np.asarray(pos_files + neg_files)
            if len(positive) < 2:
                discarded += 1
                print(f'discarding analysis for {author} which has only {len(positive)} documents')
                continue
            n_full_docs = len(positive) + len(negative)
            print(f'read {n_full_docs} documents from {path}')
            feature_extractor = FeatureExtractor(
                function_words_freq='latin',
                conjugations_freq='latin',
                features_Mendenhall=True,
                features_sentenceLengths=True,
                feature_selection_ratio=0.1 if DEBUG_MODE else 1,
                wordngrams=True, n_wordngrams=(1, 2),
                charngrams=True, n_charngrams=(3, 4, 5),
                preserve_punctuation=False,
                split_documents=True,
                split_policy=split_by_sentences,
                window_size=3,
                normalize_features=True
            )
            Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
            print('Fitting the Verificator')
            if args.C is None:
                # No C given: let the verificator search over a grid.
                params = {'C': np.logspace(0, 1, 2)} if DEBUG_MODE else {'C': np.logspace(-3, +3, 7)}
                C = 1.
            else:
                params = None
                C = args.C
            if args.unknown:
                av = AuthorshipVerificator(C=C, param_grid=params)
                av.fit(Xtr, ytr)
                print(f'Checking for the hypothesis that {author} was the author of {args.unknown}')
                ep, _ = feature_extractor.transform(ep_text, return_fragments=True, window_size=3)
                pred, _ = av.predict_proba_with_fragments(ep)
                tee(f'{args.unknown}: Posterior probability for {author} is {pred:.3f}', log)
            if args.loo:
                av = AuthorshipVerificator(C=C, param_grid=params)
                print('Validating the Verificator (Leave-One-Out)')
                _, _, tp, fp, fn, tn = leave_one_out(
                    av, Xtr, ytr, files, groups, test_lowest_index_only=True, counters=True
                )
                f1_scores.append(f1_from_counters(tp, fp, fn, tn))
                counters.append((tp, fp, fn, tn))
                tee(f'F1 for {author} = {f1_scores[-1]:.3f}', log)
                print(f'TP={tp} FP={fp} FN={fn} TN={tn}')
        if args.loo:
            print(f'Computing macro- and micro-averages (discarded {discarded}/{len(args.authors)})')
            if f1_scores:
                macro_f1 = np.array(f1_scores).mean()
                micro_f1 = f1_from_counters(*np.array(counters).sum(axis=0).tolist())
                tee(f'LOO Macro-F1 = {macro_f1:.3f}', log)
                tee(f'LOO Micro-F1 = {micro_f1:.3f}', log)
            else:
                # Averaging an empty result set would produce NaN and then fail
                # when unpacking the (scalar) summed counters.
                tee('No author could be evaluated: macro- and micro-averages are undefined', log)
            print()
    if DEBUG_MODE:
        print('DEBUG_MODE ON')
 def tee(msg, log):
    """Echo `msg` on stdout and append it, newline-terminated, to `log`.

    :param msg: message to report
    :param log: writable file-like object; flushed after the write
    """
    print(msg)
    entry = f'{msg}\n'
    log.write(entry)
    log.flush()
 if __name__ == '__main__':
    import os
    # Command-line interface: corpus location, positive author (or "ALL"),
    # optional unknown document, LOO switch, log path and fixed C.
    parser = argparse.ArgumentParser(description='Authorship verification for Epistola XIII')
    parser.add_argument('corpuspath', type=str, metavar='CORPUSPATH',
                        help=f'Path to the directory containing the corpus (documents must be named '
                             f'<author>_<texname>.txt)')
    # nargs='?' makes the documented default effective: argparse ignores
    # `default` on a positional argument that is required.
    parser.add_argument('positive', type=str, nargs='?', default="Dante", metavar='AUTHOR',
                        help= f'Positive author for the hypothesis (default "Dante"); set to "ALL" to check '
                              f'every author')
    parser.add_argument('--loo', default=False, action='store_true',
                        help='submit each binary classifier to leave-one-out validation')
    parser.add_argument('--unknown', type=str, metavar='PATH', default=None,
                        help='path to the file of unknown paternity (default None)')
    parser.add_argument('--log', type=str, metavar='PATH', default='./results.txt',
                        help='path to the log file where to write the results (default ./results.txt)')
    parser.add_argument('--C', type=float, metavar='C', default=None,
                        help='set the parameter C (trade off between error and margin) or leave as None to optimize')
    args = parser.parse_args()
    # Check the corpus path before it is handed to list_authors().
    assert os.path.exists(args.corpuspath), f'corpus path {args.corpuspath} does not exist'
    if args.positive == 'ALL':
        args.authors = list_authors(args.corpuspath, skip_prefix='Epistola')
    else:
        # Warn only when the author is absent from BOTH known lists, as the message states.
        if (args.positive not in AUTHORS_CORPUS_I) and (args.positive not in AUTHORS_CORPUS_II):
            print(f'warning: author {args.positive} is not in the known list of authors for CORPUS I nor CORPUS II')
        assert args.positive in list_authors(args.corpuspath, skip_prefix='Epistola'), 'unexpected author'
        args.authors = [args.positive]
    assert args.unknown or args.loo, 'error: nor an unknown document, nor LOO have been requested. Nothing to do.'
    assert args.unknown is None or os.path.exists(args.unknown), '"unknown file" does not exist'
    main()
--- a/src/data/features.py
+++ b/src/data/features.py
@ -367,6 +367,7 @@ class FeatureExtractor:
        self.feature_names = None
        self.wngrams_vectorizer = self.wngrams_selector = None
        self.cngrams_vectorizer = self.cngrams_selector = None
        self.feature_range = {}
    def fit_transform(self, positives, negatives):
        documents = positives + negatives
@ -423,11 +424,15 @@ class FeatureExtractor:
        else:
            return TEST
-    def _addfeatures(self, X, F, feat_names=None):
+    def _addfeatures(self, X, F, feat_set_name, feat_names=None):
        if self.normalize_features:
            normalize(F, axis=1, copy=False)
        self._register_feature_names(feat_names)
        last_col, n_cols = X.shape[1], F.shape[1]
        self.feature_range[feat_set_name] = slice(last_col, last_col+n_cols)
        print('adding feat-set slice ', feat_set_name, self.feature_range[feat_set_name])
        if issparse(F):
            return hstack((X, F))  # sparse
        else:
@ -445,6 +450,16 @@ class FeatureExtractor:
            self.feature_names = []
        self.feature_names.extend(feat_names)
    def get_feature_set(self, X, name):
        """Return the columns of `X` that belong to the feature set `name`.

        :param X: matrix supporting 2-D slicing (``X[:, slice]``)
        :param name: a key of ``self.feature_range``
        :return: the column slice of `X` recorded under `name`
        """
        assert name in self.feature_range, 'unknown feature set name'
        columns = self.feature_range[name]
        return X[:, columns]
    def get_feature_set_names(self):
        """Return the names of the feature sets recorded in ``self.feature_range``, as a new list."""
        return [feat_set_name for feat_set_name in self.feature_range]
    def get_feature_names(self):
        """Return ``self.feature_names`` as stored on this extractor (the same object, not a copy)."""
        return self.feature_names
    def _transform_parallel(self, documents, y=None, fit=False, n_jobs=-1):
        # initialize the document-by-feature vector
        X = np.empty((len(documents), 0))
@ -501,9 +516,9 @@ class FeatureExtractor:
        for out in outs:
            taskname = out['task']
            if taskname not in {'_wngrams_task', '_cngrams_task'}:
-                X = self._addfeatures(X, out['features'], out['f_names'] if fit else None)
+                X = self._addfeatures(X, out['features'], taskname, out['f_names'] if fit else None)
            else:
-                X = self._addfeatures(_tocsr(X), out['features'], out['f_names'] if fit else None)
+                X = self._addfeatures(_tocsr(X), out['features'], taskname, out['f_names'] if fit else None)
                if fit:
                    vectorizer, selector = out['vectorizer'], out['selector']
                    if taskname == '_wngrams_task' and self.wngrams_vectorizer is None:
--- a/src/model.py
+++ b/src/model.py
@ -1,30 +1,32 @@
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.metrics import make_scorer
 from sklearn.model_selection import GridSearchCV, LeaveOneOut, LeaveOneGroupOut, cross_val_score, StratifiedKFold
 from sklearn.linear_model import LogisticRegression
 from data.features import *
 from util.calibration import CalibratedClassifierCV
 from util.evaluation import f1, get_counters
-class AuthorshipVerificator:
+class AuthorshipVerificator(BaseEstimator):
-    def __init__(self, nfolds=10,
+    def __init__(self,
-                 params={'C': np.logspace(-4, +3, 8)},
+                 nfolds=10,
                 param_grid={'C': np.logspace(-4, +3, 8)},
                 C=1.,
                 author_name=None):
        self.nfolds = nfolds
-        self.params = params
+        self.param_grid = param_grid
-        self.author_name = author_name if author_name else 'this author'
+        self.C = C
-        self.classifier = LogisticRegression(C=C, class_weight='balanced')
+        self.author_name = author_name
    def fit(self, X, y):
        self.classifier = LogisticRegression(C=self.C, class_weight='balanced')
        y = np.asarray(y)
        positive_examples = y.sum()
-        if positive_examples >= self.nfolds and self.params is not None:
+        if positive_examples >= self.nfolds and self.param_grid is not None:
            print('optimizing {}'.format(self.classifier.__class__.__name__))
            folds = list(StratifiedKFold(n_splits=self.nfolds, shuffle=True, random_state=42).split(X, y))
            self.estimator = GridSearchCV(
-                self.classifier, param_grid=self.params, cv=folds, scoring=make_scorer(f1), n_jobs=-1
+                self.classifier, param_grid=self.param_grid, cv=folds, scoring=make_scorer(f1), n_jobs=-1
            )
        else:
            self.estimator = self.classifier
@ -36,44 +38,9 @@ class AuthorshipVerificator:
            print(f'Best params: {self.estimator.best_params_} (cross-validation F1={f1_mean:.3f})')
            self.estimator = self.estimator.best_estimator_
        #self.estimator = CalibratedClassifierCV(base_estimator=self.estimator, cv=self.nfolds, ensemble=False)
        #self.estimator.fit(X, y)
        return self
-    def leave_one_out(self, X, y, files, groups=None, test_lowest_index_only=True, counters=False):
+    def predict_with_fragments(self, test):
        if groups is None:
            print('Computing LOO without groups')
            folds = list(LeaveOneOut().split(X, y))
        else:
            print('Computing LOO with groups')
            logo = LeaveOneGroupOut()
            folds = list(logo.split(X, y, groups))
            if test_lowest_index_only:
                print('ignoring fragments')
                folds = [(train, np.min(test, keepdims=True)) for train, test in folds]
        scores = cross_val_score(self.estimator, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
        missclassified = files[scores == 0].tolist()
        #if hasattr(self.estimator, 'predict_proba') and len(missclassified) > 0:
        #    missclassified_prob = self.estimator.predict_proba(csr_matrix(X)[scores == 0])[:, 1]
        #    missclassified_prob = missclassified_prob.flatten().tolist()
        #    missclassified = [f'{file} Pr={prob:.3f}' for file, prob in zip(missclassified,missclassified_prob)]
        print('missclassified texts:')
        print('\n'.join(missclassified))
        if counters and test_lowest_index_only:
            yfull_true = y[:len(folds)]
            yfull_predict = np.zeros_like(yfull_true)
            yfull_predict[scores == 1] = yfull_true[scores == 1]
            yfull_predict[scores != 1] = 1-yfull_true[scores != 1]
            tp, fp, fn, tn = get_counters(yfull_true, yfull_predict)
            return scores.mean(), scores.std(), tp, fp, fn, tn
        else:
            return scores.mean(), scores.std()
    def predict(self, test):
        pred = self.estimator.predict(test)
        full_doc_prediction = pred[0]
        if len(pred) > 1:
@ -82,7 +49,10 @@ class AuthorshipVerificator:
            return full_doc_prediction, fragment_predictions
        return full_doc_prediction
-    def predict_proba(self, test):
+    def predict(self, test):
        return self.estimator.predict(test)
    def predict_proba_with_fragments(self, test):
        assert hasattr(self, 'predict_proba'), 'the classifier is not calibrated'
        pred = self.estimator.predict_proba(test)
        full_doc_prediction = pred[0,1]
@ -92,5 +62,56 @@ class AuthorshipVerificator:
            return full_doc_prediction, fragment_predictions
        return full_doc_prediction, []
    def predict_proba(self, test):
        """Return the probabilities computed by the underlying estimator.

        :param test: samples, passed unchanged to ``self.estimator.predict_proba``
        :return: whatever ``self.estimator.predict_proba(test)`` returns
        :raises AssertionError: if the underlying estimator has no ``predict_proba``
        """
        # The check must target the wrapped estimator: `self` always has a
        # `predict_proba` attribute (this very method), so testing `self`
        # could never fail.
        assert hasattr(self.estimator, 'predict_proba'), 'the classifier is not calibrated'
        return self.estimator.predict_proba(test)
 def leave_one_out(model, X, y, files, groups=None, test_lowest_index_only=True, counters=False):
    """Cross-validate `model`, holding out one sample (or one group) per fold.

    :param model: estimator handed to ``cross_val_score``
    :param X: feature matrix (``X.shape[0]`` is reported as the number of documents)
    :param y: labels; converted to a NumPy array
    :param files: file names; converted to a NumPy array and indexed with the
        per-fold scores to list the misclassified texts
    :param groups: if given, folds come from ``LeaveOneGroupOut`` over these
        group ids; otherwise from ``LeaveOneOut``
    :param test_lowest_index_only: with groups, keep only the lowest row index
        of every held-out group as test sample
    :param counters: if True (and `test_lowest_index_only` is True), also
        return the tp, fp, fn, tn counters
    :return: ``(mean, std)`` of the fold scores, followed by ``tp, fp, fn, tn``
        when counters are requested
    """
    # Boolean-mask indexing below requires arrays; plain lists are accepted too.
    y = np.asarray(y)
    files = np.asarray(files)
    if groups is None:
        print(f'Computing LOO without groups over {X.shape[0]} documents')
        folds = list(LeaveOneOut().split(X, y))
    else:
        print(f'Computing LOO with groups over {X.shape[0]} documents')
        logo = LeaveOneGroupOut()
        folds = list(logo.split(X, y, groups))
        if test_lowest_index_only:
            print('ignoring fragments')
            folds = [(train, np.min(test, keepdims=True)) for train, test in folds]
    print(f'optimizing via grid search each o the {len(folds)} prediction problems')
    scores = cross_val_score(model, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1, verbose=10)
    missclassified = files[scores == 0].tolist()
    print('missclassified texts:')
    print('\n'.join(missclassified))
    if counters and test_lowest_index_only:
        # NOTE(review): assumes the first len(folds) entries of y are the
        # labels of the samples actually tested in each fold - confirm with caller.
        yfull_true = y[:len(folds)]
        yfull_predict = np.zeros_like(yfull_true)
        # A fold score of exactly 1 is taken as a correct prediction; any
        # other score is taken as an error, i.e. the flipped true label.
        yfull_predict[scores == 1] = yfull_true[scores == 1]
        yfull_predict[scores != 1] = 1-yfull_true[scores != 1]
        tp, fp, fn, tn = get_counters(yfull_true, yfull_predict)
        return scores.mean(), scores.std(), tp, fp, fn, tn
    else:
        return scores.mean(), scores.std()
 class RangeFeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that applies chi2 ``SelectKBest`` to one column range only.

    Columns outside `range` are passed through untouched; the columns inside
    it are replaced, in place, by the selected subset.
    """

    def __init__(self, range: slice, feat_sel_ratio: float):
        # range: column slice to reduce; feat_sel_ratio: fraction of its columns to keep
        self.range = range
        self.feat_sel_ratio = feat_sel_ratio

    def fit(self, X, y):
        """Fit the selector on the columns of `X` inside the range; return self."""
        span = self.range
        width = span.stop - span.start
        # int() truncates the requested number of features
        n_selected = int(self.feat_sel_ratio * width)
        self.selector = SelectKBest(chi2, k=n_selected)
        self.selector.fit(X[:, span], y)
        return self

    def transform(self, X):
        """Return `X` as a CSR matrix with the range replaced by its selected columns."""
        span = self.range
        reduced = self.selector.transform(X[:, span])
        before = X[:, :span.start]
        after = X[:, span.stop:]
        return csr_matrix(hstack([before, reduced, after]))