From 4c6c92f6d4eb0641cbc39e7744f514f371714504 Mon Sep 17 00:00:00 2001
From: Alex Moreo <alejandro.moreo@isti.cnr.it>
Date: Fri, 20 Nov 2020 21:41:47 +0100
Subject: [PATCH] improving experimental protocol

---
 src/author_identification.py         |  21 ++--
 src/author_identification_loo.py     | 134 ++++++++++++++++++++++++
 src/author_identification_unknown.py | 146 +++++++++++++++++++++++++++
 src/data/features.py                 |  21 +++-
 src/model.py                         | 113 ++++++++++++---------
 5 files changed, 378 insertions(+), 57 deletions(-)
 create mode 100755 src/author_identification_loo.py
 create mode 100755 src/author_identification_unknown.py

diff --git a/src/author_identification.py b/src/author_identification.py
index 6c984cb..ff1c894 100755
--- a/src/author_identification.py
+++ b/src/author_identification.py
@@ -1,10 +1,8 @@
 import util._hide_sklearn_warnings
-from sklearn.linear_model import LogisticRegression
 from data.dante_loader import load_latin_corpus, list_authors
 from data.features import *
 from model import AuthorshipVerificator
 from util.evaluation import f1_from_counters
-from sklearn.calibration import CalibratedClassifierCV
 import argparse
 
 AUTHORS_CORPUS_I = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
@@ -15,6 +13,8 @@ AUTHORS_CORPUS_II = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'Boncompagn
                            'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
 
 
+DEBUG_MODE=True
+
 def main():
     log = open(args.log, 'wt')
     discarded = 0
@@ -44,7 +44,7 @@ def main():
             conjugations_freq='latin',
             features_Mendenhall=True,
             features_sentenceLengths=True,
-            feature_selection_ratio=0.1,
+            feature_selection_ratio=0.1 if DEBUG_MODE else 1,
             wordngrams=True, n_wordngrams=(1, 2),
             charngrams=True, n_charngrams=(3, 4, 5),
             preserve_punctuation=False,
@@ -58,22 +58,23 @@ def main():
 
         print('Fitting the Verificator')
         if args.C is None:
-            params = {'C': np.logspace(-3, +3, 7)}
+            params = {'C': np.logspace(0, 1, 2)} if DEBUG_MODE else {'C': np.logspace(-3, +3, 7)}
             C = 1.
         else:
             params = None
             C = args.C
 
-        av = AuthorshipVerificator(C=C, params=params)
-        av.fit(Xtr, ytr)
-
         if args.unknown:
+            av = AuthorshipVerificator(C=C, param_grid=params)
+            av.fit(Xtr, ytr)
+
             print(f'Checking for the hypothesis that {author} was the author of {args.unknown}')
             ep, ep_fragments = feature_extractor.transform(ep_text, return_fragments=True, window_size=3)
-            pred, _ = av.predict_proba(ep)
+            pred, _ = av.predict_proba_with_fragments(ep)
             tee(f'{args.unknown}: Posterior probability for {author} is {pred:.3f}', log)
 
         if args.loo:
+            av = AuthorshipVerificator(C=C, param_grid=params)
             print('Validating the Verificator (Leave-One-Out)')
             score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(
                 Xtr, ytr, files, groups, test_lowest_index_only=True, counters=True
@@ -97,6 +98,9 @@ def main():
 
     log.close()
 
+    if DEBUG_MODE:
+        print('DEBUG_MODE ON')
+
 
 def tee(msg, log):
     print(msg)
@@ -139,3 +143,4 @@ if __name__ == '__main__':
     assert args.unknown is None or os.path.exists(args.unknown), '"unknown file" does not exist'
 
     main()
+
diff --git a/src/author_identification_loo.py b/src/author_identification_loo.py
new file mode 100755
index 0000000..023315a
--- /dev/null
+++ b/src/author_identification_loo.py
@@ -0,0 +1,134 @@
+#import util._hide_sklearn_warnings
+from data.dante_loader import load_latin_corpus, list_authors
+from data.features import *
+from model import AuthorshipVerificator, RangeFeatureSelector, leave_one_out
+from util.evaluation import f1_from_counters
+import argparse
+from sklearn.pipeline import Pipeline
+
+AUTHORS_CORPUS_I = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
+AUTHORS_CORPUS_II = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna', 'ClaraAssisiensis',
+                           'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio', 'GrazioloBambaglioli', 'GuidoDaPisa',
+                           'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia', 'IohannesDePlanoCarpini',
+                           'IulianusDeSpira', 'NicolaTrevet', 'PierDellaVigna', 'PietroAlighieri', 'RaimundusLullus',
+                           'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
+
+
+DEBUG_MODE = False
+
+
+def main():
+    """Leave-one-out evaluation of one verifier per author in args.authors; F1 results are tee'd to args.log."""
+    log = open(args.log, 'wt')
+    discarded = 0
+    f1_scores, counters = [], []
+    for i, author in enumerate(args.authors):
+        path = args.corpuspath
+        print('='*80)
+        print(f'Authorship Identification for {author} (complete {i}/{len(args.authors)})')
+        print(f'Corpus {path}')
+        print('-'*80)
+
+        positive, negative, pos_files, neg_files, ep_text = load_latin_corpus(path, positive_author=author)
+        files = np.asarray(pos_files + neg_files)
+        if len(positive) < 2:
+            discarded += 1
+            print(f'discarding analysis for {author} which has only {len(positive)} documents')
+            continue
+
+        n_full_docs = len(positive) + len(negative)
+        print(f'read {n_full_docs} documents from {path}')
+
+        feature_extractor = FeatureExtractor(
+            function_words_freq='latin',
+            conjugations_freq='latin',
+            features_Mendenhall=True,
+            features_sentenceLengths=True,
+            feature_selection_ratio=0.05 if DEBUG_MODE else 1,
+            wordngrams=True, n_wordngrams=(1, 2),
+            charngrams=True, n_charngrams=(3, 4, 5),
+            preserve_punctuation=False,
+            split_documents=True,
+            split_policy=split_by_sentences,
+            window_size=3,
+            normalize_features=True
+        )
+
+        Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
+
+        print('Fitting the Verificator')
+        params = {'C': np.logspace(0, 1, 2)} if DEBUG_MODE else {'C': np.logspace(-3, +3, 7)}
+
+        slice_charngrams = feature_extractor.feature_range['_cngrams_task']
+        slice_wordngrams = feature_extractor.feature_range['_wngrams_task']
+        # Select on the later column range first: RangeFeatureSelector.transform shrinks its range and shifts
+        # every column after it, so the earlier slice stays valid only if it is handled last.
+        slice_first, slice_second = sorted((slice_wordngrams, slice_charngrams), key=lambda s: s.start)
+        av = Pipeline([
+            ('featsel_second_range', RangeFeatureSelector(slice_second, 0.1)),
+            ('featsel_first_range', RangeFeatureSelector(slice_first, 0.1)),
+            ('av', AuthorshipVerificator(C=1, param_grid=params))
+        ])
+
+        print('Validating the Verificator (Leave-One-Out)')
+        score_ave, score_std, tp, fp, fn, tn = leave_one_out(
+            av, Xtr, ytr, files, groups, test_lowest_index_only=True, counters=True
+        )
+        f1_scores.append(f1_from_counters(tp, fp, fn, tn))
+        counters.append((tp, fp, fn, tn))
+        tee(f'F1 for {author} = {f1_scores[-1]:.3f}', log)
+        print(f'TP={tp} FP={fp} FN={fn} TN={tn}')
+
+    print(f'Computing macro- and micro-averages (discarded {discarded}/{len(args.authors)})')
+    if f1_scores:  # every author may have been discarded; averaging/unpacking an empty result would crash
+        f1_scores = np.array(f1_scores)
+        counters = np.array(counters)
+
+        macro_f1 = f1_scores.mean()
+        micro_f1 = f1_from_counters(*counters.sum(axis=0).tolist())
+
+        tee(f'LOO Macro-F1 = {macro_f1:.3f}', log)
+        tee(f'LOO Micro-F1 = {micro_f1:.3f}', log)
+        print()
+
+    log.close()
+
+    if DEBUG_MODE:
+        print('DEBUG_MODE ON')
+
+
+def tee(msg, log):
+    """Echo `msg` on stdout and append it as one newline-terminated line to `log`, flushing at once."""
+    print(msg)
+    print(msg, file=log, flush=True)
+
+
+if __name__ == '__main__':
+    import os
+
+    # Training settings
+    parser = argparse.ArgumentParser(description='Authorship verification for MedLatin '
+                                                 'submit each binary classifier to leave-one-out validation')
+    parser.add_argument('corpuspath', type=str, metavar='CORPUSPATH',
+                        help=f'Path to the directory containing the corpus (documents must be named '
+                             f'<author>_<texname>.txt)')
+    parser.add_argument('positive', type=str, nargs='?', default="Dante", metavar='AUTHOR',  # '?' enables default
+                        help= f'Positive author for the hypothesis (default "Dante"); set to "ALL" to check '
+                              f'every author')
+    parser.add_argument('--log', type=str, metavar='PATH', default='./results.txt',
+                        help='path to the log file where to write the results (default ./results.txt)')
+
+    args = parser.parse_args()
+
+    assert os.path.exists(args.corpuspath), f'corpus path {args.corpuspath} does not exist'  # before it is used
+
+    if args.positive == 'ALL':
+        args.authors = list_authors(args.corpuspath, skip_prefix='Epistola')
+    else:
+        if (args.positive not in AUTHORS_CORPUS_I) and (args.positive not in AUTHORS_CORPUS_II):
+            print(f'warning: author {args.positive} is not in the known list of authors for CORPUS I nor CORPUS II')
+        assert args.positive in list_authors(args.corpuspath, skip_prefix='Epistola'), 'unexpected author'
+        args.authors = [args.positive]
+
+    main()
+
diff --git a/src/author_identification_unknown.py b/src/author_identification_unknown.py
new file mode 100755
index 0000000..ff1c894
--- /dev/null
+++ b/src/author_identification_unknown.py
@@ -0,0 +1,146 @@
+import util._hide_sklearn_warnings
+from data.dante_loader import load_latin_corpus, list_authors
+from data.features import *
+from model import AuthorshipVerificator
+from util.evaluation import f1_from_counters
+import argparse
+
+AUTHORS_CORPUS_I = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
+AUTHORS_CORPUS_II = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna', 'ClaraAssisiensis',
+                           'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio', 'GrazioloBambaglioli', 'GuidoDaPisa',
+                           'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia', 'IohannesDePlanoCarpini',
+                           'IulianusDeSpira', 'NicolaTrevet', 'PierDellaVigna', 'PietroAlighieri', 'RaimundusLullus',
+                           'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
+
+
+DEBUG_MODE=True
+
+def main():
+    """Fit one verifier per author in args.authors; score the unknown text and/or run leave-one-out, per args."""
+    from model import leave_one_out  # LOO is a module-level function of model.py, not a verifier method
+    log = open(args.log, 'wt')
+    discarded = 0
+    f1_scores, counters = [], []
+    for i, author in enumerate(args.authors):
+        path = args.corpuspath
+        print('='*80)
+        print(f'Authorship Identification for {author} (complete {i}/{len(args.authors)})')
+        print(f'Corpus {path}')
+        print('-'*80)
+
+        positive, negative, pos_files, neg_files, ep_text = load_latin_corpus(
+            path, positive_author=author, unknown_target=args.unknown
+        )
+        files = np.asarray(pos_files + neg_files)
+        if len(positive) < 2:
+            discarded += 1
+            print(f'discarding analysis for {author} which has only {len(positive)} documents')
+            continue
+
+        n_full_docs = len(positive) + len(negative)
+        print(f'read {n_full_docs} documents from {path}')
+
+        feature_extractor = FeatureExtractor(
+            function_words_freq='latin',
+            conjugations_freq='latin',
+            features_Mendenhall=True,
+            features_sentenceLengths=True,
+            feature_selection_ratio=0.1 if DEBUG_MODE else 1,
+            wordngrams=True, n_wordngrams=(1, 2),
+            charngrams=True, n_charngrams=(3, 4, 5),
+            preserve_punctuation=False,
+            split_documents=True,
+            split_policy=split_by_sentences,
+            window_size=3,
+            normalize_features=True
+        )
+
+        Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
+
+        print('Fitting the Verificator')
+        if args.C is None:
+            params = {'C': np.logspace(0, 1, 2)} if DEBUG_MODE else {'C': np.logspace(-3, +3, 7)}
+            C = 1.
+        else:
+            params = None
+            C = args.C
+
+        if args.unknown:
+            av = AuthorshipVerificator(C=C, param_grid=params)
+            av.fit(Xtr, ytr)
+
+            print(f'Checking for the hypothesis that {author} was the author of {args.unknown}')
+            ep, ep_fragments = feature_extractor.transform(ep_text, return_fragments=True, window_size=3)
+            pred, _ = av.predict_proba_with_fragments(ep)
+            tee(f'{args.unknown}: Posterior probability for {author} is {pred:.3f}', log)
+
+        if args.loo:
+            av = AuthorshipVerificator(C=C, param_grid=params)
+            print('Validating the Verificator (Leave-One-Out)')
+            score_ave, score_std, tp, fp, fn, tn = leave_one_out(
+                av, Xtr, ytr, files, groups, test_lowest_index_only=True, counters=True)
+            f1_scores.append(f1_from_counters(tp, fp, fn, tn))
+            counters.append((tp, fp, fn, tn))
+            tee(f'F1 for {author} = {f1_scores[-1]:.3f}', log)
+            print(f'TP={tp} FP={fp} FN={fn} TN={tn}')
+
+    if args.loo and f1_scores:  # skip averaging when every author was discarded (empty results would crash)
+        print(f'Computing macro- and micro-averages (discarded {discarded}/{len(args.authors)})')
+        f1_scores = np.array(f1_scores)
+        counters = np.array(counters)
+
+        macro_f1 = f1_scores.mean()
+        micro_f1 = f1_from_counters(*counters.sum(axis=0).tolist())
+
+        tee(f'LOO Macro-F1 = {macro_f1:.3f}', log)
+        tee(f'LOO Micro-F1 = {micro_f1:.3f}', log)
+        print()
+
+    log.close()
+
+    if DEBUG_MODE:
+        print('DEBUG_MODE ON')
+
+
+def tee(msg, log):
+    """Echo `msg` on stdout and append it as one newline-terminated line to `log`, flushing at once."""
+    print(msg)
+    print(msg, file=log, flush=True)
+
+
+if __name__ == '__main__':
+    import os
+
+    # Training settings
+    parser = argparse.ArgumentParser(description='Authorship verification for Epistola XIII')
+    parser.add_argument('corpuspath', type=str, metavar='CORPUSPATH',
+                        help=f'Path to the directory containing the corpus (documents must be named '
+                             f'<author>_<texname>.txt)')
+    parser.add_argument('positive', type=str, nargs='?', default="Dante", metavar='AUTHOR',  # '?' enables default
+                        help= f'Positive author for the hypothesis (default "Dante"); set to "ALL" to check '
+                              f'every author')
+    parser.add_argument('--loo', default=False, action='store_true',
+                        help='submit each binary classifier to leave-one-out validation')
+    parser.add_argument('--unknown', type=str, metavar='PATH', default=None,
+                        help='path to the file of unknown paternity (default None)')
+    parser.add_argument('--log', type=str, metavar='PATH', default='./results.txt',
+                        help='path to the log file where to write the results (default ./results.txt)')
+    parser.add_argument('--C', type=float, metavar='C', default=None,
+                        help='set the parameter C (trade off between error and margin) or leave as None to optimize')
+
+    args = parser.parse_args()
+
+    assert args.unknown or args.loo, 'error: nor an unknown document, nor LOO have been requested. Nothing to do.'
+    assert os.path.exists(args.corpuspath), f'corpus path {args.corpuspath} does not exist'  # before it is used
+    assert args.unknown is None or os.path.exists(args.unknown), '"unknown file" does not exist'
+
+    if args.positive == 'ALL':
+        args.authors = list_authors(args.corpuspath, skip_prefix='Epistola')
+    else:
+        if (args.positive not in AUTHORS_CORPUS_I) and (args.positive not in AUTHORS_CORPUS_II):
+            print(f'warning: author {args.positive} is not in the known list of authors for CORPUS I nor CORPUS II')
+        assert args.positive in list_authors(args.corpuspath, skip_prefix='Epistola'), 'unexpected author'
+        args.authors = [args.positive]
+
+    main()
+
diff --git a/src/data/features.py b/src/data/features.py
index 8c9c4c6..f1bef2f 100755
--- a/src/data/features.py
+++ b/src/data/features.py
@@ -367,6 +367,7 @@ class FeatureExtractor:
         self.feature_names = None
         self.wngrams_vectorizer = self.wngrams_selector = None
         self.cngrams_vectorizer = self.cngrams_selector = None
+        self.feature_range = {}
 
     def fit_transform(self, positives, negatives):
         documents = positives + negatives
@@ -423,11 +424,15 @@ class FeatureExtractor:
         else:
             return TEST
 
-    def _addfeatures(self, X, F, feat_names=None):
+    def _addfeatures(self, X, F, feat_set_name, feat_names=None):
         if self.normalize_features:
             normalize(F, axis=1, copy=False)
         self._register_feature_names(feat_names)
 
+        last_col, n_cols = X.shape[1], F.shape[1]
+        self.feature_range[feat_set_name] = slice(last_col, last_col+n_cols)
+        print('adding feat-set slice ', feat_set_name, self.feature_range[feat_set_name])
+
         if issparse(F):
             return hstack((X, F))  # sparse
         else:
@@ -445,6 +450,16 @@ class FeatureExtractor:
             self.feature_names = []
         self.feature_names.extend(feat_names)
 
+    def get_feature_set(self, X, name):  # columns of X that belong to the feature set registered as `name`
+        assert name in self.feature_range, 'unknown feature set name'
+        return X[:,self.feature_range[name]]  # feature_range maps a set name to its column slice
+
+    def get_feature_set_names(self):  # names of the feature sets recorded in feature_range so far
+        return list(self.feature_range.keys())
+
+    def get_feature_names(self):  # the feature_names attribute as-is (None until names are registered)
+        return self.feature_names
+
     def _transform_parallel(self, documents, y=None, fit=False, n_jobs=-1):
         # initialize the document-by-feature vector
         X = np.empty((len(documents), 0))
@@ -501,9 +516,9 @@ class FeatureExtractor:
         for out in outs:
             taskname = out['task']
             if taskname not in {'_wngrams_task', '_cngrams_task'}:
-                X = self._addfeatures(X, out['features'], out['f_names'] if fit else None)
+                X = self._addfeatures(X, out['features'], taskname, out['f_names'] if fit else None)
             else:
-                X = self._addfeatures(_tocsr(X), out['features'], out['f_names'] if fit else None)
+                X = self._addfeatures(_tocsr(X), out['features'], taskname, out['f_names'] if fit else None)
                 if fit:
                     vectorizer, selector = out['vectorizer'], out['selector']
                     if taskname == '_wngrams_task' and self.wngrams_vectorizer is None:
diff --git a/src/model.py b/src/model.py
index 0fc17e4..45b82f0 100755
--- a/src/model.py
+++ b/src/model.py
@@ -1,30 +1,32 @@
+from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.metrics import make_scorer
 from sklearn.model_selection import GridSearchCV, LeaveOneOut, LeaveOneGroupOut, cross_val_score, StratifiedKFold
 from sklearn.linear_model import LogisticRegression
 from data.features import *
-from util.calibration import CalibratedClassifierCV
 from util.evaluation import f1, get_counters
 
 
-class AuthorshipVerificator:
+class AuthorshipVerificator(BaseEstimator):
 
-    def __init__(self, nfolds=10,
-                 params={'C': np.logspace(-4, +3, 8)},
+    def __init__(self,
+                 nfolds=10,
+                 param_grid={'C': np.logspace(-4, +3, 8)},
                  C=1.,
                  author_name=None):
         self.nfolds = nfolds
-        self.params = params
-        self.author_name = author_name if author_name else 'this author'
-        self.classifier = LogisticRegression(C=C, class_weight='balanced')
+        self.param_grid = param_grid
+        self.C = C
+        self.author_name = author_name
 
     def fit(self, X, y):
+        self.classifier = LogisticRegression(C=self.C, class_weight='balanced')
         y = np.asarray(y)
         positive_examples = y.sum()
-        if positive_examples >= self.nfolds and self.params is not None:
+        if positive_examples >= self.nfolds and self.param_grid is not None:
             print('optimizing {}'.format(self.classifier.__class__.__name__))
             folds = list(StratifiedKFold(n_splits=self.nfolds, shuffle=True, random_state=42).split(X, y))
             self.estimator = GridSearchCV(
-                self.classifier, param_grid=self.params, cv=folds, scoring=make_scorer(f1), n_jobs=-1
+                self.classifier, param_grid=self.param_grid, cv=folds, scoring=make_scorer(f1), n_jobs=-1
             )
         else:
             self.estimator = self.classifier
@@ -36,44 +38,9 @@ class AuthorshipVerificator:
             print(f'Best params: {self.estimator.best_params_} (cross-validation F1={f1_mean:.3f})')
             self.estimator = self.estimator.best_estimator_
 
-        #self.estimator = CalibratedClassifierCV(base_estimator=self.estimator, cv=self.nfolds, ensemble=False)
-        #self.estimator.fit(X, y)
-
         return self
 
-    def leave_one_out(self, X, y, files, groups=None, test_lowest_index_only=True, counters=False):
-        if groups is None:
-            print('Computing LOO without groups')
-            folds = list(LeaveOneOut().split(X, y))
-        else:
-            print('Computing LOO with groups')
-            logo = LeaveOneGroupOut()
-            folds = list(logo.split(X, y, groups))
-            if test_lowest_index_only:
-                print('ignoring fragments')
-                folds = [(train, np.min(test, keepdims=True)) for train, test in folds]
-
-        scores = cross_val_score(self.estimator, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
-        missclassified = files[scores == 0].tolist()
-        #if hasattr(self.estimator, 'predict_proba') and len(missclassified) > 0:
-        #    missclassified_prob = self.estimator.predict_proba(csr_matrix(X)[scores == 0])[:, 1]
-        #    missclassified_prob = missclassified_prob.flatten().tolist()
-        #    missclassified = [f'{file} Pr={prob:.3f}' for file, prob in zip(missclassified,missclassified_prob)]
-        print('missclassified texts:')
-        print('\n'.join(missclassified))
-
-
-        if counters and test_lowest_index_only:
-            yfull_true = y[:len(folds)]
-            yfull_predict = np.zeros_like(yfull_true)
-            yfull_predict[scores == 1] = yfull_true[scores == 1]
-            yfull_predict[scores != 1] = 1-yfull_true[scores != 1]
-            tp, fp, fn, tn = get_counters(yfull_true, yfull_predict)
-            return scores.mean(), scores.std(), tp, fp, fn, tn
-        else:
-            return scores.mean(), scores.std()
-
-    def predict(self, test):
+    def predict_with_fragments(self, test):
         pred = self.estimator.predict(test)
         full_doc_prediction = pred[0]
         if len(pred) > 1:
@@ -82,7 +49,10 @@ class AuthorshipVerificator:
             return full_doc_prediction, fragment_predictions
         return full_doc_prediction
 
-    def predict_proba(self, test):
+    def predict(self, test):  # plain per-row prediction, delegated to the fitted estimator
+        return self.estimator.predict(test)
+
+    def predict_proba_with_fragments(self, test):
         assert hasattr(self, 'predict_proba'), 'the classifier is not calibrated'
         pred = self.estimator.predict_proba(test)
         full_doc_prediction = pred[0,1]
@@ -92,5 +62,56 @@ class AuthorshipVerificator:
             return full_doc_prediction, fragment_predictions
         return full_doc_prediction, []
 
+    def predict_proba(self, test):  # per-row probabilities, delegated to the fitted estimator
+        assert hasattr(self.estimator, 'predict_proba'), 'the classifier is not calibrated'  # check estimator, not self
+        return self.estimator.predict_proba(test)
 
 
+def leave_one_out(model, X, y, files, groups=None, test_lowest_index_only=True, counters=False):
+    """Cross-validate `model` with F1, leaving out one row per fold (or one group per fold if `groups` is given).
+
+    Returns (mean, std) of the fold scores; with `counters` and `test_lowest_index_only` also tp, fp, fn, tn.
+    """
+    y, files = np.asarray(y), np.asarray(files)  # the boolean-mask indexing below needs arrays, not lists
+    if groups is None:
+        print(f'Computing LOO without groups over {X.shape[0]} documents')
+        folds = list(LeaveOneOut().split(X, y))
+    else:
+        print(f'Computing LOO with groups over {X.shape[0]} documents')
+        folds = list(LeaveOneGroupOut().split(X, y, groups))
+        if test_lowest_index_only:
+            print('ignoring fragments')
+            folds = [(train, np.min(test, keepdims=True)) for train, test in folds]  # test on lowest index only
+
+    print(f'optimizing via grid search each o the {len(folds)} prediction problems')
+    scores = cross_val_score(model, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1, verbose=10)
+    missclassified = files[scores == 0].tolist()
+    print('missclassified texts:')
+    print('\n'.join(missclassified))
+
+    if counters and test_lowest_index_only:
+        yfull_true = y[:len(folds)]  # assumes the first len(folds) labels are the tested rows - TODO confirm
+        yfull_predict = np.zeros_like(yfull_true)
+        yfull_predict[scores == 1] = yfull_true[scores == 1]  # a fold scoring 1 is taken as a correct prediction
+        yfull_predict[scores != 1] = 1-yfull_true[scores != 1]  # any other score as the flipped label
+        tp, fp, fn, tn = get_counters(yfull_true, yfull_predict)
+        return scores.mean(), scores.std(), tp, fp, fn, tn
+    else:
+        return scores.mean(), scores.std()
+
+
+class RangeFeatureSelector(BaseEstimator, TransformerMixin):
+    """Chi2-select a `feat_sel_ratio` fraction of the columns inside `range`; other columns pass through."""
+    def __init__(self, range: slice, feat_sel_ratio: float):
+        self.range = range
+        self.feat_sel_ratio = feat_sel_ratio
+
+    def fit(self, X, y):
+        width = self.range.stop - self.range.start
+        self.selector = SelectKBest(chi2, k=int(self.feat_sel_ratio * width))
+        self.selector.fit(X[:, self.range], y)
+        return self
+
+    def transform(self, X):
+        selected = self.selector.transform(X[:, self.range])
+        return csr_matrix(hstack([X[:, :self.range.start], selected, X[:, self.range.stop:]]))