From 843cfbe8fe646ceac7042a34252696e814ddedb2 Mon Sep 17 00:00:00 2001
From: Alex Moreo
Date: Fri, 3 Apr 2020 11:21:09 +0200
Subject: [PATCH] feature extraction fully parallelized; result log file added; cleaning

---
 src/author_identification.py       |  34 ++++--
 src/data/features.py               | 163 +++++++++--------------------
 src/model.py                       |  37 +++----
 src/util/_hide_sklearn_warnings.py |   4 +
 4 files changed, 89 insertions(+), 149 deletions(-)
 create mode 100644 src/util/_hide_sklearn_warnings.py

diff --git a/src/author_identification.py b/src/author_identification.py
index fed7377..5133808 100755
--- a/src/author_identification.py
+++ b/src/author_identification.py
@@ -1,3 +1,4 @@
+import util._hide_sklearn_warnings
 from sklearn.linear_model import LogisticRegression
 from data.dante_loader import load_latin_corpus, list_authors
 from data.features import *
@@ -14,6 +15,7 @@ AUTHORS_CORPUS_II = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'Boncompagn
 
 
 def main():
+    log = open(args.log, 'wt')
     discarded = 0
     f1_scores = []
     counters = []
@@ -30,6 +32,7 @@
         files = np.asarray(pos_files + neg_files)
         if len(positive) < 2:
             discarded += 1
+            print(f'discarding analysis for {author} which has only {len(positive)} documents')
             continue
 
         n_full_docs = len(positive) + len(negative)
@@ -53,13 +56,14 @@
         Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
 
         print('Fitting the Verificator')
-        av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
-        av.fit(Xtr, ytr, groups)
+        av = AuthorshipVerificator(nfolds=10)
+        av.fit(Xtr, ytr)
 
         if args.unknown:
            print(f'Checking for the hypothesis that {author} was the author of {args.unknown}')
            ep, ep_fragments = feature_extractor.transform(ep_text, return_fragments=True, window_size=3)
-           av.predict_proba(ep, args.unknown)
+           pred, _ = av.predict_proba(ep)
+           tee(f'{args.unknown}: Posterior probability for {author} is {pred:.3f}', log)
 
         if args.loo:
            print('Validating the Verificator (Leave-One-Out)')
@@ -68,7 +72,7 @@
            )
            f1_scores.append(f1_from_counters(tp, fp, fn, tn))
            counters.append((tp, fp, fn, tn))
-           print(f'F1 for {author} = {f1_scores[-1]:.3f}')
+           tee(f'F1 for {author} = {f1_scores[-1]:.3f}', log)
 
     if args.loo:
         print(f'Computing macro- and micro-averages (discarded {discarded}/{len(args.authors)})')
@@ -78,26 +82,35 @@
         macro_f1 = f1_scores.mean()
         micro_f1 = f1_from_counters(*counters.sum(axis=0).tolist())
 
-        print(f'Macro-F1 = {macro_f1:.3f}')
-        print(f'Micro-F1 = {micro_f1:.3f}')
+        tee(f'LOO Macro-F1 = {macro_f1:.3f}', log)
+        tee(f'LOO Micro-F1 = {micro_f1:.3f}', log)
         print()
+    log.close()
+
+def tee(msg, log):
+    print(msg)
+    log.write(f'{msg}\n')
+    log.flush()
+
 
 if __name__ == '__main__':
     import os
 
     # Training settings
     parser = argparse.ArgumentParser(description='Authorship verification for Epistola XIII')
-    parser.add_argument('corpuspath', type=str, metavar='PATH',
+    parser.add_argument('corpuspath', type=str, metavar='CORPUSPATH',
                         help=f'Path to the directory containing the corpus (documents must be named '
-                             f'_.txt')
+                             f'_.txt)')
-    parser.add_argument('positive', type=str, default="Dante",
+    parser.add_argument('positive', type=str, default="Dante", metavar='AUTHOR',
                         help= f'Positive author for the hypothesis (default "Dante"); set to "ALL" to check '
                               f'every author')
     parser.add_argument('--loo', default=False, action='store_true',
                         help='submit each binary classifier to leave-one-out validation')
     parser.add_argument('--unknown', type=str, metavar='PATH', default=None,
                         help='path to the file of unknown paternity (default None)')
+    parser.add_argument('--log', type=str, metavar='PATH', default='./results.txt',
+                        help='path to the log file where to write the results (default ./results.txt)')
 
     args = parser.parse_args()
 
@@ -110,6 +123,7 @@ if __name__ == '__main__':
         args.authors = [args.positive]
 
     assert args.unknown or args.loo, 'error: nor an unknown document, nor LOO have been requested. Nothing to do.'
-    assert args.unknown is None or os.path.exists(args.unknown), 'unknown file does not exist'
+    assert os.path.exists(args.corpuspath), f'corpus path {args.corpuspath} does not exist'
+    assert args.unknown is None or os.path.exists(args.unknown), '"unknown file" does not exist'
 
     main()
diff --git a/src/data/features.py b/src/data/features.py
index c9b25ed..8c9c4c6 100755
--- a/src/data/features.py
+++ b/src/data/features.py
@@ -6,7 +6,7 @@ from sklearn.feature_selection import chi2
 from sklearn.preprocessing import normalize
 from scipy.sparse import hstack, csr_matrix, issparse
 from nltk.corpus import stopwords
-from sklearn.externals.joblib import Parallel, delayed
+from joblib import Parallel, delayed
 
 
 latin_function_words = ['et', 'in', 'de', 'ad', 'non', 'ut', 'cum', 'per', 'a', 'sed', 'que', 'quia', 'ex', 'sic',
@@ -119,7 +119,7 @@ def _features_function_words_freq(documents, lang):
     """
     Extract features as the frequency (L1x1000) of the function words used in the documents
     :param documents: a list where each element is the text (string) of a document
-    :return: a np.array of shape (D,F) where D is len(documents) and F is len(function_words)
+    :return: a dictionary containing the resulting features, feature names, and taskname
     """
     features = []
     function_words = get_function_words(lang)
@@ -134,7 +134,7 @@
     f_names = [f'funcw::{f}' for f in function_words]
     F = np.array(features)
     print(f'task function words (#features={F.shape[1]}) [Done]')
-    return F, f_names
+    return {'features': F, 'f_names': f_names, 'task': 'functionwords'}
 
 
 def _features_conjugations_freq(documents, lang):
     """
     Extract features as the frequency (L1x1000) of the conjugations used in the documents.
     The method is heuristic, and actually searches for suffixes contained in the conjugation list.
     :param documents: a list where each element is the text (string) of a document
-    :return: a np.array of shape (D,F) where D is len(documents) and F is len(conjugations)
+    :return: a dictionary containing the resulting features, feature names, and taskname
     """
     features = []
     conjugations = get_conjugations(lang)
@@ -162,7 +162,7 @@
     f_names = [f'conj::{f}' for f in conjugations]
     F = np.array(features)
     print(f'task conjugation features (#features={F.shape[1]}) [Done]')
-    return F, f_names
+    return {'features': F, 'f_names': f_names, 'task': 'conjugations'}
 
 
 def _features_Mendenhall(documents, upto=23):
     """
     Extract features as the frequency (L1x1000) of the words' lengths used in the documents,
     following the idea behind Mendenhall's Characteristic Curve of Composition
     :param documents: a list where each element is the text (string) of a document
-    :return: a np.array of shape (D,F) where D is len(documents) and F is len(range of lengths considered)
+    :return: a dictionary containing the resulting features, feature names, and taskname
     """
     features = []
     for text in documents:
@@ -185,7 +185,7 @@
     f_names = [f'mendenhall::{c}' for c in range(1,upto)]
     F = np.array(features)
     print(f'task Mendenhall features (#features={F.shape[1]}) [Done]')
-    return F, f_names
+    return {'features': F, 'f_names': f_names, 'task': 'Mendenhall'}
 
 
 def _features_sentenceLengths(documents, downto=3, upto=70):
@@ -194,7 +194,7 @@
     :param documents: a list where each element is the text (string) of a document
     :param downto: minimal length considered
     :param upto: maximum length considered
-    :return: a np.array of shape (D,F) where D is len(documents) and F is len(range of lengths considered)
+    :return: a dictionary containing the resulting features, feature names, and taskname
     """
     features = []
     for text in documents:
@@ -212,15 +212,14 @@
     f_names = [f'sentlength::{c}' for c in range(downto, upto)]
     F = np.array(features)
     print(f'task sentence lengths (#features={F.shape[1]}) [Done]')
-    return F, f_names
+    return {'features': F, 'f_names': f_names, 'task': 'sentlength'}
 
 
 def _features_word_ngrams(documents, vectorizer=None, selector=None, y=None, feat_sel_ratio=1., min_df=1, ngrams=(1, 1)):
     """
     Extract features as tfidf matrix extracted from the documents
     :param documents: a list where each element is the text (string) of a document
-    :return: a tuple M,V, where M is an np.array of shape (D,F), with D being the len(documents) and F the number of
-        distinct words; and V is the TfidfVectorizer already fit
+    :return: a dictionary containing the resulting features, feature names, taskname, the vectorizer and the selector
     """
     if vectorizer is None:
         vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=min_df, ngram_range=ngrams)
@@ -238,7 +237,13 @@
        f_names = [f_names[i] for i in selector.get_support(indices=True)]
     print(f'task ngrams and feature selection (#features={features.shape[1]}) [Done]')
-    return features, f_names, vectorizer, selector
+    return {
+        'features': features,
+        'f_names': f_names,
+        'task': '_wngrams_task',
+        'vectorizer': vectorizer,
+        'selector': selector
+    }
 
 
 def _features_char_ngrams(documents, vectorizer=None, selector=None, y=None, feat_sel_ratio=1.,
                           min_df=10, preserve_punctuation=True, ngrams=[4, 5]):
@@ -253,24 +258,27 @@ def _features_char_ngrams(documents, vectorizer=None, selector=None, y=None, fea
     :param vectorizer: the tfidf_vectorizer to use if already fit; if None, a new one will be instantiated and fit
     :param min_df: minumum number of occurrences needed for the ngram to be taken
     :param preserve_punctuation: whether or not to preserve punctuation marks
-    :return: see _features_tfidf
+    :return: a dictionary containing the resulting features, feature names, taskname, the vectorizer and the selector
     """
     doc_ngrams = ngrams_extractor(documents, ngrams, preserve_punctuation)
-    return _features_word_ngrams(
+    outs = _features_word_ngrams(
         doc_ngrams, vectorizer=vectorizer, selector=selector, y=y, feat_sel_ratio=feat_sel_ratio, min_df=min_df
     )
+    outs['task'] = '_cngrams_task'
+    return outs
 
 
 def ngrams_extractor(documents, ns=[4, 5], preserve_punctuation=True):
-    if not isinstance(ns, list): ns=[ns]
+    if not isinstance(ns, list):
+        ns = [ns]
     ns = sorted(np.unique(ns).tolist())
 
     list_ngrams = []
     for doc in documents:
-        if preserve_punctuation == False:
+        if not preserve_punctuation:
             doc = ' '.join(tokenize(doc))
 
         doc_ngrams = []
         for ni in ns:
@@ -287,7 +295,7 @@ def _feature_selection(X, y, tfidf_feat_selection_ratio):
     :param X: a document by (sparse) features matrix
     :param y: the supervised ndarray containing the class labels
     :param tfidf_feat_selection_ratio: a proportion of features to be taken
-    :return: the reduced matrix and the feature selector fit
+    :return: the fitted feature selector
     """
     nF = X.shape[1]
     num_feats = int(tfidf_feat_selection_ratio * nF)
@@ -321,7 +329,7 @@ class FeatureExtractor:
                  window_size=5,
                  verbose=True):
         """
-        Applies stlystic feature extraction. Features include:
+        Applies stylistic feature extraction. Features include:
         :param function_words_freq: add the frequency of function words as features
         :param conjugations_freq: add the frequency of regular conjugations as features
         :param features_Mendenhall: add the frequencies of the words' lengths as features
@@ -437,113 +445,31 @@ class FeatureExtractor:
                 self.feature_names = []
             self.feature_names.extend(feat_names)
 
 
-    def _transform(self, documents, y=None, fit=False):
-        # initialize the document-by-feature vector
-        X = np.empty((len(documents), 0))
-
-        # dense feature extraction functions
-        if self.function_words_freq:
-            F, f_names = _features_function_words_freq(documents, self.function_words_freq)
-            X = self._addfeatures(X, F, f_names if fit else None)
-            self._print(f'adding function words features: {X.shape[1]} features')
-
-        if self.conjugations_freq:
-            F, f_names = _features_conjugations_freq(documents, self.conjugations_freq)
-            X = self._addfeatures(X, F, f_names if fit else None)
-            self._print(f'adding conjugation features: {X.shape[1]} features')
-
-        if self.features_Mendenhall:
-            F, f_names = _features_Mendenhall(documents)
-            X = self._addfeatures(X, F, f_names if fit else None)
-            self._print(f'adding Mendenhall words features: {X.shape[1]} features')
-
-        if self.features_sentenceLengths:
-            F, f_names = _features_sentenceLengths(documents)
-            X = self._addfeatures(X, F, f_names if fit else None)
-            self._print(f'adding sentence lengths features: {X.shape[1]} features')
-
-        # sparse feature extraction functions
-        if self.wngrams:
-            if fit:
-                X_features, self.wngrams_vectorizer = _features_word_ngrams(documents, ngrams=self.wngrams_range)
-                index2word = {i: w for w, i in self.wngrams_vectorizer.vocabulary_.items()}
-                f_names = [f'tfidf::{index2word[i]}' for i in range(len(index2word))]
-            else:
-                X_features, _ = _features_word_ngrams(documents, self.wngrams_vectorizer)
-                f_names = None
-
-            if self.feature_selection_ratio < 1.:
-                if self.verbose: print('feature selection')
-                if fit:
-                    X_features, self.feat_sel_tfidf = _feature_selection(X_features, y, self.feature_selection_ratio)
-                    f_names = [f_names[i] for i in self.feat_sel_tfidf.get_support(indices=True)]
-                else:
-                    X_features = self.feat_sel_tfidf.transform(X_features)
-            X = self._addfeatures(_tocsr(X), X_features, f_names)
-            self._print(f'adding tfidf words features: {X.shape[1]} features')
-
-        if self.cngrams:
-            if fit:
-                X_features, self.cngrams_vectorizer = _features_char_ngrams(
-                    documents, self.cngrams_range, preserve_punctuation=self.preserve_punctuation
-                )
-                index2word = {i: w for w, i in self.cngrams_vectorizer.vocabulary_.items()}
-                f_names = [f'ngram::{index2word[i]}' for i in range(len(index2word))]
-            else:
-                X_features, _ = _features_char_ngrams(
-                    documents, self.cngrams_range, vectorizer=self.cngrams_vectorizer,
-                    preserve_punctuation=self.preserve_punctuation
-                )
-                f_names = None
-
-            if self.feature_selection_ratio < 1.:
-                if self.verbose: print('feature selection')
-                if fit:
-                    X_features, self.cngrams_selector = _feature_selection(X_features, y, self.feature_selection_ratio)
-                    f_names = [f_names[i] for i in self.cngrams_selector.get_support(indices=True)]
-                else:
-                    X_features = self.cngrams_selector.transform(X_features)
-
-            X = self._addfeatures(_tocsr(X), X_features, f_names)
-            self._print(f'adding ngrams character features: {X.shape[1]} features')
-
-        if fit:
-            self.feature_names = np.asarray(self.feature_names)
-
-        self._print(f'X shape (#documents,#features): {X.shape}')
-
-        return X
-
     def _transform_parallel(self, documents, y=None, fit=False, n_jobs=-1):
         # initialize the document-by-feature vector
         X = np.empty((len(documents), 0))
 
         tasks = []
+        # dense feature extraction functions
         if self.function_words_freq:
-            tasks.append((_features_function_words_freq, (documents, self.function_words_freq)))
+            tasks.append((_features_function_words_freq, {'documents': documents, 'lang': self.function_words_freq}))
 
         if self.conjugations_freq:
-            tasks.append((_features_conjugations_freq, (documents, self.conjugations_freq)))
+            tasks.append((_features_conjugations_freq, {'documents': documents, 'lang': self.conjugations_freq}))
 
         if self.features_Mendenhall:
-            tasks.append((_features_Mendenhall, (documents, 23)))
+            tasks.append((_features_Mendenhall, {'documents': documents, 'upto': 23}))
 
         if self.features_sentenceLengths:
-            tasks.append((_features_sentenceLengths, (documents, 3, 70)))
-
-        self._print('extracting dense features in parallel')
-        outs = Parallel(n_jobs=n_jobs)(delayed(task)(*params) for task, params in tasks)
-        for F, feat_names in outs:
-            X = self._addfeatures(X, F, feat_names if fit else None)
+            tasks.append((_features_sentenceLengths, {'documents': documents, 'downto': 3, 'upto': 70}))
 
         # sparse feature extraction functions
-        tasks = []
         if self.wngrams:
             if not fit and self.wngrams_vectorizer is None:
                 raise ValueError('transform called before fit')
 
-            params={
+            params = {
                 'documents': documents,
                 'vectorizer': self.wngrams_vectorizer,
                 'selector': self.wngrams_selector,
@@ -557,7 +483,7 @@ class FeatureExtractor:
             if not fit and self.cngrams_vectorizer is None:
                 raise ValueError('transform called before fit')
 
-            params={
+            params = {
                 'documents': documents,
                 'vectorizer': self.cngrams_vectorizer,
                 'selector': self.cngrams_selector,
@@ -568,15 +494,22 @@ class FeatureExtractor:
             }
             tasks.append((_features_char_ngrams, params))
 
-        self._print('extracting sparse features in parallel')
+        self._print('extracting features in parallel')
         outs = Parallel(n_jobs=n_jobs)(delayed(task)(**params) for task, params in tasks)
-        for F, feat_names, vectorizer, selector in outs:
-            X = self._addfeatures(_tocsr(X), F, feat_names if fit else None)
-            if fit:
-                if self.wngrams and self.wngrams_vectorizer is None:
-                    self.wngrams_vectorizer, self.wngrams_selector = vectorizer, selector
-                elif self.cngrams and self.cngrams_vectorizer is None:
-                    self.cngrams_vectorizer, self.cngrams_selector = vectorizer, selector
+
+        # gather the tasks' outputs
+        for out in outs:
+            taskname = out['task']
+            if taskname not in {'_wngrams_task', '_cngrams_task'}:
+                X = self._addfeatures(X, out['features'], out['f_names'] if fit else None)
+            else:
+                X = self._addfeatures(_tocsr(X), out['features'], out['f_names'] if fit else None)
+                if fit:
+                    vectorizer, selector = out['vectorizer'], out['selector']
+                    if taskname == '_wngrams_task' and self.wngrams_vectorizer is None:
+                        self.wngrams_vectorizer, self.wngrams_selector = vectorizer, selector
+                    elif taskname == '_cngrams_task' and self.cngrams_vectorizer is None:
+                        self.cngrams_vectorizer, self.cngrams_selector = vectorizer, selector
 
         if fit:
             self.feature_names = np.asarray(self.feature_names)
diff --git a/src/model.py b/src/model.py
index 54bfb94..137fbbe 100755
--- a/src/model.py
+++ b/src/model.py
@@ -1,6 +1,6 @@
 from sklearn.metrics import make_scorer
 from sklearn.model_selection import GridSearchCV, LeaveOneOut, LeaveOneGroupOut, cross_val_score, StratifiedKFold
-from sklearn.linear_model import LogisticRegression
+from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
 from sklearn.svm import *
 from data.features import *
 from util.evaluation import f1, get_counters
@@ -10,29 +10,21 @@ class AuthorshipVerificator:
 
     def __init__(self, nfolds=10,
                  params={'C': np.logspace(-4, +4, 9), 'class_weight': ['balanced', None]},
-                 estimator=SVC,
                  author_name=None):
         self.nfolds = nfolds
         self.params = params
         self.author_name = author_name if author_name else 'this author'
-        if estimator is SVC:
-            self.params['kernel'] = ['linear', 'rbf']
-            self.probability = True
-            self.classifier = estimator(probability=self.probability)
-        elif estimator is LinearSVC:
-            self.probability = False
-            self.classifier = estimator()
-        elif estimator is LogisticRegression:
-            self.probability = True
-            self.classifier = LogisticRegression()
+        self.classifier = LogisticRegression()
 
-    def fit(self,X,y,groups=None):
-        if not isinstance(y,np.ndarray): y=np.array(y)
+    def fit(self, X, y):
+        y = np.asarray(y)
         positive_examples = y.sum()
         if positive_examples >= self.nfolds:
             print('optimizing {}'.format(self.classifier.__class__.__name__))
             folds = list(StratifiedKFold(n_splits=self.nfolds).split(X, y))
-            self.estimator = GridSearchCV(self.classifier, param_grid=self.params, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
+            self.estimator = GridSearchCV(
+                self.classifier, param_grid=self.params, cv=folds, scoring=make_scorer(f1), n_jobs=-1
+            )
         else:
             self.estimator = self.classifier
@@ -46,7 +38,6 @@ class AuthorshipVerificator:
         return self
 
     def leave_one_out(self, X, y, files, groups=None, test_lowest_index_only=True, counters=False):
-
         if groups is None:
             print('Computing LOO without groups')
             folds = list(LeaveOneOut().split(X, y))
@@ -59,8 +50,8 @@ class AuthorshipVerificator:
             folds = [(train, np.min(test, keepdims=True)) for train, test in folds]
 
         scores = cross_val_score(self.estimator, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
-        missclassified = '\n'.join(files[scores==0].tolist())
-        print(scores)
+        missclassified = '\n'.join(files[scores == 0].tolist())
+        print('misclassified texts:')
         print(missclassified)
 
         if counters and test_lowest_index_only:
@@ -73,26 +64,24 @@ class AuthorshipVerificator:
         else:
             return scores.mean(), scores.std()
 
-    def predict(self, test, epistola_name=''):
+    def predict(self, test):
         pred = self.estimator.predict(test)
         full_doc_prediction = pred[0]
-        print('{} is from the same author: {}'.format(epistola_name, 'Yes' if full_doc_prediction == 1 else 'No'))
         if len(pred) > 1:
             fragment_predictions = pred[1:]
             print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
             return full_doc_prediction, fragment_predictions
-        return full_doc_prediction, None
+        return full_doc_prediction
 
-    def predict_proba(self, test, epistola_name=''):
+    def predict_proba(self, test):
         assert hasattr(self, 'predict_proba'), 'the classifier is not calibrated'
         pred = self.estimator.predict_proba(test)
         full_doc_prediction = pred[0,1]
-        print(f'{epistola_name} is from {self.author_name} with Probability {full_doc_prediction:.3f}')
         if len(pred) > 1:
             fragment_predictions = pred[1:,1]
             print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
             return full_doc_prediction, fragment_predictions
-        return full_doc_prediction, None
+        return full_doc_prediction, []
diff --git a/src/util/_hide_sklearn_warnings.py b/src/util/_hide_sklearn_warnings.py
new file mode 100644
index 0000000..02fb2de
--- /dev/null
+++ b/src/util/_hide_sklearn_warnings.py
@@ -0,0 +1,4 @@
+def warn(*args, **kwargs):
+    pass
+import warnings
+warnings.warn = warn
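
Note (not part of the patch): the refactored _transform_parallel relies on a single joblib pattern, in which every feature-extraction task is submitted with keyword arguments and returns a dictionary carrying its features, feature names and a task identifier, so the outputs can be gathered uniformly after the parallel run. Below is a minimal, self-contained sketch of that pattern under those assumptions; the two toy task functions (_task_doc_lengths, _task_token_counts) are illustrative stand-ins, not functions from this repository.

import numpy as np
from joblib import Parallel, delayed

def _task_doc_lengths(documents):
    # toy feature: document length in characters
    F = np.array([[len(d)] for d in documents], dtype=float)
    return {'features': F, 'f_names': ['doclen'], 'task': 'doclen'}

def _task_token_counts(documents):
    # toy feature: number of whitespace-separated tokens
    F = np.array([[len(d.split())] for d in documents], dtype=float)
    return {'features': F, 'f_names': ['ntokens'], 'task': 'ntokens'}

docs = ['et in de ad non', 'ut cum per a sed que quia']
tasks = [(_task_doc_lengths, {'documents': docs}),
         (_task_token_counts, {'documents': docs})]

# run every task in parallel, then gather the returned dictionaries
outs = Parallel(n_jobs=-1)(delayed(task)(**params) for task, params in tasks)
X = np.hstack([out['features'] for out in outs])
names = [name for out in outs for name in out['f_names']]
print(X.shape, names)   # (2, 2) ['doclen', 'ntokens']

With the patch applied, a run that exercises the new --log option would look like: python src/author_identification.py /path/to/corpus ALL --loo --log ./results.txt (the corpus path is illustrative; the directory must follow the naming scheme described in the script's help text).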