feature extraction fully parallelized; result log file added; cleaning
This commit is contained in:
parent a3893c77fe
commit 843cfbe8fe
@@ -1,3 +1,4 @@
+import util._hide_sklearn_warnings
 from sklearn.linear_model import LogisticRegression
 from data.dante_loader import load_latin_corpus, list_authors
 from data.features import *

@@ -14,6 +15,7 @@ AUTHORS_CORPUS_II = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'Boncompagn

 def main():
+log = open(args.log, 'wt')
 discarded = 0
 f1_scores = []
 counters = []

@@ -30,6 +32,7 @@ def main():
 files = np.asarray(pos_files + neg_files)
 if len(positive) < 2:
 discarded += 1
+print(f'discarding analysis for {author} which has only {len(positive)} documents')
 continue

 n_full_docs = len(positive) + len(negative)

@@ -53,13 +56,14 @@ def main():
 Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)

 print('Fitting the Verificator')
-av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
-av.fit(Xtr, ytr, groups)
+av = AuthorshipVerificator(nfolds=10)
+av.fit(Xtr, ytr)

 if args.unknown:
 print(f'Checking for the hypothesis that {author} was the author of {args.unknown}')
 ep, ep_fragments = feature_extractor.transform(ep_text, return_fragments=True, window_size=3)
-av.predict_proba(ep, args.unknown)
+pred, _ = av.predict_proba(ep)
+tee(f'{args.unknown}: Posterior probability for {author} is {pred:.3f}', log)

 if args.loo:
 print('Validating the Verificator (Leave-One-Out)')

@@ -68,7 +72,7 @@ def main():
 )
 f1_scores.append(f1_from_counters(tp, fp, fn, tn))
 counters.append((tp, fp, fn, tn))
-print(f'F1 for {author} = {f1_scores[-1]:.3f}')
+tee(f'F1 for {author} = {f1_scores[-1]:.3f}', log)

 if args.loo:
 print(f'Computing macro- and micro-averages (discarded {discarded}/{len(args.authors)})')

@@ -78,26 +82,35 @@ def main():
 macro_f1 = f1_scores.mean()
 micro_f1 = f1_from_counters(*counters.sum(axis=0).tolist())

-print(f'Macro-F1 = {macro_f1:.3f}')
-print(f'Micro-F1 = {micro_f1:.3f}')
+tee(f'LOO Macro-F1 = {macro_f1:.3f}', log)
+tee(f'LOO Micro-F1 = {micro_f1:.3f}', log)
 print()

+log.close()

+def tee(msg, log):
+print(msg)
+log.write(f'{msg}\n')
+log.flush()


 if __name__ == '__main__':
 import os

 # Training settings
 parser = argparse.ArgumentParser(description='Authorship verification for Epistola XIII')
-parser.add_argument('corpuspath', type=str, metavar='PATH',
+parser.add_argument('corpuspath', type=str, metavar='CORPUSPATH',
 help=f'Path to the directory containing the corpus (documents must be named '
-f'<author>_<texname>.txt')
+f'<author>_<texname>.txt)')
-parser.add_argument('positive', type=str, default="Dante",
+parser.add_argument('positive', type=str, default="Dante", metavar='AUTHOR',
 help= f'Positive author for the hypothesis (default "Dante"); set to "ALL" to check '
 f'every author')
 parser.add_argument('--loo', default=False, action='store_true',
 help='submit each binary classifier to leave-one-out validation')
 parser.add_argument('--unknown', type=str, metavar='PATH', default=None,
 help='path to the file of unknown paternity (default None)')
+parser.add_argument('--log', type=str, metavar='PATH', default='./results.txt',
+help='path to the log file where to write the results (default ./results.txt)')

 args = parser.parse_args()


@@ -110,6 +123,7 @@ if __name__ == '__main__':
 args.authors = [args.positive]

 assert args.unknown or args.loo, 'error: nor an unknown document, nor LOO have been requested. Nothing to do.'
-assert args.unknown is None or os.path.exists(args.unknown), 'unknown file does not exist'
+assert os.path.exists(args.corpuspath), f'corpus path {args.corpuspath} does not exist'
+assert args.unknown is None or os.path.exists(args.unknown), '"unknown file" does not exist'

 main()
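For reference, a minimal standalone sketch of the tee-style logging the hunks above introduce, where every result line goes both to the console and to the results file. It uses only the standard library; the message text is illustrative, and ./results.txt simply mirrors the --log default.

def tee(msg, log):
    print(msg)             # echo to the console
    log.write(f'{msg}\n')  # persist the same line to the results file
    log.flush()

if __name__ == '__main__':
    with open('./results.txt', 'wt') as log:
        tee('example result line', log)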
@@ -6,7 +6,7 @@ from sklearn.feature_selection import chi2
 from sklearn.preprocessing import normalize
 from scipy.sparse import hstack, csr_matrix, issparse
 from nltk.corpus import stopwords
-from sklearn.externals.joblib import Parallel, delayed
+from joblib import Parallel, delayed


 latin_function_words = ['et', 'in', 'de', 'ad', 'non', 'ut', 'cum', 'per', 'a', 'sed', 'que', 'quia', 'ex', 'sic',
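For reference, a minimal runnable sketch of the joblib pattern the parallel extractor below relies on: each task is a (function, kwargs-dict) pair, Parallel/delayed fans the calls out over the available cores, and each function returns a dict tagged with a 'task' key so the caller can tell the outputs apart. The toy functions word_count and char_count are illustrative stand-ins, not functions from this repository.

from joblib import Parallel, delayed

def word_count(documents):
    return {'task': 'words', 'features': [len(d.split()) for d in documents]}

def char_count(documents):
    return {'task': 'chars', 'features': [len(d) for d in documents]}

docs = ['vere dignum et iustum est', 'sicut in celo et in terra']
tasks = [(word_count, {'documents': docs}), (char_count, {'documents': docs})]
# one Parallel call runs every task; **params mirrors the kwargs-style dispatch used below
outs = Parallel(n_jobs=-1)(delayed(task)(**params) for task, params in tasks)
for out in outs:
    print(out['task'], out['features'])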
@@ -119,7 +119,7 @@ def _features_function_words_freq(documents, lang):
 """
 Extract features as the frequency (L1x1000) of the function words used in the documents
 :param documents: a list where each element is the text (string) of a document
-:return: a np.array of shape (D,F) where D is len(documents) and F is len(function_words)
+:return: a dictionary containing the resulting features, feature names, and taskname
 """
 features = []
 function_words = get_function_words(lang)

@@ -134,7 +134,7 @@ def _features_function_words_freq(documents, lang):
 f_names = [f'funcw::{f}' for f in function_words]
 F = np.array(features)
 print(f'task function words (#features={F.shape[1]}) [Done]')
-return F, f_names
+return {'features': F, 'f_names':f_names, 'task': 'functionwords'}


 def _features_conjugations_freq(documents, lang):

@@ -142,7 +142,7 @@ def _features_conjugations_freq(documents, lang):
 Extract features as the frequency (L1x1000) of the conjugations used in the documents. The method is heuristic, and
 actually searches for suffixes contained in the conjugation list.
 :param documents: a list where each element is the text (string) of a document
-:return: a np.array of shape (D,F) where D is len(documents) and F is len(conjugations)
+:return: a dictionary containing the resulting features, feature names, and taskname
 """
 features = []
 conjugations = get_conjugations(lang)

@@ -162,7 +162,7 @@ def _features_conjugations_freq(documents, lang):
 f_names = [f'conj::{f}' for f in conjugations]
 F = np.array(features)
 print(f'task conjugation features (#features={F.shape[1]}) [Done]')
-return F, f_names
+return {'features': F, 'f_names':f_names, 'task': 'conjugations'}


 def _features_Mendenhall(documents, upto=23):

@@ -170,7 +170,7 @@ def _features_Mendenhall(documents, upto=23):
 Extract features as the frequency (L1x1000) of the words' lengths used in the documents,
 following the idea behind Mendenhall's Characteristic Curve of Composition
 :param documents: a list where each element is the text (string) of a document
-:return: a np.array of shape (D,F) where D is len(documents) and F is len(range of lengths considered)
+:return: a dictionary containing the resulting features, feature names, and taskname
 """
 features = []
 for text in documents:

@@ -185,7 +185,7 @@ def _features_Mendenhall(documents, upto=23):
 f_names = [f'mendenhall::{c}' for c in range(1,upto)]
 F = np.array(features)
 print(f'task Mendenhall features (#features={F.shape[1]}) [Done]')
-return F, f_names
+return {'features': F, 'f_names':f_names, 'task': 'Mendenhall'}


 def _features_sentenceLengths(documents, downto=3, upto=70):

@@ -194,7 +194,7 @@ def _features_sentenceLengths(documents, downto=3, upto=70):
 :param documents: a list where each element is the text (string) of a document
 :param downto: minimal length considered
 :param upto: maximum length considered
-:return: a np.array of shape (D,F) where D is len(documents) and F is len(range of lengths considered)
+:return: a dictionary containing the resulting features, feature names, and taskname
 """
 features = []
 for text in documents:

@@ -212,15 +212,14 @@ def _features_sentenceLengths(documents, downto=3, upto=70):
 f_names = [f'sentlength::{c}' for c in range(downto, upto)]
 F = np.array(features)
 print(f'task sentence lengths (#features={F.shape[1]}) [Done]')
-return F, f_names
+return {'features': F, 'f_names':f_names, 'task': 'sentlength'}


 def _features_word_ngrams(documents, vectorizer=None, selector=None, y=None, feat_sel_ratio=1., min_df=1, ngrams=(1, 1)):
 """
 Extract features as tfidf matrix extracted from the documents
 :param documents: a list where each element is the text (string) of a document
-:return: a tuple M,V, where M is an np.array of shape (D,F), with D being the len(documents) and F the number of
-distinct words; and V is the TfidfVectorizer already fit
+:return: a dictionary containing the resulting features, feature names, taskname, the vectorizer and the selector
 """
 if vectorizer is None:
 vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=min_df, ngram_range=ngrams)

@@ -238,7 +237,13 @@ def _features_word_ngrams(documents, vectorizer=None, selector=None, y=None, fea
 f_names = [f_names[i] for i in selector.get_support(indices=True)]

 print(f'task ngrams and feature selection (#features={features.shape[1]}) [Done]')
-return features, f_names, vectorizer, selector
+return {
+'features': features,
+'f_names': f_names,
+'task': '_wngrams_task',
+'vectorizer': vectorizer,
+'selector': selector
+}


 def _features_char_ngrams(documents, vectorizer=None, selector=None, y=None, feat_sel_ratio=1., min_df=10, preserve_punctuation=True, ngrams=[4, 5]):

@@ -253,24 +258,27 @@ def _features_char_ngrams(documents, vectorizer=None, selector=None, y=None, fea
 :param vectorizer: the tfidf_vectorizer to use if already fit; if None, a new one will be instantiated and fit
 :param min_df: minumum number of occurrences needed for the ngram to be taken
 :param preserve_punctuation: whether or not to preserve punctuation marks
-:return: see _features_tfidf
+:return: a dictionary containing the resulting features, feature names, taskname, the vectorizer and the selector
 """
 doc_ngrams = ngrams_extractor(documents, ngrams, preserve_punctuation)
-return _features_word_ngrams(
+outs = _features_word_ngrams(
 doc_ngrams,
 vectorizer=vectorizer,
 selector=selector, y=y, feat_sel_ratio=feat_sel_ratio,
 min_df=min_df
 )
+outs['task'] = '_cngrams_task'
+return outs


 def ngrams_extractor(documents, ns=[4, 5], preserve_punctuation=True):
-if not isinstance(ns, list): ns=[ns]
+if not isinstance(ns, list):
+ns=[ns]
 ns = sorted(np.unique(ns).tolist())

 list_ngrams = []
 for doc in documents:
-if preserve_punctuation == False:
+if not preserve_punctuation:
 doc = ' '.join(tokenize(doc))
 doc_ngrams = []
 for ni in ns:
@@ -287,7 +295,7 @@ def _feature_selection(X, y, tfidf_feat_selection_ratio):
 :param X: a document by (sparse) features matrix
 :param y: the supervised ndarray containing the class labels
 :param tfidf_feat_selection_ratio: a proportion of features to be taken
-:return: the reduced matrix and the feature selector fit
+:return: the feature selector fit
 """
 nF = X.shape[1]
 num_feats = int(tfidf_feat_selection_ratio * nF)
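A small standalone sketch of chi-square feature selection that returns only the fitted selector, as the updated docstring above describes; SelectKBest and the random toy data are illustrative assumptions, not necessarily the exact implementation used in the repository.

import numpy as np
from sklearn.feature_selection import SelectKBest, chi2

def feature_selection_sketch(X, y, feat_selection_ratio):
    num_feats = max(1, int(feat_selection_ratio * X.shape[1]))
    selector = SelectKBest(chi2, k=num_feats).fit(X, y)
    return selector  # the caller applies selector.transform(X) itself

X = np.random.rand(20, 50)       # chi2 requires non-negative feature values
y = np.array([0, 1] * 10)
selector = feature_selection_sketch(X, y, feat_selection_ratio=0.1)
print(selector.transform(X).shape)  # (20, 5)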
@@ -321,7 +329,7 @@ class FeatureExtractor:
 window_size=5,
 verbose=True):
 """
-Applies stlystic feature extraction. Features include:
+Applies stilystic feature extraction. Features include:
 :param function_words_freq: add the frequency of function words as features
 :param conjugations_freq: add the frequency of regular conjugations as features
 :param features_Mendenhall: add the frequencies of the words' lengths as features

@@ -437,113 +445,31 @@ class FeatureExtractor:
 self.feature_names = []
 self.feature_names.extend(feat_names)

-def _transform(self, documents, y=None, fit=False):
-# initialize the document-by-feature vector
-X = np.empty((len(documents), 0))

-# dense feature extraction functions
-if self.function_words_freq:
-F, f_names = _features_function_words_freq(documents, self.function_words_freq)
-X = self._addfeatures(X, F, f_names if fit else None)
-self._print(f'adding function words features: {X.shape[1]} features')

-if self.conjugations_freq:
-F, f_names = _features_conjugations_freq(documents, self.conjugations_freq)
-X = self._addfeatures(X, F, f_names if fit else None)
-self._print(f'adding conjugation features: {X.shape[1]} features')

-if self.features_Mendenhall:
-F, f_names = _features_Mendenhall(documents)
-X = self._addfeatures(X, F, f_names if fit else None)
-self._print(f'adding Mendenhall words features: {X.shape[1]} features')

-if self.features_sentenceLengths:
-F, f_names = _features_sentenceLengths(documents)
-X = self._addfeatures(X, F, f_names if fit else None)
-self._print(f'adding sentence lengths features: {X.shape[1]} features')

-# sparse feature extraction functions
-if self.wngrams:
-if fit:
-X_features, self.wngrams_vectorizer = _features_word_ngrams(documents, ngrams=self.wngrams_range)
-index2word = {i: w for w, i in self.wngrams_vectorizer.vocabulary_.items()}
-f_names = [f'tfidf::{index2word[i]}' for i in range(len(index2word))]
-else:
-X_features, _ = _features_word_ngrams(documents, self.wngrams_vectorizer)
-f_names = None

-if self.feature_selection_ratio < 1.:
-if self.verbose: print('feature selection')
-if fit:
-X_features, self.feat_sel_tfidf = _feature_selection(X_features, y, self.feature_selection_ratio)
-f_names = [f_names[i] for i in self.feat_sel_tfidf.get_support(indices=True)]
-else:
-X_features = self.feat_sel_tfidf.transform(X_features)
-X = self._addfeatures(_tocsr(X), X_features, f_names)
-self._print(f'adding tfidf words features: {X.shape[1]} features')

-if self.cngrams:
-if fit:
-X_features, self.cngrams_vectorizer = _features_char_ngrams(
-documents, self.cngrams_range, preserve_punctuation=self.preserve_punctuation
-)
-index2word = {i: w for w, i in self.cngrams_vectorizer.vocabulary_.items()}
-f_names = [f'ngram::{index2word[i]}' for i in range(len(index2word))]
-else:
-X_features, _ = _features_char_ngrams(
-documents, self.cngrams_range, vectorizer=self.cngrams_vectorizer,
-preserve_punctuation=self.preserve_punctuation
-)
-f_names = None

-if self.feature_selection_ratio < 1.:
-if self.verbose: print('feature selection')
-if fit:
-X_features, self.cngrams_selector = _feature_selection(X_features, y, self.feature_selection_ratio)
-f_names = [f_names[i] for i in self.cngrams_selector.get_support(indices=True)]
-else:
-X_features = self.cngrams_selector.transform(X_features)

-X = self._addfeatures(_tocsr(X), X_features, f_names)
-self._print(f'adding ngrams character features: {X.shape[1]} features')

-if fit:
-self.feature_names = np.asarray(self.feature_names)

-self._print(f'X shape (#documents,#features): {X.shape}')

-return X

 def _transform_parallel(self, documents, y=None, fit=False, n_jobs=-1):
 # initialize the document-by-feature vector
 X = np.empty((len(documents), 0))

 tasks = []

 # dense feature extraction functions
 if self.function_words_freq:
-tasks.append((_features_function_words_freq, (documents, self.function_words_freq)))
+tasks.append((_features_function_words_freq, {'documents': documents, 'lang': self.function_words_freq}))

 if self.conjugations_freq:
-tasks.append((_features_conjugations_freq, (documents, self.conjugations_freq)))
+tasks.append((_features_conjugations_freq, {'documents': documents, 'lang': self.conjugations_freq}))

 if self.features_Mendenhall:
-tasks.append((_features_Mendenhall, (documents, 23)))
+tasks.append((_features_Mendenhall, {'documents': documents, 'upto': 23}))

 if self.features_sentenceLengths:
-tasks.append((_features_sentenceLengths, (documents, 3, 70)))
+tasks.append((_features_sentenceLengths, {'documents': documents, 'downto': 3, 'upto': 70}))

-self._print('extracting dense features in parallel')
-outs = Parallel(n_jobs=n_jobs)(delayed(task)(*params) for task, params in tasks)
-for F, feat_names in outs:
-X = self._addfeatures(X, F, feat_names if fit else None)

 # sparse feature extraction functions
-tasks = []
 if self.wngrams:
 if not fit and self.wngrams_vectorizer is None:
 raise ValueError('transform called before fit')

-params={
+params = {
 'documents': documents,
 'vectorizer': self.wngrams_vectorizer,
 'selector': self.wngrams_selector,

@@ -557,7 +483,7 @@ class FeatureExtractor:
 if not fit and self.cngrams_vectorizer is None:
 raise ValueError('transform called before fit')

-params={
+params = {
 'documents': documents,
 'vectorizer': self.cngrams_vectorizer,
 'selector': self.cngrams_selector,

@@ -568,15 +494,22 @@ class FeatureExtractor:
 }
 tasks.append((_features_char_ngrams, params))

-self._print('extracting sparse features in parallel')
+self._print('extracting features in parallel')
 outs = Parallel(n_jobs=n_jobs)(delayed(task)(**params) for task, params in tasks)
-for F, feat_names, vectorizer, selector in outs:
-X = self._addfeatures(_tocsr(X), F, feat_names if fit else None)
-if fit:
-if self.wngrams and self.wngrams_vectorizer is None:
-self.wngrams_vectorizer, self.wngrams_selector = vectorizer, selector
-elif self.cngrams and self.cngrams_vectorizer is None:
-self.cngrams_vectorizer, self.cngrams_selector = vectorizer, selector
+# gather the tasks' outputs
+for out in outs:
+taskname = out['task']
+if taskname not in {'_wngrams_task', '_cngrams_task'}:
+X = self._addfeatures(X, out['features'], out['f_names'] if fit else None)
+else:
+X = self._addfeatures(_tocsr(X), out['features'], out['f_names'] if fit else None)
+if fit:
+vectorizer, selector = out['vectorizer'], out['selector']
+if taskname == '_wngrams_task' and self.wngrams_vectorizer is None:
+self.wngrams_vectorizer, self.wngrams_selector = vectorizer, selector
+elif taskname == '_cngrams_task' and self.cngrams_vectorizer is None:
+self.cngrams_vectorizer, self.cngrams_selector = vectorizer, selector

 if fit:
 self.feature_names = np.asarray(self.feature_names)
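For context, a minimal sketch of the dense-then-sparse column stacking that the gather loop above performs through _addfeatures and _tocsr; the helper name addfeatures_sketch and the toy feature blocks are illustrative, while hstack, csr_matrix and issparse are the scipy.sparse imports already used in this file.

import numpy as np
from scipy.sparse import csr_matrix, hstack, issparse

def addfeatures_sketch(X, F):
    # scipy's hstack accepts a mix of dense and sparse blocks and returns a sparse matrix
    if issparse(X) or issparse(F):
        return hstack([X, F]).tocsr()
    return np.hstack([X, F])

X = np.empty((3, 0))                              # document-by-feature matrix, initially empty
X = addfeatures_sketch(X, np.ones((3, 2)))        # dense block (e.g. function-word frequencies)
X = addfeatures_sketch(X, csr_matrix(np.eye(3)))  # sparse block (e.g. tf-idf n-grams)
print(X.shape)                                    # (3, 5)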
src/model.py (37 changed lines)
@@ -1,6 +1,6 @@
 from sklearn.metrics import make_scorer
 from sklearn.model_selection import GridSearchCV, LeaveOneOut, LeaveOneGroupOut, cross_val_score, StratifiedKFold
-from sklearn.linear_model import LogisticRegression
+from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
 from sklearn.svm import *
 from data.features import *
 from util.evaluation import f1, get_counters

@@ -10,29 +10,21 @@ class AuthorshipVerificator:

 def __init__(self, nfolds=10,
 params={'C': np.logspace(-4, +4, 9), 'class_weight': ['balanced', None]},
-estimator=SVC,
 author_name=None):
 self.nfolds = nfolds
 self.params = params
 self.author_name = author_name if author_name else 'this author'
-if estimator is SVC:
-self.params['kernel'] = ['linear', 'rbf']
-self.probability = True
-self.classifier = estimator(probability=self.probability)
-elif estimator is LinearSVC:
-self.probability = False
-self.classifier = estimator()
-elif estimator is LogisticRegression:
-self.probability = True
-self.classifier = LogisticRegression()
+self.classifier = LogisticRegression()

-def fit(self,X,y,groups=None):
-if not isinstance(y,np.ndarray): y=np.array(y)
+def fit(self, X, y):
+y = np.asarray(y)
 positive_examples = y.sum()
 if positive_examples >= self.nfolds:
 print('optimizing {}'.format(self.classifier.__class__.__name__))
 folds = list(StratifiedKFold(n_splits=self.nfolds).split(X, y))
-self.estimator = GridSearchCV(self.classifier, param_grid=self.params, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
+self.estimator = GridSearchCV(
+self.classifier, param_grid=self.params, cv=folds, scoring=make_scorer(f1), n_jobs=-1
+)
 else:
 self.estimator = self.classifier
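For reference, a self-contained sketch of the model selection the verificator now performs when enough positive examples are available: a LogisticRegression tuned by GridSearchCV over C and class_weight on stratified folds with an F1 scorer. The toy data are random, and sklearn's f1_score stands in for the project's util.evaluation.f1.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold

X = np.random.rand(40, 5)
y = np.array([0, 1] * 20)

params = {'C': np.logspace(-4, +4, 9), 'class_weight': ['balanced', None]}
folds = list(StratifiedKFold(n_splits=10).split(X, y))
search = GridSearchCV(LogisticRegression(), param_grid=params, cv=folds,
                      scoring=make_scorer(f1_score), n_jobs=-1)
search.fit(X, y)
print(search.best_params_)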
@@ -46,7 +38,6 @@
 return self

 def leave_one_out(self, X, y, files, groups=None, test_lowest_index_only=True, counters=False):

 if groups is None:
 print('Computing LOO without groups')
 folds = list(LeaveOneOut().split(X, y))

@@ -59,8 +50,8 @@
 folds = [(train, np.min(test, keepdims=True)) for train, test in folds]

 scores = cross_val_score(self.estimator, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
-missclassified = '\n'.join(files[scores==0].tolist())
-print(scores)
+missclassified = '\n'.join(files[scores == 0].tolist())
+print('missclassified texts:')
 print(missclassified)

 if counters and test_lowest_index_only:

@@ -73,26 +64,24 @@
 else:
 return scores.mean(), scores.std()

-def predict(self, test, epistola_name=''):
+def predict(self, test):
 pred = self.estimator.predict(test)
 full_doc_prediction = pred[0]
-print('{} is from the same author: {}'.format(epistola_name, 'Yes' if full_doc_prediction == 1 else 'No'))
 if len(pred) > 1:
 fragment_predictions = pred[1:]
 print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
 return full_doc_prediction, fragment_predictions
-return full_doc_prediction, None
+return full_doc_prediction

-def predict_proba(self, test, epistola_name=''):
+def predict_proba(self, test):
 assert hasattr(self, 'predict_proba'), 'the classifier is not calibrated'
 pred = self.estimator.predict_proba(test)
 full_doc_prediction = pred[0,1]
-print(f'{epistola_name} is from {self.author_name} with Probability {full_doc_prediction:.3f}')
 if len(pred) > 1:
 fragment_predictions = pred[1:,1]
 print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
 return full_doc_prediction, fragment_predictions
-return full_doc_prediction, None
+return full_doc_prediction, []


@@ -0,0 +1,4 @@
+def warn(*args, **kwargs):
+pass
+import warnings
+warnings.warn = warn
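The new module added above (presumably util/_hide_sklearn_warnings.py, given the import added at the top of the main script) silences warnings by monkey-patching warnings.warn with a no-op at import time; importing it early, before the sklearn-using code runs, is all that is needed. A standalone illustration of the same idea, with hypothetical file names:

# _hide_warnings.py (illustrative stand-in)
import warnings

def warn(*args, **kwargs):
    pass

warnings.warn = warn

# main.py
# import _hide_warnings                  # importing the module applies the patch
# import warnings
# warnings.warn('this is now silenced')  # prints nothing
# a less invasive alternative would be warnings.filterwarnings('ignore')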