From 843cfbe8fe646ceac7042a34252696e814ddedb2 Mon Sep 17 00:00:00 2001
From: Alex Moreo
Date: Fri, 3 Apr 2020 11:21:09 +0200
Subject: [PATCH] feature extraction fully parallelized; result log file added; cleaning

---
 src/author_identification.py       |  34 ++++--
 src/data/features.py               | 163 +++++++++--------------------
 src/model.py                       |  37 +++----
 src/util/_hide_sklearn_warnings.py |   4 +
 4 files changed, 89 insertions(+), 149 deletions(-)
 create mode 100644 src/util/_hide_sklearn_warnings.py

diff --git a/src/author_identification.py b/src/author_identification.py
index fed7377..5133808 100755
--- a/src/author_identification.py
+++ b/src/author_identification.py
@@ -1,3 +1,4 @@
+import util._hide_sklearn_warnings
 from sklearn.linear_model import LogisticRegression
 from data.dante_loader import load_latin_corpus, list_authors
 from data.features import *
@@ -14,6 +15,7 @@ AUTHORS_CORPUS_II = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'Boncompagn
 
 
 def main():
+    log = open(args.log, 'wt')
     discarded = 0
     f1_scores = []
     counters = []
@@ -30,6 +32,7 @@
         files = np.asarray(pos_files + neg_files)
         if len(positive) < 2:
             discarded += 1
+            print(f'discarding analysis for {author} which has only {len(positive)} documents')
             continue
 
         n_full_docs = len(positive) + len(negative)
@@ -53,13 +56,14 @@
         Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
 
         print('Fitting the Verificator')
-        av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
-        av.fit(Xtr, ytr, groups)
+        av = AuthorshipVerificator(nfolds=10)
+        av.fit(Xtr, ytr)
 
         if args.unknown:
            print(f'Checking for the hypothesis that {author} was the author of {args.unknown}')
            ep, ep_fragments = feature_extractor.transform(ep_text, return_fragments=True, window_size=3)
-           av.predict_proba(ep, args.unknown)
+           pred, _ = av.predict_proba(ep)
+           tee(f'{args.unknown}: Posterior probability for {author} is {pred:.3f}', log)
 
         if args.loo:
            print('Validating the Verificator (Leave-One-Out)')
@@ -68,7 +72,7 @@
            )
            f1_scores.append(f1_from_counters(tp, fp, fn, tn))
            counters.append((tp, fp, fn, tn))
-           print(f'F1 for {author} = {f1_scores[-1]:.3f}')
+           tee(f'F1 for {author} = {f1_scores[-1]:.3f}', log)
 
     if args.loo:
         print(f'Computing macro- and micro-averages (discarded {discarded}/{len(args.authors)})')
@@ -78,26 +82,35 @@
         macro_f1 = f1_scores.mean()
         micro_f1 = f1_from_counters(*counters.sum(axis=0).tolist())
 
-        print(f'Macro-F1 = {macro_f1:.3f}')
-        print(f'Micro-F1 = {micro_f1:.3f}')
+        tee(f'LOO Macro-F1 = {macro_f1:.3f}', log)
+        tee(f'LOO Micro-F1 = {micro_f1:.3f}', log)
         print()
+    log.close()
+
+def tee(msg, log):
+    print(msg)
+    log.write(f'{msg}\n')
+    log.flush()
+
 
 if __name__ == '__main__':
     import os
 
     # Training settings
     parser = argparse.ArgumentParser(description='Authorship verification for Epistola XIII')
-    parser.add_argument('corpuspath', type=str, metavar='PATH',
+    parser.add_argument('corpuspath', type=str, metavar='CORPUSPATH',
                         help=f'Path to the directory containing the corpus (documents must be named '
-                             f'_.txt')
+                             f'_.txt)')
-    parser.add_argument('positive', type=str, default="Dante",
+    parser.add_argument('positive', type=str, default="Dante", metavar='AUTHOR',
                         help= f'Positive author for the hypothesis (default "Dante"); set to "ALL" to check '
                               f'every author')
     parser.add_argument('--loo', default=False, action='store_true',
                         help='submit each binary classifier to leave-one-out validation')
     parser.add_argument('--unknown', type=str, metavar='PATH', default=None,
                         help='path to the file of unknown paternity (default None)')
+    parser.add_argument('--log', type=str, metavar='PATH', default='./results.txt',
+                        help='path to the log file where to write the results (default ./results.txt)')
 
     args = parser.parse_args()
 
@@ -110,6 +123,7 @@ if __name__ == '__main__':
         args.authors = [args.positive]
 
     assert args.unknown or args.loo, 'error: nor an unknown document, nor LOO have been requested. Nothing to do.'
-    assert args.unknown is None or os.path.exists(args.unknown), 'unknown file does not exist'
+    assert os.path.exists(args.corpuspath), f'corpus path {args.corpuspath} does not exist'
+    assert args.unknown is None or os.path.exists(args.unknown), '"unknown file" does not exist'
 
     main()
diff --git a/src/data/features.py b/src/data/features.py
index c9b25ed..8c9c4c6 100755
--- a/src/data/features.py
+++ b/src/data/features.py
@@ -6,7 +6,7 @@ from sklearn.feature_selection import chi2
 from sklearn.preprocessing import normalize
 from scipy.sparse import hstack, csr_matrix, issparse
 from nltk.corpus import stopwords
-from sklearn.externals.joblib import Parallel, delayed
+from joblib import Parallel, delayed
 
 
 latin_function_words = ['et', 'in', 'de', 'ad', 'non', 'ut', 'cum', 'per', 'a', 'sed', 'que', 'quia', 'ex', 'sic',
@@ -119,7 +119,7 @@ def _features_function_words_freq(documents, lang):
     """
     Extract features as the frequency (L1x1000) of the function words used in the documents
     :param documents: a list where each element is the text (string) of a document
-    :return: a np.array of shape (D,F) where D is len(documents) and F is len(function_words)
+    :return: a dictionary containing the resulting features, feature names, and taskname
     """
     features = []
     function_words = get_function_words(lang)
@@ -134,7 +134,7 @@
     f_names = [f'funcw::{f}' for f in function_words]
     F = np.array(features)
     print(f'task function words (#features={F.shape[1]}) [Done]')
-    return F, f_names
+    return {'features': F, 'f_names': f_names, 'task': 'functionwords'}
 
 
 def _features_conjugations_freq(documents, lang):
     """
     Extract features as the frequency (L1x1000) of the conjugations used in the documents.
     The method is heuristic, and actually searches for suffixes contained in the conjugation list.
     :param documents: a list where each element is the text (string) of a document
-    :return: a np.array of shape (D,F) where D is len(documents) and F is len(conjugations)
+    :return: a dictionary containing the resulting features, feature names, and taskname
     """
     features = []
     conjugations = get_conjugations(lang)
@@ -162,7 +162,7 @@
     f_names = [f'conj::{f}' for f in conjugations]
     F = np.array(features)
     print(f'task conjugation features (#features={F.shape[1]}) [Done]')
-    return F, f_names
+    return {'features': F, 'f_names': f_names, 'task': 'conjugations'}
 
 
 def _features_Mendenhall(documents, upto=23):
     """
     Extract features as the frequency (L1x1000) of the words' lengths used in the documents,
     following the idea behind Mendenhall's Characteristic Curve of Composition
     :param documents: a list where each element is the text (string) of a document
-    :return: a np.array of shape (D,F) where D is len(documents) and F is len(range of lengths considered)
+    :return: a dictionary containing the resulting features, feature names, and taskname
     """
     features = []
     for text in documents:
@@ -185,7 +185,7 @@
     f_names = [f'mendenhall::{c}' for c in range(1,upto)]
     F = np.array(features)
     print(f'task Mendenhall features (#features={F.shape[1]}) [Done]')
-    return F, f_names
+    return {'features': F, 'f_names': f_names, 'task': 'Mendenhall'}
 
 
 def _features_sentenceLengths(documents, downto=3, upto=70):
@@ -194,7 +194,7 @@
     :param documents: a list where each element is the text (string) of a document
     :param downto: minimal length considered
     :param upto: maximum length considered
-    :return: a np.array of shape (D,F) where D is len(documents) and F is len(range of lengths considered)
+    :return: a dictionary containing the resulting features, feature names, and taskname
     """
     features = []
     for text in documents:
@@ -212,15 +212,14 @@
     f_names = [f'sentlength::{c}' for c in range(downto, upto)]
     F = np.array(features)
     print(f'task sentence lengths (#features={F.shape[1]}) [Done]')
-    return F, f_names
+    return {'features': F, 'f_names': f_names, 'task': 'sentlength'}
 
 
 def _features_word_ngrams(documents, vectorizer=None, selector=None, y=None, feat_sel_ratio=1., min_df=1, ngrams=(1, 1)):
     """
     Extract features as tfidf matrix extracted from the documents
     :param documents: a list where each element is the text (string) of a document
-    :return: a tuple M,V, where M is an np.array of shape (D,F), with D being the len(documents) and F the number of
-        distinct words; and V is the TfidfVectorizer already fit
+    :return: a dictionary containing the resulting features, feature names, taskname, the vectorizer and the selector
     """
     if vectorizer is None:
         vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=min_df, ngram_range=ngrams)
@@ -238,7 +237,13 @@
        f_names = [f_names[i] for i in selector.get_support(indices=True)]
     print(f'task ngrams and feature selection (#features={features.shape[1]}) [Done]')
-    return features, f_names, vectorizer, selector
+    return {
+        'features': features,
+        'f_names': f_names,
+        'task': '_wngrams_task',
+        'vectorizer': vectorizer,
+        'selector': selector
+    }
 
 
 def _features_char_ngrams(documents, vectorizer=None, selector=None, y=None, feat_sel_ratio=1.,
                           min_df=10, preserve_punctuation=True, ngrams=[4, 5]):
@@ -253,24 +258,27 @@ def _features_char_ngrams(documents, vectorizer=None, selector=None, y=None, fea
     :param vectorizer: the tfidf_vectorizer to use if already fit; if None, a new one will be instantiated and fit
     :param min_df: minumum number of occurrences needed for the ngram to be taken
     :param preserve_punctuation: whether or not to preserve punctuation marks
-    :return: see _features_tfidf
+    :return: a dictionary containing the resulting features, feature names, taskname, the vectorizer and the selector
     """
     doc_ngrams = ngrams_extractor(documents, ngrams, preserve_punctuation)
-    return _features_word_ngrams(
+    outs = _features_word_ngrams(
         doc_ngrams, vectorizer=vectorizer, selector=selector, y=y, feat_sel_ratio=feat_sel_ratio, min_df=min_df
     )
+    outs['task'] = '_cngrams_task'
+    return outs
 
 
 def ngrams_extractor(documents, ns=[4, 5], preserve_punctuation=True):
-    if not isinstance(ns, list): ns=[ns]
+    if not isinstance(ns, list):
+        ns = [ns]
     ns = sorted(np.unique(ns).tolist())
 
     list_ngrams = []
     for doc in documents:
-        if preserve_punctuation == False:
+        if not preserve_punctuation:
             doc = ' '.join(tokenize(doc))
 
         doc_ngrams = []
         for ni in ns:
@@ -287,7 +295,7 @@ def _feature_selection(X, y, tfidf_feat_selection_ratio):
     :param X: a document by (sparse) features matrix
     :param y: the supervised ndarray containing the class labels
     :param tfidf_feat_selection_ratio: a proportion of features to be taken
-    :return: the reduced matrix and the feature selector fit
+    :return: the fitted feature selector
     """
     nF = X.shape[1]
     num_feats = int(tfidf_feat_selection_ratio * nF)
@@ -321,7 +329,7 @@ class FeatureExtractor:
                  window_size=5,
                  verbose=True):
         """
-        Applies stlystic feature extraction. Features include:
+        Applies stylistic feature extraction. Features include:
         :param function_words_freq: add the frequency of function words as features
         :param conjugations_freq: add the frequency of regular conjugations as features
         :param features_Mendenhall: add the frequencies of the words' lengths as features
@@ -437,113 +445,31 @@ class FeatureExtractor:
                 self.feature_names = []
             self.feature_names.extend(feat_names)
 
 
-    def _transform(self, documents, y=None, fit=False):
-        # initialize the document-by-feature vector
-        X = np.empty((len(documents), 0))
-
-        # dense feature extraction functions
-        if self.function_words_freq:
-            F, f_names = _features_function_words_freq(documents, self.function_words_freq)
-            X = self._addfeatures(X, F, f_names if fit else None)
-            self._print(f'adding function words features: {X.shape[1]} features')
-
-        if self.conjugations_freq:
-            F, f_names = _features_conjugations_freq(documents, self.conjugations_freq)
-            X = self._addfeatures(X, F, f_names if fit else None)
-            self._print(f'adding conjugation features: {X.shape[1]} features')
-
-        if self.features_Mendenhall:
-            F, f_names = _features_Mendenhall(documents)
-            X = self._addfeatures(X, F, f_names if fit else None)
-            self._print(f'adding Mendenhall words features: {X.shape[1]} features')
-
-        if self.features_sentenceLengths:
-            F, f_names = _features_sentenceLengths(documents)
-            X = self._addfeatures(X, F, f_names if fit else None)
-            self._print(f'adding sentence lengths features: {X.shape[1]} features')
-
-        # sparse feature extraction functions
-        if self.wngrams:
-            if fit:
-                X_features, self.wngrams_vectorizer = _features_word_ngrams(documents, ngrams=self.wngrams_range)
-                index2word = {i: w for w, i in self.wngrams_vectorizer.vocabulary_.items()}
-                f_names = [f'tfidf::{index2word[i]}' for i in range(len(index2word))]
-            else:
-                X_features, _ = _features_word_ngrams(documents, self.wngrams_vectorizer)
-                f_names = None
-
-            if self.feature_selection_ratio < 1.:
-                if self.verbose: print('feature selection')
-                if fit:
-                    X_features, self.feat_sel_tfidf = _feature_selection(X_features, y, self.feature_selection_ratio)
-                    f_names = [f_names[i] for i in self.feat_sel_tfidf.get_support(indices=True)]
-                else:
-                    X_features = self.feat_sel_tfidf.transform(X_features)
-            X = self._addfeatures(_tocsr(X), X_features, f_names)
-            self._print(f'adding tfidf words features: {X.shape[1]} features')
-
-        if self.cngrams:
-            if fit:
-                X_features, self.cngrams_vectorizer = _features_char_ngrams(
-                    documents, self.cngrams_range, preserve_punctuation=self.preserve_punctuation
-                )
-                index2word = {i: w for w, i in self.cngrams_vectorizer.vocabulary_.items()}
-                f_names = [f'ngram::{index2word[i]}' for i in range(len(index2word))]
-            else:
-                X_features, _ = _features_char_ngrams(
-                    documents, self.cngrams_range, vectorizer=self.cngrams_vectorizer,
-                    preserve_punctuation=self.preserve_punctuation
-                )
-                f_names = None
-
-            if self.feature_selection_ratio < 1.:
-                if self.verbose: print('feature selection')
-                if fit:
-                    X_features, self.cngrams_selector = _feature_selection(X_features, y, self.feature_selection_ratio)
-                    f_names = [f_names[i] for i in self.cngrams_selector.get_support(indices=True)]
-                else:
-                    X_features = self.cngrams_selector.transform(X_features)
-
-            X = self._addfeatures(_tocsr(X), X_features, f_names)
-            self._print(f'adding ngrams character features: {X.shape[1]} features')
-
-        if fit:
-            self.feature_names = np.asarray(self.feature_names)
-
-        self._print(f'X shape (#documents,#features): {X.shape}')
-
-        return X
-
     def _transform_parallel(self, documents, y=None, fit=False, n_jobs=-1):
         # initialize the document-by-feature vector
         X = np.empty((len(documents), 0))
 
         tasks = []
+        # dense feature extraction functions
         if self.function_words_freq:
-            tasks.append((_features_function_words_freq, (documents, self.function_words_freq)))
+            tasks.append((_features_function_words_freq, {'documents': documents, 'lang': self.function_words_freq}))
 
         if self.conjugations_freq:
-            tasks.append((_features_conjugations_freq, (documents, self.conjugations_freq)))
+            tasks.append((_features_conjugations_freq, {'documents': documents, 'lang': self.conjugations_freq}))
 
         if self.features_Mendenhall:
-            tasks.append((_features_Mendenhall, (documents, 23)))
+            tasks.append((_features_Mendenhall, {'documents': documents, 'upto': 23}))
 
         if self.features_sentenceLengths:
-            tasks.append((_features_sentenceLengths, (documents, 3, 70)))
-
-        self._print('extracting dense features in parallel')
-        outs = Parallel(n_jobs=n_jobs)(delayed(task)(*params) for task, params in tasks)
-        for F, feat_names in outs:
-            X = self._addfeatures(X, F, feat_names if fit else None)
+            tasks.append((_features_sentenceLengths, {'documents': documents, 'downto': 3, 'upto': 70}))
 
         # sparse feature extraction functions
-        tasks = []
         if self.wngrams:
             if not fit and self.wngrams_vectorizer is None:
                 raise ValueError('transform called before fit')
 
-            params={
+            params = {
                 'documents': documents,
                 'vectorizer': self.wngrams_vectorizer,
                 'selector': self.wngrams_selector,
@@ -557,7 +483,7 @@ class FeatureExtractor:
             if not fit and self.cngrams_vectorizer is None:
                 raise ValueError('transform called before fit')
 
-            params={
+            params = {
                 'documents': documents,
                 'vectorizer': self.cngrams_vectorizer,
                 'selector': self.cngrams_selector,
@@ -568,15 +494,22 @@ class FeatureExtractor:
             }
             tasks.append((_features_char_ngrams, params))
 
-        self._print('extracting sparse features in parallel')
+        self._print('extracting features in parallel')
         outs = Parallel(n_jobs=n_jobs)(delayed(task)(**params) for task, params in tasks)
-        for F, feat_names, vectorizer, selector in outs:
-            X = self._addfeatures(_tocsr(X), F, feat_names if fit else None)
-            if fit:
-                if self.wngrams and self.wngrams_vectorizer is None:
-                    self.wngrams_vectorizer, self.wngrams_selector = vectorizer, selector
-                elif self.cngrams and self.cngrams_vectorizer is None:
-                    self.cngrams_vectorizer, self.cngrams_selector = vectorizer, selector
+
+        # gather the tasks' outputs
+        for out in outs:
+            taskname = out['task']
+            if taskname not in {'_wngrams_task', '_cngrams_task'}:
+                X = self._addfeatures(X, out['features'], out['f_names'] if fit else None)
+            else:
+                X = self._addfeatures(_tocsr(X), out['features'], out['f_names'] if fit else None)
+                if fit:
+                    vectorizer, selector = out['vectorizer'], out['selector']
+                    if taskname == '_wngrams_task' and self.wngrams_vectorizer is None:
+                        self.wngrams_vectorizer, self.wngrams_selector = vectorizer, selector
+                    elif taskname == '_cngrams_task' and self.cngrams_vectorizer is None:
+                        self.cngrams_vectorizer, self.cngrams_selector = vectorizer, selector
 
         if fit:
             self.feature_names = np.asarray(self.feature_names)
diff --git a/src/model.py b/src/model.py
index 54bfb94..137fbbe 100755
--- a/src/model.py
+++ b/src/model.py
@@ -1,6 +1,6 @@
 from sklearn.metrics import make_scorer
 from sklearn.model_selection import GridSearchCV, LeaveOneOut, LeaveOneGroupOut, cross_val_score, StratifiedKFold
-from sklearn.linear_model import LogisticRegression
+from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
 from sklearn.svm import *
 from data.features import *
 from util.evaluation import f1, get_counters
@@ -10,29 +10,21 @@ class AuthorshipVerificator:
 
     def __init__(self, nfolds=10,
                  params={'C': np.logspace(-4, +4, 9), 'class_weight': ['balanced', None]},
-                 estimator=SVC,
                  author_name=None):
         self.nfolds = nfolds
         self.params = params
         self.author_name = author_name if author_name else 'this author'
-        if estimator is SVC:
-            self.params['kernel'] = ['linear', 'rbf']
-            self.probability = True
-            self.classifier = estimator(probability=self.probability)
-        elif estimator is LinearSVC:
-            self.probability = False
-            self.classifier = estimator()
-        elif estimator is LogisticRegression:
-            self.probability = True
-            self.classifier = LogisticRegression()
+        self.classifier = LogisticRegression()
 
-    def fit(self,X,y,groups=None):
-        if not isinstance(y,np.ndarray): y=np.array(y)
+    def fit(self, X, y):
+        y = np.asarray(y)
         positive_examples = y.sum()
         if positive_examples >= self.nfolds:
             print('optimizing {}'.format(self.classifier.__class__.__name__))
             folds = list(StratifiedKFold(n_splits=self.nfolds).split(X, y))
-            self.estimator = GridSearchCV(self.classifier, param_grid=self.params, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
+            self.estimator = GridSearchCV(
+                self.classifier, param_grid=self.params, cv=folds, scoring=make_scorer(f1), n_jobs=-1
+            )
         else:
             self.estimator = self.classifier
@@ -46,7 +38,6 @@ class AuthorshipVerificator:
         return self
 
     def leave_one_out(self, X, y, files, groups=None, test_lowest_index_only=True, counters=False):
-
         if groups is None:
             print('Computing LOO without groups')
             folds = list(LeaveOneOut().split(X, y))
@@ -59,8 +50,8 @@ class AuthorshipVerificator:
             folds = [(train, np.min(test, keepdims=True)) for train, test in folds]
 
         scores = cross_val_score(self.estimator, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
-        missclassified = '\n'.join(files[scores==0].tolist())
-        print(scores)
+        missclassified = '\n'.join(files[scores == 0].tolist())
+        print('misclassified texts:')
         print(missclassified)
 
         if counters and test_lowest_index_only:
@@ -73,26 +64,24 @@ class AuthorshipVerificator:
         else:
             return scores.mean(), scores.std()
 
-    def predict(self, test, epistola_name=''):
+    def predict(self, test):
         pred = self.estimator.predict(test)
         full_doc_prediction = pred[0]
-        print('{} is from the same author: {}'.format(epistola_name, 'Yes' if full_doc_prediction == 1 else 'No'))
         if len(pred) > 1:
             fragment_predictions = pred[1:]
             print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
             return full_doc_prediction, fragment_predictions
-        return full_doc_prediction, None
+        return full_doc_prediction
 
-    def predict_proba(self, test, epistola_name=''):
+    def predict_proba(self, test):
         assert hasattr(self, 'predict_proba'), 'the classifier is not calibrated'
         pred = self.estimator.predict_proba(test)
         full_doc_prediction = pred[0,1]
-        print(f'{epistola_name} is from {self.author_name} with Probability {full_doc_prediction:.3f}')
         if len(pred) > 1:
             fragment_predictions = pred[1:,1]
             print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
             return full_doc_prediction, fragment_predictions
-        return full_doc_prediction, None
+        return full_doc_prediction, []
diff --git a/src/util/_hide_sklearn_warnings.py b/src/util/_hide_sklearn_warnings.py
new file mode 100644
index 0000000..02fb2de
--- /dev/null
+++ b/src/util/_hide_sklearn_warnings.py
@@ -0,0 +1,4 @@
+def warn(*args, **kwargs):
+    pass
+import warnings
+warnings.warn = warn
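
Note (not part of the patch): the refactored _transform_parallel relies on a single joblib pattern, in which every feature-extraction task is submitted with keyword arguments and returns a dictionary carrying its features, feature names and a task identifier, so the outputs can be gathered uniformly after the parallel run. Below is a minimal, self-contained sketch of that pattern under those assumptions; the two toy task functions (_task_doc_lengths, _task_token_counts) are illustrative stand-ins, not functions from this repository.

import numpy as np
from joblib import Parallel, delayed

def _task_doc_lengths(documents):
    # toy feature: document length in characters
    F = np.array([[len(d)] for d in documents], dtype=float)
    return {'features': F, 'f_names': ['doclen'], 'task': 'doclen'}

def _task_token_counts(documents):
    # toy feature: number of whitespace-separated tokens
    F = np.array([[len(d.split())] for d in documents], dtype=float)
    return {'features': F, 'f_names': ['ntokens'], 'task': 'ntokens'}

docs = ['et in de ad non', 'ut cum per a sed que quia']
tasks = [(_task_doc_lengths, {'documents': docs}),
         (_task_token_counts, {'documents': docs})]

# run every task in parallel, then gather the returned dictionaries
outs = Parallel(n_jobs=-1)(delayed(task)(**params) for task, params in tasks)
X = np.hstack([out['features'] for out in outs])
names = [name for out in outs for name in out['f_names']]
print(X.shape, names)   # (2, 2) ['doclen', 'ntokens']

With the patch applied, a run that exercises the new --log option would look like: python src/author_identification.py /path/to/corpus ALL --loo --log ./results.txt (the corpus path is illustrative; the directory must follow the naming scheme described in the script's help text).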