import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import normalize
from scipy.sparse import hstack, csr_matrix, issparse
from nltk.corpus import stopwords
from joblib import Parallel, delayed


latin_function_words = ['et', 'in', 'de', 'ad', 'non', 'ut', 'cum', 'per', 'a', 'sed', 'que', 'quia', 'ex', 'sic',
                        'si', 'etiam', 'idest', 'nam', 'unde', 'ab', 'uel', 'sicut', 'ita', 'enim', 'scilicet',
                        'nec', 'pro', 'autem', 'ibi', 'dum', 'uero', 'tamen', 'inter', 'ideo', 'propter', 'contra',
                        'sub', 'quomodo', 'ubi', 'super', 'iam', 'tam', 'hec', 'post', 'quasi', 'ergo', 'inde', 'e',
                        'tunc', 'atque', 'ac', 'sine', 'nisi', 'nunc', 'quando', 'ne', 'usque', 'siue', 'aut',
                        'igitur', 'circa', 'quidem', 'supra', 'ante', 'adhuc', 'seu', 'apud', 'olim', 'statim',
                        'satis', 'ob', 'quoniam', 'postea', 'nunquam']

latin_conjugations = ['o', 'eo', 'io', 'as', 'es', 'is', 'at', 'et', 'it', 'amus', 'emus', 'imus', 'atis', 'etis',
                      'itis', 'ant', 'ent', 'unt', 'iunt',
                      'or', 'eor', 'ior', 'aris', 'eris', 'iris', 'atur', 'etur', 'itur', 'amur', 'emur', 'imur',
                      'amini', 'emini', 'imini', 'antur', 'entur', 'untur', 'iuntur',
                      'abam', 'ebam', 'iebam', 'abas', 'ebas', 'iebas', 'abat', 'ebat', 'iebat', 'abamus', 'ebamus',
                      'iebamus', 'abatis', 'ebatis', 'iebatis', 'abant', 'ebant', 'iebant',
                      'abar', 'ebar', 'iebar', 'abaris', 'ebaris', 'iebaris', 'abatur', 'ebatur', 'iebatur',
                      'abamur', 'ebamur', 'iebamur', 'abamini', 'ebamini', 'iebamini', 'abantur', 'ebantur',
                      'iebantur',
                      'abo', 'ebo', 'am', 'iam', 'abis', 'ebis', 'ies', 'abit', 'ebit', 'iet', 'abimus', 'ebimus',
                      'emus', 'iemus', 'abitis', 'ebitis', 'ietis', 'abunt', 'ebunt', 'ient',
                      'abor', 'ebor', 'ar', 'iar', 'aberis', 'eberis', 'ieris', 'abitur', 'ebitur', 'ietur',
                      'abimur', 'ebimur', 'iemur', 'abimini', 'ebimini', 'iemini', 'abuntur', 'ebuntur', 'ientur',
                      'i', 'isti', 'it', 'istis', 'erunt',
                      'em', 'eam', 'eas', 'ias', 'eat', 'iat', 'eamus', 'iamus', 'eatis', 'iatis', 'eant', 'iant',
                      'er', 'ear', 'earis', 'iaris', 'eatur', 'iatur', 'eamur', 'iamur', 'eamini', 'iamini',
                      'eantur', 'iantur',
                      'rem', 'res', 'ret', 'remus', 'retis', 'rent', 'rer', 'reris', 'retur', 'remur', 'remini',
                      'rentur',
                      'erim', 'issem', 'isses', 'isset', 'issemus', 'issetis', 'issent',
                      'a', 'ate', 'e', 'ete', 'ite', 'are', 'ere', 'ire', 'ato', 'eto', 'ito', 'atote', 'etote',
                      'itote', 'anto', 'ento', 'unto', 'iunto',
                      'ator', 'etor', 'itor', 'aminor', 'eminor', 'iminor', 'antor', 'entor', 'untor', 'iuntor',
                      'ari', 'eri', 'iri', 'andi', 'ando', 'andum', 'andus', 'ande', 'ans', 'antis', 'anti',
                      'antem', 'antes', 'antium', 'antibus', 'antia',
                      'esse', 'sum', 'est', 'sumus', 'estis', 'sunt', 'eram', 'eras', 'erat', 'eramus', 'eratis',
                      'erant',
                      'ero', 'eris', 'erit', 'erimus', 'eritis', 'erint', 'sim', 'sis', 'sit', 'simus', 'sitis',
                      'sint',
                      'essem', 'esses', 'esset', 'essemus', 'essetis', 'essent', 'fui', 'fuisti', 'fuit', 'fuimus',
                      'fuistis', 'fuerunt',
                      'este', 'esto', 'estote', 'sunto']


def get_function_words(lang):
    if lang == 'latin':
        return latin_function_words
    elif lang in ['english', 'spanish']:
        return stopwords.words(lang)
    else:
        raise ValueError('{} not in scope!'.format(lang))


def get_conjugations(lang):
    if lang == 'latin':
        return latin_conjugations
    else:
        raise ValueError('conjugations for languages other than Latin are not yet supported')
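
# Illustrative check of the vocabulary helpers above (shown as a doctest-style comment):
# 'latin' returns the hand-crafted list, while 'english' and 'spanish' rely on NLTK's
# stopword corpus (which requires nltk.download('stopwords') to be available).
#
#   >>> get_function_words('latin')[:5]
#   ['et', 'in', 'de', 'ad', 'non']
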
# ------------------------------------------------------------------------
# split policies
# ------------------------------------------------------------------------
def split_by_endline(text):
    return [t.strip() for t in text.split('\n') if t.strip()]


def split_by_sentences(text):
    sentences = [t.strip() for t in nltk.tokenize.sent_tokenize(text) if t.strip()]
    for i, sentence in enumerate(sentences):
        unmod_tokens = nltk.tokenize.word_tokenize(sentence)
        mod_tokens = [token for token in unmod_tokens if any(char.isalpha() for char in token)]
        if len(mod_tokens) < 8:
            if i < len(sentences) - 1:
                sentences[i + 1] = sentences[i] + ' ' + sentences[i + 1]
            else:
                sentences[i - 1] = sentences[i - 1] + ' ' + sentences[i]
            sentences.pop(i)
    return sentences


def windows(text_fragments, window_size):
    new_fragments = []
    nbatches = len(text_fragments) // window_size
    if len(text_fragments) % window_size > 0:
        nbatches += 1
    for i in range(nbatches):
        offset = i * window_size
        new_fragments.append(' '.join(text_fragments[offset:offset + window_size]))
    return new_fragments


def splitter(documents, authors=None, split_policy=split_by_sentences, window_size=1):
    fragments = []
    authors_fragments = []
    groups = []
    for i, text in enumerate(documents):
        text_fragments = split_policy(text)
        text_fragments = windows(text_fragments, window_size=window_size)
        fragments.extend(text_fragments)
        groups.extend([i] * len(text_fragments))
        if authors is not None:
            authors_fragments.extend([authors[i]] * len(text_fragments))
    if authors is not None:
        return fragments, authors_fragments, groups
    return fragments, groups


def tokenize(text):
    unmod_tokens = nltk.word_tokenize(text)
    return [token.lower() for token in unmod_tokens if any(char.isalpha() for char in token)]
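
# Illustrative sketch of how the splitting helpers compose (the document string is a
# made-up placeholder, and sentence splitting assumes NLTK's 'punkt' models are installed):
# each text is split into sentences, consecutive sentences are then merged into windows
# of `window_size` sentences, and the source document of every fragment is tracked.
#
#   frags, groups = splitter(['Some long document ...'], split_policy=split_by_sentences, window_size=3)
#   # groups[i] is the index of the original document that fragment i was taken from
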
# ------------------------------------------------------------------------
# feature extraction methods
# ------------------------------------------------------------------------
def _features_function_words_freq(documents, lang):
    """
    Extract features as the frequency (L1x1000) of the function words used in the documents
    :param documents: a list where each element is the text (string) of a document
    :return: a np.array of shape (D,F) where D is len(documents) and F is len(function_words)
    """
    features = []
    function_words = get_function_words(lang)
    for text in documents:
        mod_tokens = tokenize(text)
        freqs = nltk.FreqDist(mod_tokens)
        nwords = len(mod_tokens)
        funct_words_freq = [1000. * freqs[function_word] / nwords for function_word in function_words]
        features.append(funct_words_freq)
    f_names = [f'funcw::{f}' for f in function_words]
    F = np.array(features)
    print(f'task function words (#features={F.shape[1]}) [Done]')
    return F, f_names


def _features_conjugations_freq(documents, lang):
    """
    Extract features as the frequency (L1x1000) of the conjugations used in the documents.
    The method is heuristic: it searches for word suffixes contained in the conjugation list.
    :param documents: a list where each element is the text (string) of a document
    :return: a np.array of shape (D,F) where D is len(documents) and F is len(conjugations)
    """
    features = []
    conjugations = get_conjugations(lang)
    for text in documents:
        mod_tokens = tokenize(text)
        conjugation_tokens = []
        for conjugation in conjugations:
            conjugation_tokens.extend(
                [conjugation for token in mod_tokens if token.endswith(conjugation) and len(token) > len(conjugation)]
            )
        freqs = nltk.FreqDist(conjugation_tokens)
        nwords = len(mod_tokens)
        conjugation_freq = [1000. * freqs[conjugation] / nwords for conjugation in conjugations]
        features.append(conjugation_freq)
    f_names = [f'conj::{f}' for f in conjugations]
    F = np.array(features)
    print(f'task conjugation features (#features={F.shape[1]}) [Done]')
    return F, f_names


def _features_Mendenhall(documents, upto=23):
    """
    Extract features as the frequency (L1x1000) of the word lengths used in the documents,
    following the idea behind Mendenhall's Characteristic Curve of Composition
    :param documents: a list where each element is the text (string) of a document
    :return: a np.array of shape (D,F) where D is len(documents) and F is len(range of lengths considered)
    """
    features = []
    for text in documents:
        mod_tokens = tokenize(text)
        nwords = len(mod_tokens)
        tokens_len = [len(token) for token in mod_tokens]
        tokens_count = []
        for i in range(1, upto):
            tokens_count.append(1000. * (sum(j >= i for j in tokens_len)) / nwords)
        features.append(tokens_count)
    f_names = [f'mendenhall::{c}' for c in range(1, upto)]
    F = np.array(features)
    print(f'task Mendenhall features (#features={F.shape[1]}) [Done]')
    return F, f_names


def _features_sentenceLengths(documents, downto=3, upto=70):
    """
    Extract features as the lengths of the sentences, i.e., the number of words in each sentence.
    :param documents: a list where each element is the text (string) of a document
    :param downto: minimal length considered
    :param upto: maximum length considered
    :return: a np.array of shape (D,F) where D is len(documents) and F is len(range of lengths considered)
    """
    features = []
    for text in documents:
        sentences = [t.strip() for t in nltk.tokenize.sent_tokenize(text) if t.strip()]
        nsent = len(sentences)
        sent_len = []
        sent_count = []
        for sentence in sentences:
            mod_tokens = tokenize(sentence)
            sent_len.append(len(mod_tokens))
        for i in range(downto, upto):
            sent_count.append(1000. * (sum(j >= i for j in sent_len)) / nsent)
        features.append(sent_count)
    f_names = [f'sentlength::{c}' for c in range(downto, upto)]
    F = np.array(features)
    print(f'task sentence lengths (#features={F.shape[1]}) [Done]')
    return F, f_names


def _features_word_ngrams(documents, vectorizer=None, selector=None, y=None, feat_sel_ratio=1., min_df=1,
                          ngrams=(1, 1)):
    """
    Extract features as a tfidf matrix computed from the documents
    :param documents: a list where each element is the text (string) of a document
    :return: a tuple (X, f_names, vectorizer, selector) where X is a (sparse) matrix of shape (D,F), with D being
        len(documents) and F the number of distinct n-grams retained; f_names is the list of feature names; and
        vectorizer and selector are the fitted TfidfVectorizer and the (optional) fitted feature selector
    """
    if vectorizer is None:
        vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=min_df, ngram_range=ngrams)
        vectorizer.fit(documents)
    features = vectorizer.transform(documents)
    index2word = {i: w for w, i in vectorizer.vocabulary_.items()}
    f_names = [f'tfidf::{index2word[i]}' for i in range(len(index2word))]
    if feat_sel_ratio < 1.:
        if selector is None:
            selector = _feature_selection(features, y, feat_sel_ratio)
        features = selector.transform(features)
        f_names = [f_names[i] for i in selector.get_support(indices=True)]
    print(f'task ngrams and feature selection (#features={features.shape[1]}) [Done]')
    return features, f_names, vectorizer, selector
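
# Illustrative use of the tfidf word n-gram extractor (train_docs, test_docs and y_train
# are hypothetical variables; 0.5 is an arbitrary selection ratio): at fit time a new
# vectorizer and chi-squared selector are created, and both are reused at transform time.
#
#   X_tr, names, vect, sel = _features_word_ngrams(train_docs, y=y_train, feat_sel_ratio=0.5, ngrams=(1, 2))
#   X_te, _, _, _ = _features_word_ngrams(test_docs, vectorizer=vect, selector=sel, feat_sel_ratio=0.5)
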
def _features_char_ngrams(documents, vectorizer=None, selector=None, y=None, feat_sel_ratio=1., min_df=10,
                          preserve_punctuation=True, ngrams=[4, 5]):
    """
    Extract char n-grams.
    This implementation is generic, following Sapkota et al. (ref [39] in the PAN 2015 overview), i.e., retaining
    punctuation marks. However, this does not apply to Latin texts, for which punctuation marks are filtered out.
    More recently, it was shown that character n-grams corresponding to word affixes and including punctuation
    marks are the most significant features in cross-topic authorship attribution [57].
    :param documents: a list where each element is the text (string) of a document
    :param ngrams: the lengths (n) for which n-gram frequencies will be computed
    :param vectorizer: the tfidf vectorizer to use if already fit; if None, a new one will be instantiated and fit
    :param min_df: minimum number of occurrences needed for an n-gram to be retained
    :param preserve_punctuation: whether or not to preserve punctuation marks
    :return: see _features_word_ngrams
    """
    doc_ngrams = ngrams_extractor(documents, ngrams, preserve_punctuation)
    return _features_word_ngrams(
        doc_ngrams, vectorizer=vectorizer, selector=selector, y=y, feat_sel_ratio=feat_sel_ratio, min_df=min_df
    )


def ngrams_extractor(documents, ns=[4, 5], preserve_punctuation=True):
    if not isinstance(ns, list):
        ns = [ns]
    ns = sorted(np.unique(ns).tolist())
    list_ngrams = []
    for doc in documents:
        if not preserve_punctuation:
            doc = ' '.join(tokenize(doc))
        doc_ngrams = []
        for ni in ns:
            doc_ngrams.extend([doc[i:i + ni].replace(' ', '_') for i in range(len(doc) - ni + 1)])
        list_ngrams.append(' '.join(doc_ngrams))
    return list_ngrams


def _feature_selection(X, y, tfidf_feat_selection_ratio):
    """
    Filter-style feature selection based on the chi-squared test as the term-reduction function
    :param X: a document-by-feature (sparse) matrix
    :param y: the supervised ndarray containing the class labels
    :param tfidf_feat_selection_ratio: the proportion of features to be retained
    :return: the fitted feature selector
    """
    nF = X.shape[1]
    num_feats = int(tfidf_feat_selection_ratio * nF)
    feature_selector = SelectKBest(chi2, k=num_feats)
    return feature_selector.fit(X, y)


def _tocsr(X):
    """ Converts a dense matrix into a sparse one """
    return X if issparse(X) else csr_matrix(X)
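
# Example of ngrams_extractor on a toy string: character 4-grams are taken over the raw
# text (punctuation preserved by default) and spaces are rewritten as '_'.
#
#   >>> ngrams_extractor(['amor dei'], ns=[4])
#   ['amor mor_ or_d r_de _dei']
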
class FeatureExtractor:
    """
    A feature extractor for authorship analysis applications, implemented as a transformer
    """

    def __init__(self,
                 function_words_freq=None,
                 conjugations_freq=None,
                 features_Mendenhall=True,
                 features_sentenceLengths=True,
                 wordngrams=False,
                 feature_selection_ratio=1.,
                 n_wordngrams=(1, 1),
                 charngrams=False,
                 n_charngrams=[4, 5],
                 preserve_punctuation=True,
                 split_documents=False,
                 split_policy=split_by_endline,
                 normalize_features=True,
                 window_size=5,
                 verbose=True):
        """
        Applies stylistic feature extraction. Features include:
        :param function_words_freq: the language (e.g., 'latin') whose function-word frequencies are added as
            features; None disables them
        :param conjugations_freq: the language (e.g., 'latin') whose conjugation-suffix frequencies are added as
            features; None disables them
        :param features_Mendenhall: add the frequencies of the word lengths as features
        :param features_sentenceLengths: add the frequencies of the sentence lengths as features
        :param wordngrams: add tfidf-weighted word n-grams as features
        :param feature_selection_ratio: if less than 1, indicates the ratio of most important features (according
            to the chi-squared test) to be selected
        :param n_wordngrams: a tuple (min,max) indicating the range of lengths for word n-grams
        :param charngrams: add tfidf-weighted char n-grams as features
        :param n_charngrams: a list with the lengths (n) of the char n-grams to be computed (e.g., [4, 5])
        :param preserve_punctuation: whether or not to preserve punctuation marks (should be deactivated for
            medieval Latin texts)
        :param split_documents: whether to split texts into smaller documents or not (currently, the default
            policy is to split by '\n').
            The fragments resulting from the split are added to the pool of documents (i.e., they do not replace
            the full documents, which are retained anyway).
        :param split_policy: a callable that implements the split to be applied (ignored if split_documents=False)
        :param normalize_features: whether or not to L2-normalize (row-wise) each feature block before stacking
        :param window_size: the size of the window in case of a sliding-windows policy
        :param verbose: whether or not to print information to stdout
        """
        self.function_words_freq = function_words_freq
        self.conjugations_freq = conjugations_freq
        self.features_Mendenhall = features_Mendenhall
        self.features_sentenceLengths = features_sentenceLengths
        self.wngrams = wordngrams
        self.feature_selection_ratio = feature_selection_ratio
        self.wngrams_range = n_wordngrams
        self.cngrams = charngrams
        self.cngrams_range = n_charngrams
        self.preserve_punctuation = preserve_punctuation
        self.split_documents = split_documents
        self.split_policy = split_policy
        self.normalize_features = normalize_features
        self.window_size = window_size
        self.verbose = verbose
        self.feature_names = None
        self.wngrams_vectorizer = self.wngrams_selector = None
        self.cngrams_vectorizer = self.cngrams_selector = None

    def fit_transform(self, positives, negatives):
        documents = positives + negatives
        authors = [1] * len(positives) + [0] * len(negatives)
        n_original_docs = len(documents)
        groups = list(range(n_original_docs))

        if self.split_documents:
            doc_fragments, authors_fragments, groups_fragments = splitter(
                documents, authors, split_policy=self.split_policy, window_size=self.window_size
            )
            documents.extend(doc_fragments)
            authors.extend(authors_fragments)
            groups.extend(groups_fragments)
            self._print(f'splitting documents: {len(doc_fragments)} segments + '
                        f'{n_original_docs} documents = '
                        f'{len(documents)} total')

        # represent the target vector
        y = np.array(authors)
        groups = np.array(groups)

        X = self._transform_parallel(documents, y, fit=True)

        if self.verbose:
            print(
                f'load_documents: function_words_freq={self.function_words_freq} '
                f'features_Mendenhall={self.features_Mendenhall} tfidf={self.wngrams} '
                f'split_documents={self.split_documents}, split_policy={self.split_policy.__name__}'
            )
            print(f'number of training (full) documents: {n_original_docs}')
            print(f'y prevalence: {y.sum()}/{len(y)} {y.mean() * 100:.2f}%')
            print()

        return X, y, groups

    def transform(self, test, return_fragments=False, window_size=-1, avoid_splitting=False):
        if isinstance(test, str):
            test = [test]
        if window_size == -1:
            window_size = self.window_size

        if self.split_documents and not avoid_splitting:
            tests, _ = splitter(test, split_policy=self.split_policy, window_size=window_size)
            test.extend(tests)

        old_verbose = self.verbose
        self.verbose = False
        TEST = self._transform_parallel(test, fit=False)
        self.verbose = old_verbose

        if return_fragments:
            return TEST, test[1:]
        else:
            return TEST

    def _addfeatures(self, X, F, feat_names=None):
        if self.normalize_features:
            F = normalize(F, axis=1, copy=False)
        self._register_feature_names(feat_names)
        if issparse(F):
            return hstack((X, F))  # sparse
        else:
            return np.hstack((X, F))  # dense

    def _print(self, msg):
        if self.verbose:
            print(msg)

    def _register_feature_names(self, feat_names):
        """ keeps track of the feature names (for debugging and analysis) """
        if feat_names is None:
            return
        if self.feature_names is None:
            self.feature_names = []
        self.feature_names.extend(feat_names)
    def _transform(self, documents, y=None, fit=False):
        # sequential counterpart of _transform_parallel
        # initialize the document-by-feature vector
        X = np.empty((len(documents), 0))

        # dense feature extraction functions
        if self.function_words_freq:
            F, f_names = _features_function_words_freq(documents, self.function_words_freq)
            X = self._addfeatures(X, F, f_names if fit else None)
            self._print(f'adding function words features: {X.shape[1]} features')

        if self.conjugations_freq:
            F, f_names = _features_conjugations_freq(documents, self.conjugations_freq)
            X = self._addfeatures(X, F, f_names if fit else None)
            self._print(f'adding conjugation features: {X.shape[1]} features')

        if self.features_Mendenhall:
            F, f_names = _features_Mendenhall(documents)
            X = self._addfeatures(X, F, f_names if fit else None)
            self._print(f'adding Mendenhall words features: {X.shape[1]} features')

        if self.features_sentenceLengths:
            F, f_names = _features_sentenceLengths(documents)
            X = self._addfeatures(X, F, f_names if fit else None)
            self._print(f'adding sentence lengths features: {X.shape[1]} features')

        # sparse feature extraction functions
        if self.wngrams:
            if fit:
                X_features, _, self.wngrams_vectorizer, _ = _features_word_ngrams(documents, ngrams=self.wngrams_range)
                index2word = {i: w for w, i in self.wngrams_vectorizer.vocabulary_.items()}
                f_names = [f'tfidf::{index2word[i]}' for i in range(len(index2word))]
            else:
                X_features, _, _, _ = _features_word_ngrams(documents, vectorizer=self.wngrams_vectorizer)
                f_names = None
            if self.feature_selection_ratio < 1.:
                if self.verbose:
                    print('feature selection')
                if fit:
                    self.wngrams_selector = _feature_selection(X_features, y, self.feature_selection_ratio)
                    X_features = self.wngrams_selector.transform(X_features)
                    f_names = [f_names[i] for i in self.wngrams_selector.get_support(indices=True)]
                else:
                    X_features = self.wngrams_selector.transform(X_features)
            X = self._addfeatures(_tocsr(X), X_features, f_names)
            self._print(f'adding tfidf words features: {X.shape[1]} features')

        if self.cngrams:
            if fit:
                X_features, _, self.cngrams_vectorizer, _ = _features_char_ngrams(
                    documents, ngrams=self.cngrams_range, preserve_punctuation=self.preserve_punctuation
                )
                index2word = {i: w for w, i in self.cngrams_vectorizer.vocabulary_.items()}
                f_names = [f'ngram::{index2word[i]}' for i in range(len(index2word))]
            else:
                X_features, _, _, _ = _features_char_ngrams(
                    documents, vectorizer=self.cngrams_vectorizer, ngrams=self.cngrams_range,
                    preserve_punctuation=self.preserve_punctuation
                )
                f_names = None
            if self.feature_selection_ratio < 1.:
                if self.verbose:
                    print('feature selection')
                if fit:
                    self.cngrams_selector = _feature_selection(X_features, y, self.feature_selection_ratio)
                    X_features = self.cngrams_selector.transform(X_features)
                    f_names = [f_names[i] for i in self.cngrams_selector.get_support(indices=True)]
                else:
                    X_features = self.cngrams_selector.transform(X_features)
            X = self._addfeatures(_tocsr(X), X_features, f_names)
            self._print(f'adding ngrams character features: {X.shape[1]} features')

        if fit:
            self.feature_names = np.asarray(self.feature_names)

        self._print(f'X shape (#documents,#features): {X.shape}')
        return X

    def _transform_parallel(self, documents, y=None, fit=False, n_jobs=-1):
        # initialize the document-by-feature vector
        X = np.empty((len(documents), 0))

        tasks = []

        # dense feature extraction functions
        if self.function_words_freq:
            tasks.append((_features_function_words_freq, (documents, self.function_words_freq)))

        if self.conjugations_freq:
            tasks.append((_features_conjugations_freq, (documents, self.conjugations_freq)))

        if self.features_Mendenhall:
            tasks.append((_features_Mendenhall, (documents, 23)))

        if self.features_sentenceLengths:
            tasks.append((_features_sentenceLengths, (documents, 3, 70)))

        self._print('extracting dense features in parallel')
        outs = Parallel(n_jobs=n_jobs)(delayed(task)(*params) for task, params in tasks)
        for F, feat_names in outs:
            X = self._addfeatures(X, F, feat_names if fit else None)
        # sparse feature extraction functions
        tasks = []

        if self.wngrams:
            if not fit and self.wngrams_vectorizer is None:
                raise ValueError('transform called before fit')
            params = {
                'documents': documents,
                'vectorizer': self.wngrams_vectorizer,
                'selector': self.wngrams_selector,
                'y': y,
                'feat_sel_ratio': self.feature_selection_ratio,
                'ngrams': self.wngrams_range
            }
            tasks.append((_features_word_ngrams, params))

        if self.cngrams:
            if not fit and self.cngrams_vectorizer is None:
                raise ValueError('transform called before fit')
            params = {
                'documents': documents,
                'vectorizer': self.cngrams_vectorizer,
                'selector': self.cngrams_selector,
                'y': y,
                'feat_sel_ratio': self.feature_selection_ratio,
                'ngrams': self.cngrams_range,
                'preserve_punctuation': self.preserve_punctuation
            }
            tasks.append((_features_char_ngrams, params))

        self._print('extracting sparse features in parallel')
        outs = Parallel(n_jobs=n_jobs)(delayed(task)(**params) for task, params in tasks)
        for F, feat_names, vectorizer, selector in outs:
            X = self._addfeatures(_tocsr(X), F, feat_names if fit else None)
            if fit:
                if self.wngrams and self.wngrams_vectorizer is None:
                    self.wngrams_vectorizer, self.wngrams_selector = vectorizer, selector
                elif self.cngrams and self.cngrams_vectorizer is None:
                    self.cngrams_vectorizer, self.cngrams_selector = vectorizer, selector

        if fit:
            self.feature_names = np.asarray(self.feature_names)

        self._print(f'X shape (#documents,#features): {X.shape}')
        return X
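

if __name__ == '__main__':
    # Minimal usage sketch. The two one-document corpora below are placeholders: in a real
    # experiment `positives` would hold texts by the candidate author and `negatives` texts
    # by other authors. Running it requires NLTK's sentence/word tokenizer models ('punkt')
    # and joblib to be installed.
    positives = ['in principio creauit deus celum et terram. terra autem erat inanis et uacua.']
    negatives = ['arma uirumque cano troie qui primus ab oris italiam fato profugus uenit.']

    extractor = FeatureExtractor(
        function_words_freq='latin',
        conjugations_freq='latin',
        features_Mendenhall=True,
        features_sentenceLengths=True,
        wordngrams=False,   # set to True to add tfidf word n-grams
        charngrams=False,   # set to True to add tfidf char n-grams
        split_documents=False,
        verbose=True
    )

    X, y, groups = extractor.fit_transform(positives, negatives)
    print(f'feature matrix: {X.shape}, labels: {y}, groups: {groups}')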