import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import normalize
from scipy.sparse import hstack, csr_matrix, issparse
from nltk.corpus import stopwords
from joblib import Parallel, delayed  # sklearn.externals.joblib was removed in scikit-learn 0.23


latin_function_words = ['et', 'in', 'de', 'ad', 'non', 'ut', 'cum', 'per', 'a', 'sed', 'que', 'quia', 'ex', 'sic',
                        'si', 'etiam', 'idest', 'nam', 'unde', 'ab', 'uel', 'sicut', 'ita', 'enim', 'scilicet', 'nec',
                        'pro', 'autem', 'ibi', 'dum', 'uero', 'tamen', 'inter', 'ideo', 'propter', 'contra', 'sub',
                        'quomodo', 'ubi', 'super', 'iam', 'tam', 'hec', 'post', 'quasi', 'ergo', 'inde', 'e', 'tunc',
                        'atque', 'ac', 'sine', 'nisi', 'nunc', 'quando', 'ne', 'usque', 'siue', 'aut', 'igitur', 'circa',
                        'quidem', 'supra', 'ante', 'adhuc', 'seu', 'apud', 'olim', 'statim', 'satis', 'ob', 'quoniam',
                        'postea', 'nunquam']

latin_conjugations = ['o', 'eo', 'io', 'as', 'es', 'is', 'at', 'et', 'it', 'amus', 'emus', 'imus', 'atis', 'etis',
                      'itis', 'ant', 'ent', 'unt', 'iunt', 'or', 'eor', 'ior', 'aris', 'eris', 'iris', 'atur', 'etur',
                      'itur', 'amur', 'emur', 'imur', 'amini', 'emini', 'imini', 'antur', 'entur', 'untur', 'iuntur',
                      'abam', 'ebam', 'iebam', 'abas', 'ebas', 'iebas', 'abat', 'ebat', 'iebat', 'abamus', 'ebamus',
                      'iebamus', 'abatis', 'ebatis', 'iebatis', 'abant', 'ebant', 'iebant', 'abar', 'ebar', 'iebar',
                      'abaris', 'ebaris', 'iebaris', 'abatur', 'ebatur', 'iebatur', 'abamur', 'ebamur', 'iebamur',
                      'abamini', 'ebamini', 'iebamini', 'abantur', 'ebantur', 'iebantur', 'abo', 'ebo', 'am', 'iam',
                      'abis', 'ebis', 'ies', 'abit', 'ebit', 'iet', 'abimus', 'ebimus', 'emus', 'iemus', 'abitis',
                      'ebitis', 'ietis', 'abunt', 'ebunt', 'ient', 'abor', 'ebor', 'ar', 'iar', 'aberis', 'eberis',
                      'ieris', 'abitur', 'ebitur', 'ietur', 'abimur', 'ebimur', 'iemur', 'abimini', 'ebimini', 'iemini',
                      'abuntur', 'ebuntur', 'ientur', 'i', 'isti', 'it', 'istis', 'erunt', 'em', 'eam', 'eas',
                      'ias', 'eat', 'iat', 'eamus', 'iamus', 'eatis', 'iatis', 'eant', 'iant', 'er', 'ear', 'earis',
                      'iaris', 'eatur', 'iatur', 'eamur', 'iamur', 'eamini', 'iamini', 'eantur', 'iantur', 'rem', 'res',
                      'ret', 'remus', 'retis', 'rent', 'rer', 'reris', 'retur', 'remur', 'remini', 'rentur', 'erim',
                      'issem', 'isses', 'isset', 'issemus', 'issetis', 'issent', 'a', 'ate', 'e', 'ete', 'ite', 'are',
                      'ere', 'ire', 'ato', 'eto', 'ito', 'atote', 'etote', 'itote', 'anto', 'ento', 'unto', 'iunto',
                      'ator', 'etor', 'itor', 'aminor', 'eminor', 'iminor', 'antor', 'entor', 'untor', 'iuntor', 'ari',
                      'eri', 'iri', 'andi', 'ando', 'andum', 'andus', 'ande', 'ans', 'antis', 'anti', 'antem', 'antes',
                      'antium', 'antibus', 'antia', 'esse', 'sum', 'est', 'sumus', 'estis', 'sunt', 'eram', 'eras',
                      'erat', 'eramus', 'eratis', 'erant', 'ero', 'eris', 'erit', 'erimus', 'eritis', 'erint', 'sim',
                      'sis', 'sit', 'simus', 'sitis', 'sint', 'essem', 'esses', 'esset', 'essemus', 'essetis', 'essent',
                      'fui', 'fuisti', 'fuit', 'fuimus', 'fuistis', 'fuerunt', 'este', 'esto', 'estote', 'sunto']


def get_function_words(lang):
    if lang == 'latin':
        return latin_function_words
    elif lang in ['english', 'spanish']:
        return stopwords.words(lang)
    else:
        raise ValueError('{} not in scope!'.format(lang))


def get_conjugations(lang):
    if lang == 'latin':
        return latin_conjugations
    else:
        raise ValueError('conjugations for languages other than Latin are not yet supported')


# ------------------------------------------------------------------------
# split policies
# ------------------------------------------------------------------------
def split_by_endline(text):
    return [t.strip() for t in text.split('\n') if t.strip()]


def split_by_sentences(text):
    sentences = [t.strip() for t in nltk.tokenize.sent_tokenize(text) if t.strip()]
    # merge sentences with fewer than 8 alphabetic tokens into the next one (or
    # into the previous one, if the short sentence is the last); a while-loop is
    # used because popping from the list while iterating with enumerate would
    # skip elements
    i = 0
    while i < len(sentences):
        unmod_tokens = nltk.tokenize.word_tokenize(sentences[i])
        mod_tokens = [token for token in unmod_tokens if any(char.isalpha() for char in token)]
        if len(mod_tokens) >= 8:
            i += 1
        elif i < len(sentences) - 1:
            sentences[i + 1] = sentences[i] + ' ' + sentences[i + 1]
            sentences.pop(i)
        elif i > 0:
            sentences[i - 1] = sentences[i - 1] + ' ' + sentences[i]
            sentences.pop(i)
        else:
            break  # a single short sentence is kept as-is
    return sentences


def windows(text_fragments, window_size):
    new_fragments = []
    nbatches = len(text_fragments) // window_size
    if len(text_fragments) % window_size > 0:
        nbatches += 1
    for i in range(nbatches):
        offset = i * window_size
        new_fragments.append(' '.join(text_fragments[offset:offset + window_size]))
    return new_fragments
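
# Usage sketch: consecutive fragments are joined into non-overlapping windows,
# with a final shorter window keeping any leftover fragments:
#   >>> windows(['a', 'b', 'c'], window_size=2)
#   ['a b', 'c']

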
def splitter(documents, authors=None, split_policy=split_by_sentences, window_size=1):
    fragments = []
    authors_fragments = []
    groups = []
    for i, text in enumerate(documents):
        text_fragments = split_policy(text)
        text_fragments = windows(text_fragments, window_size=window_size)
        fragments.extend(text_fragments)
        groups.extend([i] * len(text_fragments))
        if authors is not None:
            authors_fragments.extend([authors[i]] * len(text_fragments))

    if authors is not None:
        return fragments, authors_fragments, groups

    return fragments, groups
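
# Usage sketch: with the endline policy, each non-empty line becomes a fragment,
# and `groups` maps every fragment back to its source document:
#   >>> frags, groups = splitter(['line one\nline two', 'line three'], split_policy=split_by_endline)
#   >>> frags
#   ['line one', 'line two', 'line three']
#   >>> groups
#   [0, 0, 1]

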
def tokenize(text):
    unmod_tokens = nltk.word_tokenize(text)
    return [token.lower() for token in unmod_tokens if any(char.isalpha() for char in token)]
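
# Usage sketch (assumes the NLTK 'punkt' models are available): tokens are
# lowercased and purely non-alphabetic tokens are dropped:
#   >>> tokenize('Gallia est omnis divisa, in partes tres .')
#   ['gallia', 'est', 'omnis', 'divisa', 'in', 'partes', 'tres']

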
# ------------------------------------------------------------------------
# feature extraction methods
# ------------------------------------------------------------------------
def _features_function_words_freq(documents, lang):
    """
    Extract features as the relative frequency (x1000) of the function words used in the documents
    :param documents: a list where each element is the text (string) of a document
    :param lang: the language of the documents (determines the function-word list)
    :return: a tuple (F, f_names), where F is a np.array of shape (D,F) with D=len(documents) and
        F=len(function_words), and f_names is the list of feature names
    """
    features = []
    function_words = get_function_words(lang)

    for text in documents:
        mod_tokens = tokenize(text)
        freqs = nltk.FreqDist(mod_tokens)
        nwords = len(mod_tokens)
        funct_words_freq = [1000. * freqs[function_word] / nwords for function_word in function_words]
        features.append(funct_words_freq)

    f_names = [f'funcw::{f}' for f in function_words]
    F = np.array(features)
    print(f'task function words (#features={F.shape[1]}) [Done]')
    return F, f_names


def _features_conjugations_freq(documents, lang):
    """
    Extract features as the relative frequency (x1000) of the conjugation endings used in the documents. The method is
    heuristic: it searches for tokens ending with any of the suffixes in the conjugation list.
    :param documents: a list where each element is the text (string) of a document
    :param lang: the language of the documents (currently only Latin is supported)
    :return: a tuple (F, f_names), where F is a np.array of shape (D,F) with D=len(documents) and
        F=len(conjugations), and f_names is the list of feature names
    """
    features = []
    conjugations = get_conjugations(lang)

    for text in documents:
        mod_tokens = tokenize(text)
        conjugation_tokens = []
        for conjugation in conjugations:
            conjugation_tokens.extend(
                [conjugation for token in mod_tokens if token.endswith(conjugation) and len(token) > len(conjugation)]
            )
        freqs = nltk.FreqDist(conjugation_tokens)
        nwords = len(mod_tokens)
        conjugation_freq = [1000. * freqs[conjugation] / nwords for conjugation in conjugations]
        features.append(conjugation_freq)

    f_names = [f'conj::{f}' for f in conjugations]
    F = np.array(features)
    print(f'task conjugation features (#features={F.shape[1]}) [Done]')
    return F, f_names


def _features_Mendenhall(documents, upto=23):
    """
    Extract features as the relative frequency (x1000) of the word lengths used in the documents,
    following the idea behind Mendenhall's Characteristic Curve of Composition
    :param documents: a list where each element is the text (string) of a document
    :param upto: maximum word length considered (exclusive)
    :return: a tuple (F, f_names), where F is a np.array of shape (D,F) with D=len(documents) and
        F the number of lengths considered, and f_names is the list of feature names
    """
    features = []
    for text in documents:
        mod_tokens = tokenize(text)
        nwords = len(mod_tokens)
        tokens_len = [len(token) for token in mod_tokens]
        tokens_count = []
        for i in range(1, upto):
            # cumulative count: number of tokens of length >= i, scaled by 1000/nwords
            tokens_count.append(1000. * (sum(j >= i for j in tokens_len)) / nwords)
        features.append(tokens_count)

    f_names = [f'mendenhall::{c}' for c in range(1, upto)]
    F = np.array(features)
    print(f'task Mendenhall features (#features={F.shape[1]}) [Done]')
    return F, f_names
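
# Worked example (sketch): for tokens ['a', 'ab', 'abc'] the lengths are
# [1, 2, 3], so the cumulative counts for i = 1, 2, 3 are 3, 2, 1 (and 0 for
# every longer length), each scaled by 1000/3.

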
def _features_sentenceLengths(documents, downto=3, upto=70):
    """
    Extract features as the relative frequency (x1000) of sentence lengths, i.e., the number of words per sentence.
    :param documents: a list where each element is the text (string) of a document
    :param downto: minimal length considered
    :param upto: maximum length considered (exclusive)
    :return: a tuple (F, f_names), where F is a np.array of shape (D,F) with D=len(documents) and
        F the number of lengths considered, and f_names is the list of feature names
    """
    features = []
    for text in documents:
        sentences = [t.strip() for t in nltk.tokenize.sent_tokenize(text) if t.strip()]
        nsent = len(sentences)
        sent_len = []
        sent_count = []
        for sentence in sentences:
            mod_tokens = tokenize(sentence)
            sent_len.append(len(mod_tokens))
        for i in range(downto, upto):
            # cumulative count: number of sentences with at least i words, scaled by 1000/nsent
            sent_count.append(1000. * (sum(j >= i for j in sent_len)) / nsent)
        features.append(sent_count)

    f_names = [f'sentlength::{c}' for c in range(downto, upto)]
    F = np.array(features)
    print(f'task sentence lengths (#features={F.shape[1]}) [Done]')
    return F, f_names


def _features_word_ngrams(documents, vectorizer=None, selector=None, y=None, feat_sel_ratio=1., min_df=1, ngrams=(1, 1)):
    """
    Extract features as a tfidf matrix computed from the documents
    :param documents: a list where each element is the text (string) of a document
    :param vectorizer: the TfidfVectorizer to use if already fit; if None, a new one will be instantiated and fit
    :param selector: the feature selector to use if already fit; if None and feat_sel_ratio < 1, a new one is fit
    :param y: the class labels (needed only to fit the feature selector)
    :param feat_sel_ratio: the proportion of features to retain (no selection if 1.)
    :param min_df: minimum number of documents an ngram must appear in to be retained
    :param ngrams: a tuple (min,max) indicating the range of lengths for the ngrams
    :return: a tuple (features, f_names, vectorizer, selector), where features is a (sparse) document-by-feature
        matrix, f_names is the list of feature names, and vectorizer and selector are the fitted transformers
    """
    if vectorizer is None:
        vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=min_df, ngram_range=ngrams)
        vectorizer.fit(documents)

    features = vectorizer.transform(documents)
    index2word = {i: w for w, i in vectorizer.vocabulary_.items()}
    f_names = [f'tfidf::{index2word[i]}' for i in range(len(index2word))]

    if feat_sel_ratio < 1.:
        if selector is None:
            selector = _feature_selection(features, y, feat_sel_ratio)

        features = selector.transform(features)
        f_names = [f_names[i] for i in selector.get_support(indices=True)]

    print(f'task ngrams and feature selection (#features={features.shape[1]}) [Done]')
    return features, f_names, vectorizer, selector
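
# Usage sketch (hypothetical train_docs/test_docs): fit on training documents,
# then reuse the returned vectorizer and selector to transform test documents
# consistently:
#   Xtr, names, vec, sel = _features_word_ngrams(train_docs, y=y_train, feat_sel_ratio=.5)
#   Xte, _, _, _ = _features_word_ngrams(test_docs, vectorizer=vec, selector=sel, feat_sel_ratio=.5)

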
def _features_char_ngrams(documents, vectorizer=None, selector=None, y=None, feat_sel_ratio=1., min_df=10, preserve_punctuation=True, ngrams=[4, 5]):
    """
    Extract char-ngrams.
    This implementation is generic, following Sapkota et al. (ref [39] in the PAN 2015 overview), i.e., containing
    punctuation marks. However, this does not apply to Latin texts, in which punctuation marks are filtered out. More
    recently, it was shown that character n-grams corresponding to word affixes and including punctuation marks are the
    most significant features in cross-topic authorship attribution [57].
    :param documents: a list where each element is the text (string) of a document
    :param vectorizer: the TfidfVectorizer to use if already fit; if None, a new one will be instantiated and fit
    :param selector: the feature selector to use if already fit; if None and feat_sel_ratio < 1, a new one is fit
    :param y: the class labels (needed only to fit the feature selector)
    :param feat_sel_ratio: the proportion of features to retain (no selection if 1.)
    :param min_df: minimum number of occurrences needed for the ngram to be retained
    :param preserve_punctuation: whether or not to preserve punctuation marks
    :param ngrams: the lengths (n) for which n-gram frequencies will be computed
    :return: see _features_word_ngrams
    """
    doc_ngrams = ngrams_extractor(documents, ngrams, preserve_punctuation)
    return _features_word_ngrams(
        doc_ngrams,
        vectorizer=vectorizer,
        selector=selector, y=y, feat_sel_ratio=feat_sel_ratio,
        min_df=min_df
    )


def ngrams_extractor(documents, ns=[4, 5], preserve_punctuation=True):
    if not isinstance(ns, list):
        ns = [ns]
    ns = sorted(np.unique(ns).tolist())

    list_ngrams = []
    for doc in documents:
        if not preserve_punctuation:
            doc = ' '.join(tokenize(doc))
        doc_ngrams = []
        for ni in ns:
            doc_ngrams.extend([doc[i:i + ni].replace(' ', '_') for i in range(len(doc) - ni + 1)])

        list_ngrams.append(' '.join(doc_ngrams))

    return list_ngrams
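
# Usage sketch: character 2-grams of 'cat sat', with spaces rewritten as '_':
#   >>> ngrams_extractor(['cat sat'], ns=[2])
#   ['ca at t_ _s sa at']

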
def _feature_selection(X, y, tfidf_feat_selection_ratio):
    """
    Filter-style feature selection based on Chi-squared as the term selection reduction function
    :param X: a document by (sparse) features matrix
    :param y: the supervised ndarray containing the class labels
    :param tfidf_feat_selection_ratio: the proportion of features to be retained
    :return: the fitted feature selector (call .transform(X) to obtain the reduced matrix)
    """
    nF = X.shape[1]
    num_feats = int(tfidf_feat_selection_ratio * nF)
    feature_selector = SelectKBest(chi2, k=num_feats)
    return feature_selector.fit(X, y)


def _tocsr(X):
    """ Converts a dense matrix into a csr_matrix (no-op if X is already sparse) """
    return X if issparse(X) else csr_matrix(X)


class FeatureExtractor:
    """
    A feature extractor for authorship analysis applications, implemented as a transformer
    """
    def __init__(self,
                 function_words_freq=None,
                 conjugations_freq=None,
                 features_Mendenhall=True,
                 features_sentenceLengths=True,
                 wordngrams=False,
                 feature_selection_ratio=1.,
                 n_wordngrams=(1, 1),
                 charngrams=False,
                 n_charngrams=[4, 5],
                 preserve_punctuation=True,
                 split_documents=False,
                 split_policy=split_by_endline,
                 normalize_features=True,
                 window_size=5,
                 verbose=True):
        """
        Applies stylistic feature extraction. Features include:
        :param function_words_freq: the language for which to add the frequency of function words as features
        :param conjugations_freq: the language for which to add the frequency of regular conjugations as features
        :param features_Mendenhall: add the frequencies of the word lengths as features
        :param features_sentenceLengths: add the frequencies of the sentence lengths as features
        :param wordngrams: add the word-ngram tfidf values as features
        :param feature_selection_ratio: if less than 1, indicates the ratio of most important features (according
            to the chi-squared test) to be selected
        :param n_wordngrams: a tuple (min,max) indicating the range of lengths for word n-grams
        :param charngrams: add the char-ngram tfidf values as features
        :param n_charngrams: a list with the lengths (n) of the char n-grams to extract
        :param preserve_punctuation: whether or not to preserve punctuation marks (should be deactivated for
            medieval Latin texts)
        :param split_documents: whether to split texts into smaller documents or not (currently, the policy is to
            split by '\\n'). The fragments resulting from the split are added to the pool of documents (i.e., they
            do not replace the full documents, which are anyway retained).
        :param split_policy: a callable that implements the split to be applied (ignored if split_documents=False)
        :param normalize_features: whether or not to L2-normalize each block of dense features row-wise
        :param window_size: the number of fragments to group together in the sliding-window policy
        :param verbose: whether or not to show information via stdout
        """
        self.function_words_freq = function_words_freq
        self.conjugations_freq = conjugations_freq
        self.features_Mendenhall = features_Mendenhall
        self.features_sentenceLengths = features_sentenceLengths
        self.wngrams = wordngrams
        self.feature_selection_ratio = feature_selection_ratio
        self.wngrams_range = n_wordngrams
        self.cngrams = charngrams
        self.cngrams_range = n_charngrams
        self.preserve_punctuation = preserve_punctuation
        self.split_documents = split_documents
        self.split_policy = split_policy
        self.normalize_features = normalize_features
        self.window_size = window_size
        self.verbose = verbose
        self.feature_names = None
        self.wngrams_vectorizer = self.wngrams_selector = None
        self.cngrams_vectorizer = self.cngrams_selector = None

    def fit_transform(self, positives, negatives):
        documents = positives + negatives
        authors = [1] * len(positives) + [0] * len(negatives)
        n_original_docs = len(documents)
        groups = list(range(n_original_docs))

        if self.split_documents:
            doc_fragments, authors_fragments, groups_fragments = splitter(
                documents, authors, split_policy=self.split_policy, window_size=self.window_size
            )
            documents.extend(doc_fragments)
            authors.extend(authors_fragments)
            groups.extend(groups_fragments)
            self._print(f'splitting documents: {len(doc_fragments)} segments + '
                        f'{n_original_docs} documents = '
                        f'{len(documents)} total')

        # represent the target vector
        y = np.array(authors)
        groups = np.array(groups)

        X = self._transform_parallel(documents, y, fit=True)

        if self.verbose:
            print(
                f'load_documents: function_words_freq={self.function_words_freq} '
                f'features_Mendenhall={self.features_Mendenhall} tfidf={self.wngrams} '
                f'split_documents={self.split_documents}, split_policy={self.split_policy.__name__}'
            )
            print(f'number of training (full) documents: {n_original_docs}')
            print(f'y prevalence: {y.sum()}/{len(y)} {y.mean() * 100:.2f}%')
            print()

        return X, y, groups

    def transform(self, test, return_fragments=False, window_size=-1, avoid_splitting=False):
        if isinstance(test, str):
            test = [test]
        if window_size == -1:
            window_size = self.window_size

        if self.split_documents and not avoid_splitting:
            tests, _ = splitter(test, split_policy=self.split_policy, window_size=window_size)
            test.extend(tests)

        old_verbose = self.verbose
        self.verbose = False
        TEST = self._transform_parallel(test, fit=False)
        self.verbose = old_verbose

        if return_fragments:
            # note: test[1:] returns only the generated fragments (this assumes
            # a single input document, occupying position 0)
            return TEST, test[1:]
        else:
            return TEST

    def _addfeatures(self, X, F, feat_names=None):
        if self.normalize_features:
            F = normalize(F, axis=1, copy=False)  # assign: normalize is not guaranteed to operate in place
        self._register_feature_names(feat_names)

        if issparse(F):
            return hstack((X, F))  # sparse
        else:
            return np.hstack((X, F))  # dense

    def _print(self, msg):
        if self.verbose:
            print(msg)

    def _register_feature_names(self, feat_names):
        """ keeps track of the feature names (for debugging and analysis) """
        if feat_names is None:
            return
        if self.feature_names is None:
            self.feature_names = []
        self.feature_names.extend(feat_names)

    def _transform(self, documents, y=None, fit=False):
        # initialize the document-by-feature vector
        X = np.empty((len(documents), 0))

        # dense feature extraction functions
        if self.function_words_freq:
            F, f_names = _features_function_words_freq(documents, self.function_words_freq)
            X = self._addfeatures(X, F, f_names if fit else None)
            self._print(f'adding function words features: {X.shape[1]} features')

        if self.conjugations_freq:
            F, f_names = _features_conjugations_freq(documents, self.conjugations_freq)
            X = self._addfeatures(X, F, f_names if fit else None)
            self._print(f'adding conjugation features: {X.shape[1]} features')

        if self.features_Mendenhall:
            F, f_names = _features_Mendenhall(documents)
            X = self._addfeatures(X, F, f_names if fit else None)
            self._print(f'adding Mendenhall words features: {X.shape[1]} features')

        if self.features_sentenceLengths:
            F, f_names = _features_sentenceLengths(documents)
            X = self._addfeatures(X, F, f_names if fit else None)
            self._print(f'adding sentence lengths features: {X.shape[1]} features')

        # sparse feature extraction functions
        if self.wngrams:
            if fit:
                # _features_word_ngrams returns (features, f_names, vectorizer, selector)
                X_features, f_names, self.wngrams_vectorizer, _ = _features_word_ngrams(
                    documents, ngrams=self.wngrams_range
                )
            else:
                X_features, _, _, _ = _features_word_ngrams(documents, self.wngrams_vectorizer)
                f_names = None

            if self.feature_selection_ratio < 1.:
                if self.verbose: print('feature selection')
                if fit:
                    self.feat_sel_tfidf = _feature_selection(X_features, y, self.feature_selection_ratio)
                    X_features = self.feat_sel_tfidf.transform(X_features)
                    f_names = [f_names[i] for i in self.feat_sel_tfidf.get_support(indices=True)]
                else:
                    X_features = self.feat_sel_tfidf.transform(X_features)

            X = self._addfeatures(_tocsr(X), X_features, f_names)
            self._print(f'adding tfidf words features: {X.shape[1]} features')

        if self.cngrams:
            if fit:
                X_features, _, self.cngrams_vectorizer, _ = _features_char_ngrams(
                    documents, ngrams=self.cngrams_range, preserve_punctuation=self.preserve_punctuation
                )
                index2word = {i: w for w, i in self.cngrams_vectorizer.vocabulary_.items()}
                f_names = [f'ngram::{index2word[i]}' for i in range(len(index2word))]
            else:
                X_features, _, _, _ = _features_char_ngrams(
                    documents, vectorizer=self.cngrams_vectorizer,
                    ngrams=self.cngrams_range, preserve_punctuation=self.preserve_punctuation
                )
                f_names = None

            if self.feature_selection_ratio < 1.:
                if self.verbose: print('feature selection')
                if fit:
                    self.cngrams_selector = _feature_selection(X_features, y, self.feature_selection_ratio)
                    X_features = self.cngrams_selector.transform(X_features)
                    f_names = [f_names[i] for i in self.cngrams_selector.get_support(indices=True)]
                else:
                    X_features = self.cngrams_selector.transform(X_features)

            X = self._addfeatures(_tocsr(X), X_features, f_names)
            self._print(f'adding ngrams character features: {X.shape[1]} features')

        if fit:
            self.feature_names = np.asarray(self.feature_names)

        self._print(f'X shape (#documents,#features): {X.shape}')

        return X

    def _transform_parallel(self, documents, y=None, fit=False, n_jobs=-1):
        # initialize the document-by-feature vector
        X = np.empty((len(documents), 0))

        tasks = []
        # dense feature extraction functions
        if self.function_words_freq:
            tasks.append((_features_function_words_freq, (documents, self.function_words_freq)))

        if self.conjugations_freq:
            tasks.append((_features_conjugations_freq, (documents, self.conjugations_freq)))

        if self.features_Mendenhall:
            tasks.append((_features_Mendenhall, (documents, 23)))

        if self.features_sentenceLengths:
            tasks.append((_features_sentenceLengths, (documents, 3, 70)))

        self._print('extracting dense features in parallel')
        outs = Parallel(n_jobs=n_jobs)(delayed(task)(*params) for task, params in tasks)
        for F, feat_names in outs:
            X = self._addfeatures(X, F, feat_names if fit else None)

        # sparse feature extraction functions
        tasks = []
        if self.wngrams:
            if not fit and self.wngrams_vectorizer is None:
                raise ValueError('transform called before fit')

            params = {
                'documents': documents,
                'vectorizer': self.wngrams_vectorizer,
                'selector': self.wngrams_selector,
                'y': y,
                'feat_sel_ratio': self.feature_selection_ratio,
                'ngrams': self.wngrams_range
            }
            tasks.append((_features_word_ngrams, params))

        if self.cngrams:
            if not fit and self.cngrams_vectorizer is None:
                raise ValueError('transform called before fit')

            params = {
                'documents': documents,
                'vectorizer': self.cngrams_vectorizer,
                'selector': self.cngrams_selector,
                'y': y,
                'feat_sel_ratio': self.feature_selection_ratio,
                'ngrams': self.cngrams_range,
                'preserve_punctuation': self.preserve_punctuation
            }
            tasks.append((_features_char_ngrams, params))

        self._print('extracting sparse features in parallel')
        outs = Parallel(n_jobs=n_jobs)(delayed(task)(**params) for task, params in tasks)
        for F, feat_names, vectorizer, selector in outs:
            X = self._addfeatures(_tocsr(X), F, feat_names if fit else None)
            if fit:
                # the word-ngram task (if any) comes first in outs, followed by the char-ngram task
                if self.wngrams and self.wngrams_vectorizer is None:
                    self.wngrams_vectorizer, self.wngrams_selector = vectorizer, selector
                elif self.cngrams and self.cngrams_vectorizer is None:
                    self.cngrams_vectorizer, self.cngrams_selector = vectorizer, selector

        if fit:
            self.feature_names = np.asarray(self.feature_names)

        self._print(f'X shape (#documents,#features): {X.shape}')

        return X
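

# Minimal usage sketch with hypothetical toy data (a real application would load
# full document collections); assumes the NLTK 'punkt' models are available
# (nltk.download('punkt')):
if __name__ == '__main__':
    positives = ['Gallia est omnis divisa in partes tres, quarum unam incolunt Belgae.',
                 'Horum omnium fortissimi sunt Belgae, propterea quod a cultu absunt.']
    negatives = ['Arma virumque cano, Troiae qui primus ab oris Italiam venit.']

    extractor = FeatureExtractor(function_words_freq='latin',
                                 features_Mendenhall=True,
                                 features_sentenceLengths=True)
    X, y, groups = extractor.fit_transform(positives, negatives)
    print(f'training matrix: {X.shape}, labels: {y.tolist()}')

    X_test = extractor.transform('Quousque tandem abutere, Catilina, patientia nostra?')
    print(f'test matrix: {X_test.shape}')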