feature extraction fully parallelized; result log file added; cleaning
This commit is contained in:
parent a3893c77fe
commit 843cfbe8fe
@@ -1,3 +1,4 @@
+import util._hide_sklearn_warnings
 from sklearn.linear_model import LogisticRegression
 from data.dante_loader import load_latin_corpus, list_authors
 from data.features import *

@@ -14,6 +15,7 @@ AUTHORS_CORPUS_II = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'Boncompagn

 def main():
+log = open(args.log, 'wt')
 discarded = 0
 f1_scores = []
 counters = []

@@ -30,6 +32,7 @@ def main():
 files = np.asarray(pos_files + neg_files)
 if len(positive) < 2:
 discarded += 1
+print(f'discarding analysis for {author} which has only {len(positive)} documents')
 continue

 n_full_docs = len(positive) + len(negative)

@@ -53,13 +56,14 @@ def main():
 Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)

 print('Fitting the Verificator')
-av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
-av.fit(Xtr, ytr, groups)
+av = AuthorshipVerificator(nfolds=10)
+av.fit(Xtr, ytr)

 if args.unknown:
 print(f'Checking for the hypothesis that {author} was the author of {args.unknown}')
 ep, ep_fragments = feature_extractor.transform(ep_text, return_fragments=True, window_size=3)
-av.predict_proba(ep, args.unknown)
+pred, _ = av.predict_proba(ep)
+tee(f'{args.unknown}: Posterior probability for {author} is {pred:.3f}', log)

 if args.loo:
 print('Validating the Verificator (Leave-One-Out)')

@@ -68,7 +72,7 @@ def main():
 )
 f1_scores.append(f1_from_counters(tp, fp, fn, tn))
 counters.append((tp, fp, fn, tn))
-print(f'F1 for {author} = {f1_scores[-1]:.3f}')
+tee(f'F1 for {author} = {f1_scores[-1]:.3f}', log)

 if args.loo:
 print(f'Computing macro- and micro-averages (discarded {discarded}/{len(args.authors)})')

@@ -78,26 +82,35 @@ def main():
 macro_f1 = f1_scores.mean()
 micro_f1 = f1_from_counters(*counters.sum(axis=0).tolist())

-print(f'Macro-F1 = {macro_f1:.3f}')
-print(f'Micro-F1 = {micro_f1:.3f}')
+tee(f'LOO Macro-F1 = {macro_f1:.3f}', log)
+tee(f'LOO Micro-F1 = {micro_f1:.3f}', log)
 print()

+log.close()

+def tee(msg, log):
+print(msg)
+log.write(f'{msg}\n')
+log.flush()


 if __name__ == '__main__':
 import os

 # Training settings
 parser = argparse.ArgumentParser(description='Authorship verification for Epistola XIII')
-parser.add_argument('corpuspath', type=str, metavar='PATH',
+parser.add_argument('corpuspath', type=str, metavar='CORPUSPATH',
 help=f'Path to the directory containing the corpus (documents must be named '
-f'<author>_<texname>.txt')
+f'<author>_<texname>.txt)')
-parser.add_argument('positive', type=str, default="Dante",
+parser.add_argument('positive', type=str, default="Dante", metavar='AUTHOR',
 help= f'Positive author for the hypothesis (default "Dante"); set to "ALL" to check '
 f'every author')
 parser.add_argument('--loo', default=False, action='store_true',
 help='submit each binary classifier to leave-one-out validation')
 parser.add_argument('--unknown', type=str, metavar='PATH', default=None,
 help='path to the file of unknown paternity (default None)')
+parser.add_argument('--log', type=str, metavar='PATH', default='./results.txt',
+help='path to the log file where to write the results (default ./results.txt)')

 args = parser.parse_args()


@@ -110,6 +123,7 @@ if __name__ == '__main__':
 args.authors = [args.positive]

 assert args.unknown or args.loo, 'error: nor an unknown document, nor LOO have been requested. Nothing to do.'
-assert args.unknown is None or os.path.exists(args.unknown), 'unknown file does not exist'
+assert os.path.exists(args.corpuspath), f'corpus path {args.corpuspath} does not exist'
+assert args.unknown is None or os.path.exists(args.unknown), '"unknown file" does not exist'

 main()
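For reference, a minimal standalone sketch of the tee-style logging the hunks above introduce, where every result line goes both to the console and to the results file. It uses only the standard library; the message text is illustrative, and ./results.txt simply mirrors the --log default.

def tee(msg, log):
    print(msg)             # echo to the console
    log.write(f'{msg}\n')  # persist the same line to the results file
    log.flush()

if __name__ == '__main__':
    with open('./results.txt', 'wt') as log:
        tee('example result line', log)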
@@ -6,7 +6,7 @@ from sklearn.feature_selection import chi2
 from sklearn.preprocessing import normalize
 from scipy.sparse import hstack, csr_matrix, issparse
 from nltk.corpus import stopwords
-from sklearn.externals.joblib import Parallel, delayed
+from joblib import Parallel, delayed


 latin_function_words = ['et', 'in', 'de', 'ad', 'non', 'ut', 'cum', 'per', 'a', 'sed', 'que', 'quia', 'ex', 'sic',
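For reference, a minimal runnable sketch of the joblib pattern the parallel extractor below relies on: each task is a (function, kwargs-dict) pair, Parallel/delayed fans the calls out over the available cores, and each function returns a dict tagged with a 'task' key so the caller can tell the outputs apart. The toy functions word_count and char_count are illustrative stand-ins, not functions from this repository.

from joblib import Parallel, delayed

def word_count(documents):
    return {'task': 'words', 'features': [len(d.split()) for d in documents]}

def char_count(documents):
    return {'task': 'chars', 'features': [len(d) for d in documents]}

docs = ['vere dignum et iustum est', 'sicut in celo et in terra']
tasks = [(word_count, {'documents': docs}), (char_count, {'documents': docs})]
# one Parallel call runs every task; **params mirrors the kwargs-style dispatch used below
outs = Parallel(n_jobs=-1)(delayed(task)(**params) for task, params in tasks)
for out in outs:
    print(out['task'], out['features'])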
@@ -119,7 +119,7 @@ def _features_function_words_freq(documents, lang):
 """
 Extract features as the frequency (L1x1000) of the function words used in the documents
 :param documents: a list where each element is the text (string) of a document
-:return: a np.array of shape (D,F) where D is len(documents) and F is len(function_words)
+:return: a dictionary containing the resulting features, feature names, and taskname
 """
 features = []
 function_words = get_function_words(lang)

@@ -134,7 +134,7 @@ def _features_function_words_freq(documents, lang):
 f_names = [f'funcw::{f}' for f in function_words]
 F = np.array(features)
 print(f'task function words (#features={F.shape[1]}) [Done]')
-return F, f_names
+return {'features': F, 'f_names':f_names, 'task': 'functionwords'}


 def _features_conjugations_freq(documents, lang):

@@ -142,7 +142,7 @@ def _features_conjugations_freq(documents, lang):
 Extract features as the frequency (L1x1000) of the conjugations used in the documents. The method is heuristic, and
 actually searches for suffixes contained in the conjugation list.
 :param documents: a list where each element is the text (string) of a document
-:return: a np.array of shape (D,F) where D is len(documents) and F is len(conjugations)
+:return: a dictionary containing the resulting features, feature names, and taskname
 """
 features = []
 conjugations = get_conjugations(lang)

@@ -162,7 +162,7 @@ def _features_conjugations_freq(documents, lang):
 f_names = [f'conj::{f}' for f in conjugations]
 F = np.array(features)
 print(f'task conjugation features (#features={F.shape[1]}) [Done]')
-return F, f_names
+return {'features': F, 'f_names':f_names, 'task': 'conjugations'}


 def _features_Mendenhall(documents, upto=23):

@@ -170,7 +170,7 @@ def _features_Mendenhall(documents, upto=23):
 Extract features as the frequency (L1x1000) of the words' lengths used in the documents,
 following the idea behind Mendenhall's Characteristic Curve of Composition
 :param documents: a list where each element is the text (string) of a document
-:return: a np.array of shape (D,F) where D is len(documents) and F is len(range of lengths considered)
+:return: a dictionary containing the resulting features, feature names, and taskname
 """
 features = []
 for text in documents:

@@ -185,7 +185,7 @@ def _features_Mendenhall(documents, upto=23):
 f_names = [f'mendenhall::{c}' for c in range(1,upto)]
 F = np.array(features)
 print(f'task Mendenhall features (#features={F.shape[1]}) [Done]')
-return F, f_names
+return {'features': F, 'f_names':f_names, 'task': 'Mendenhall'}


 def _features_sentenceLengths(documents, downto=3, upto=70):

@@ -194,7 +194,7 @@ def _features_sentenceLengths(documents, downto=3, upto=70):
 :param documents: a list where each element is the text (string) of a document
 :param downto: minimal length considered
 :param upto: maximum length considered
-:return: a np.array of shape (D,F) where D is len(documents) and F is len(range of lengths considered)
+:return: a dictionary containing the resulting features, feature names, and taskname
 """
 features = []
 for text in documents:

@@ -212,15 +212,14 @@ def _features_sentenceLengths(documents, downto=3, upto=70):
 f_names = [f'sentlength::{c}' for c in range(downto, upto)]
 F = np.array(features)
 print(f'task sentence lengths (#features={F.shape[1]}) [Done]')
-return F, f_names
+return {'features': F, 'f_names':f_names, 'task': 'sentlength'}


 def _features_word_ngrams(documents, vectorizer=None, selector=None, y=None, feat_sel_ratio=1., min_df=1, ngrams=(1, 1)):
 """
 Extract features as tfidf matrix extracted from the documents
 :param documents: a list where each element is the text (string) of a document
-:return: a tuple M,V, where M is an np.array of shape (D,F), with D being the len(documents) and F the number of
-distinct words; and V is the TfidfVectorizer already fit
+:return: a dictionary containing the resulting features, feature names, taskname, the vectorizer and the selector
 """
 if vectorizer is None:
 vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=min_df, ngram_range=ngrams)

@@ -238,7 +237,13 @@ def _features_word_ngrams(documents, vectorizer=None, selector=None, y=None, fea
 f_names = [f_names[i] for i in selector.get_support(indices=True)]

 print(f'task ngrams and feature selection (#features={features.shape[1]}) [Done]')
-return features, f_names, vectorizer, selector
+return {
+'features': features,
+'f_names': f_names,
+'task': '_wngrams_task',
+'vectorizer': vectorizer,
+'selector': selector
+}


 def _features_char_ngrams(documents, vectorizer=None, selector=None, y=None, feat_sel_ratio=1., min_df=10, preserve_punctuation=True, ngrams=[4, 5]):

@@ -253,24 +258,27 @@ def _features_char_ngrams(documents, vectorizer=None, selector=None, y=None, fea
 :param vectorizer: the tfidf_vectorizer to use if already fit; if None, a new one will be instantiated and fit
 :param min_df: minumum number of occurrences needed for the ngram to be taken
 :param preserve_punctuation: whether or not to preserve punctuation marks
-:return: see _features_tfidf
+:return: a dictionary containing the resulting features, feature names, taskname, the vectorizer and the selector
 """
 doc_ngrams = ngrams_extractor(documents, ngrams, preserve_punctuation)
-return _features_word_ngrams(
+outs = _features_word_ngrams(
 doc_ngrams,
 vectorizer=vectorizer,
 selector=selector, y=y, feat_sel_ratio=feat_sel_ratio,
 min_df=min_df
 )
+outs['task'] = '_cngrams_task'
+return outs


 def ngrams_extractor(documents, ns=[4, 5], preserve_punctuation=True):
-if not isinstance(ns, list): ns=[ns]
+if not isinstance(ns, list):
+ns=[ns]
 ns = sorted(np.unique(ns).tolist())

 list_ngrams = []
 for doc in documents:
-if preserve_punctuation == False:
+if not preserve_punctuation:
 doc = ' '.join(tokenize(doc))
 doc_ngrams = []
 for ni in ns:
@@ -287,7 +295,7 @@ def _feature_selection(X, y, tfidf_feat_selection_ratio):
 :param X: a document by (sparse) features matrix
 :param y: the supervised ndarray containing the class labels
 :param tfidf_feat_selection_ratio: a proportion of features to be taken
-:return: the reduced matrix and the feature selector fit
+:return: the feature selector fit
 """
 nF = X.shape[1]
 num_feats = int(tfidf_feat_selection_ratio * nF)
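A small standalone sketch of chi-square feature selection that returns only the fitted selector, as the updated docstring above describes; SelectKBest and the random toy data are illustrative assumptions, not necessarily the exact implementation used in the repository.

import numpy as np
from sklearn.feature_selection import SelectKBest, chi2

def feature_selection_sketch(X, y, feat_selection_ratio):
    num_feats = max(1, int(feat_selection_ratio * X.shape[1]))
    selector = SelectKBest(chi2, k=num_feats).fit(X, y)
    return selector  # the caller applies selector.transform(X) itself

X = np.random.rand(20, 50)       # chi2 requires non-negative feature values
y = np.array([0, 1] * 10)
selector = feature_selection_sketch(X, y, feat_selection_ratio=0.1)
print(selector.transform(X).shape)  # (20, 5)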
@@ -321,7 +329,7 @@ class FeatureExtractor:
 window_size=5,
 verbose=True):
 """
-Applies stlystic feature extraction. Features include:
+Applies stilystic feature extraction. Features include:
 :param function_words_freq: add the frequency of function words as features
 :param conjugations_freq: add the frequency of regular conjugations as features
 :param features_Mendenhall: add the frequencies of the words' lengths as features

@@ -437,113 +445,31 @@ class FeatureExtractor:
 self.feature_names = []
 self.feature_names.extend(feat_names)

-def _transform(self, documents, y=None, fit=False):
-# initialize the document-by-feature vector
-X = np.empty((len(documents), 0))

-# dense feature extraction functions
-if self.function_words_freq:
-F, f_names = _features_function_words_freq(documents, self.function_words_freq)
-X = self._addfeatures(X, F, f_names if fit else None)
-self._print(f'adding function words features: {X.shape[1]} features')

-if self.conjugations_freq:
-F, f_names = _features_conjugations_freq(documents, self.conjugations_freq)
-X = self._addfeatures(X, F, f_names if fit else None)
-self._print(f'adding conjugation features: {X.shape[1]} features')

-if self.features_Mendenhall:
-F, f_names = _features_Mendenhall(documents)
-X = self._addfeatures(X, F, f_names if fit else None)
-self._print(f'adding Mendenhall words features: {X.shape[1]} features')

-if self.features_sentenceLengths:
-F, f_names = _features_sentenceLengths(documents)
-X = self._addfeatures(X, F, f_names if fit else None)
-self._print(f'adding sentence lengths features: {X.shape[1]} features')

-# sparse feature extraction functions
-if self.wngrams:
-if fit:
-X_features, self.wngrams_vectorizer = _features_word_ngrams(documents, ngrams=self.wngrams_range)
-index2word = {i: w for w, i in self.wngrams_vectorizer.vocabulary_.items()}
-f_names = [f'tfidf::{index2word[i]}' for i in range(len(index2word))]
-else:
-X_features, _ = _features_word_ngrams(documents, self.wngrams_vectorizer)
-f_names = None

-if self.feature_selection_ratio < 1.:
-if self.verbose: print('feature selection')
-if fit:
-X_features, self.feat_sel_tfidf = _feature_selection(X_features, y, self.feature_selection_ratio)
-f_names = [f_names[i] for i in self.feat_sel_tfidf.get_support(indices=True)]
-else:
-X_features = self.feat_sel_tfidf.transform(X_features)
-X = self._addfeatures(_tocsr(X), X_features, f_names)
-self._print(f'adding tfidf words features: {X.shape[1]} features')

-if self.cngrams:
-if fit:
-X_features, self.cngrams_vectorizer = _features_char_ngrams(
-documents, self.cngrams_range, preserve_punctuation=self.preserve_punctuation
-)
-index2word = {i: w for w, i in self.cngrams_vectorizer.vocabulary_.items()}
-f_names = [f'ngram::{index2word[i]}' for i in range(len(index2word))]
-else:
-X_features, _ = _features_char_ngrams(
-documents, self.cngrams_range, vectorizer=self.cngrams_vectorizer,
-preserve_punctuation=self.preserve_punctuation
-)
-f_names = None

-if self.feature_selection_ratio < 1.:
-if self.verbose: print('feature selection')
-if fit:
-X_features, self.cngrams_selector = _feature_selection(X_features, y, self.feature_selection_ratio)
-f_names = [f_names[i] for i in self.cngrams_selector.get_support(indices=True)]
-else:
-X_features = self.cngrams_selector.transform(X_features)

-X = self._addfeatures(_tocsr(X), X_features, f_names)
-self._print(f'adding ngrams character features: {X.shape[1]} features')

-if fit:
-self.feature_names = np.asarray(self.feature_names)

-self._print(f'X shape (#documents,#features): {X.shape}')

-return X

 def _transform_parallel(self, documents, y=None, fit=False, n_jobs=-1):
 # initialize the document-by-feature vector
 X = np.empty((len(documents), 0))

 tasks = []

 # dense feature extraction functions
 if self.function_words_freq:
-tasks.append((_features_function_words_freq, (documents, self.function_words_freq)))
+tasks.append((_features_function_words_freq, {'documents': documents, 'lang': self.function_words_freq}))

 if self.conjugations_freq:
-tasks.append((_features_conjugations_freq, (documents, self.conjugations_freq)))
+tasks.append((_features_conjugations_freq, {'documents': documents, 'lang': self.conjugations_freq}))

 if self.features_Mendenhall:
-tasks.append((_features_Mendenhall, (documents, 23)))
+tasks.append((_features_Mendenhall, {'documents': documents, 'upto': 23}))

 if self.features_sentenceLengths:
-tasks.append((_features_sentenceLengths, (documents, 3, 70)))
+tasks.append((_features_sentenceLengths, {'documents': documents, 'downto': 3, 'upto': 70}))

-self._print('extracting dense features in parallel')
-outs = Parallel(n_jobs=n_jobs)(delayed(task)(*params) for task, params in tasks)
-for F, feat_names in outs:
-X = self._addfeatures(X, F, feat_names if fit else None)

 # sparse feature extraction functions
-tasks = []
 if self.wngrams:
 if not fit and self.wngrams_vectorizer is None:
 raise ValueError('transform called before fit')

-params={
+params = {
 'documents': documents,
 'vectorizer': self.wngrams_vectorizer,
 'selector': self.wngrams_selector,

@@ -557,7 +483,7 @@ class FeatureExtractor:
 if not fit and self.cngrams_vectorizer is None:
 raise ValueError('transform called before fit')

-params={
+params = {
 'documents': documents,
 'vectorizer': self.cngrams_vectorizer,
 'selector': self.cngrams_selector,

@@ -568,15 +494,22 @@ class FeatureExtractor:
 }
 tasks.append((_features_char_ngrams, params))

-self._print('extracting sparse features in parallel')
+self._print('extracting features in parallel')
 outs = Parallel(n_jobs=n_jobs)(delayed(task)(**params) for task, params in tasks)
-for F, feat_names, vectorizer, selector in outs:
-X = self._addfeatures(_tocsr(X), F, feat_names if fit else None)
-if fit:
-if self.wngrams and self.wngrams_vectorizer is None:
-self.wngrams_vectorizer, self.wngrams_selector = vectorizer, selector
-elif self.cngrams and self.cngrams_vectorizer is None:
-self.cngrams_vectorizer, self.cngrams_selector = vectorizer, selector
+# gather the tasks' outputs
+for out in outs:
+taskname = out['task']
+if taskname not in {'_wngrams_task', '_cngrams_task'}:
+X = self._addfeatures(X, out['features'], out['f_names'] if fit else None)
+else:
+X = self._addfeatures(_tocsr(X), out['features'], out['f_names'] if fit else None)
+if fit:
+vectorizer, selector = out['vectorizer'], out['selector']
+if taskname == '_wngrams_task' and self.wngrams_vectorizer is None:
+self.wngrams_vectorizer, self.wngrams_selector = vectorizer, selector
+elif taskname == '_cngrams_task' and self.cngrams_vectorizer is None:
+self.cngrams_vectorizer, self.cngrams_selector = vectorizer, selector

 if fit:
 self.feature_names = np.asarray(self.feature_names)
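For context, a minimal sketch of the dense-then-sparse column stacking that the gather loop above performs through _addfeatures and _tocsr; the helper name addfeatures_sketch and the toy feature blocks are illustrative, while hstack, csr_matrix and issparse are the scipy.sparse imports already used in this file.

import numpy as np
from scipy.sparse import csr_matrix, hstack, issparse

def addfeatures_sketch(X, F):
    # scipy's hstack accepts a mix of dense and sparse blocks and returns a sparse matrix
    if issparse(X) or issparse(F):
        return hstack([X, F]).tocsr()
    return np.hstack([X, F])

X = np.empty((3, 0))                              # document-by-feature matrix, initially empty
X = addfeatures_sketch(X, np.ones((3, 2)))        # dense block (e.g. function-word frequencies)
X = addfeatures_sketch(X, csr_matrix(np.eye(3)))  # sparse block (e.g. tf-idf n-grams)
print(X.shape)                                    # (3, 5)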
src/model.py (37 changed lines)
@@ -1,6 +1,6 @@
 from sklearn.metrics import make_scorer
 from sklearn.model_selection import GridSearchCV, LeaveOneOut, LeaveOneGroupOut, cross_val_score, StratifiedKFold
-from sklearn.linear_model import LogisticRegression
+from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
 from sklearn.svm import *
 from data.features import *
 from util.evaluation import f1, get_counters

@@ -10,29 +10,21 @@ class AuthorshipVerificator:

 def __init__(self, nfolds=10,
 params={'C': np.logspace(-4, +4, 9), 'class_weight': ['balanced', None]},
-estimator=SVC,
 author_name=None):
 self.nfolds = nfolds
 self.params = params
 self.author_name = author_name if author_name else 'this author'
-if estimator is SVC:
-self.params['kernel'] = ['linear', 'rbf']
-self.probability = True
-self.classifier = estimator(probability=self.probability)
-elif estimator is LinearSVC:
-self.probability = False
-self.classifier = estimator()
-elif estimator is LogisticRegression:
-self.probability = True
-self.classifier = LogisticRegression()
+self.classifier = LogisticRegression()

-def fit(self,X,y,groups=None):
-if not isinstance(y,np.ndarray): y=np.array(y)
+def fit(self, X, y):
+y = np.asarray(y)
 positive_examples = y.sum()
 if positive_examples >= self.nfolds:
 print('optimizing {}'.format(self.classifier.__class__.__name__))
 folds = list(StratifiedKFold(n_splits=self.nfolds).split(X, y))
-self.estimator = GridSearchCV(self.classifier, param_grid=self.params, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
+self.estimator = GridSearchCV(
+self.classifier, param_grid=self.params, cv=folds, scoring=make_scorer(f1), n_jobs=-1
+)
 else:
 self.estimator = self.classifier
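For reference, a self-contained sketch of the model selection the verificator now performs when enough positive examples are available: a LogisticRegression tuned by GridSearchCV over C and class_weight on stratified folds with an F1 scorer. The toy data are random, and sklearn's f1_score stands in for the project's util.evaluation.f1.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold

X = np.random.rand(40, 5)
y = np.array([0, 1] * 20)

params = {'C': np.logspace(-4, +4, 9), 'class_weight': ['balanced', None]}
folds = list(StratifiedKFold(n_splits=10).split(X, y))
search = GridSearchCV(LogisticRegression(), param_grid=params, cv=folds,
                      scoring=make_scorer(f1_score), n_jobs=-1)
search.fit(X, y)
print(search.best_params_)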
@@ -46,7 +38,6 @@
 return self

 def leave_one_out(self, X, y, files, groups=None, test_lowest_index_only=True, counters=False):

 if groups is None:
 print('Computing LOO without groups')
 folds = list(LeaveOneOut().split(X, y))

@@ -59,8 +50,8 @@
 folds = [(train, np.min(test, keepdims=True)) for train, test in folds]

 scores = cross_val_score(self.estimator, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
-missclassified = '\n'.join(files[scores==0].tolist())
-print(scores)
+missclassified = '\n'.join(files[scores == 0].tolist())
+print('missclassified texts:')
 print(missclassified)

 if counters and test_lowest_index_only:

@@ -73,26 +64,24 @@
 else:
 return scores.mean(), scores.std()

-def predict(self, test, epistola_name=''):
+def predict(self, test):
 pred = self.estimator.predict(test)
 full_doc_prediction = pred[0]
-print('{} is from the same author: {}'.format(epistola_name, 'Yes' if full_doc_prediction == 1 else 'No'))
 if len(pred) > 1:
 fragment_predictions = pred[1:]
 print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
 return full_doc_prediction, fragment_predictions
-return full_doc_prediction, None
+return full_doc_prediction

-def predict_proba(self, test, epistola_name=''):
+def predict_proba(self, test):
 assert hasattr(self, 'predict_proba'), 'the classifier is not calibrated'
 pred = self.estimator.predict_proba(test)
 full_doc_prediction = pred[0,1]
-print(f'{epistola_name} is from {self.author_name} with Probability {full_doc_prediction:.3f}')
 if len(pred) > 1:
 fragment_predictions = pred[1:,1]
 print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
 return full_doc_prediction, fragment_predictions
-return full_doc_prediction, None
+return full_doc_prediction, []


@@ -0,0 +1,4 @@
+def warn(*args, **kwargs):
+pass
+import warnings
+warnings.warn = warn
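The new module added above (presumably util/_hide_sklearn_warnings.py, given the import added at the top of the main script) silences warnings by monkey-patching warnings.warn with a no-op at import time; importing it early, before the sklearn-using code runs, is all that is needed. A standalone illustration of the same idea, with hypothetical file names:

# _hide_warnings.py (illustrative stand-in)
import warnings

def warn(*args, **kwargs):
    pass

warnings.warn = warn

# main.py
# import _hide_warnings                  # importing the module applies the patch
# import warnings
# warnings.warn('this is now silenced')  # prints nothing
# a less invasive alternative would be warnings.filterwarnings('ignore')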