feature extraction fully parallelized; result log file added; cleaning

Alejandro Moreo Fernandez 2020-04-03 11:21:09 +02:00
parent a3893c77fe
commit 843cfbe8fe
4 changed files with 89 additions and 149 deletions
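In outline, the parallelization in the diffs below dispatches every feature-extraction task through joblib's Parallel/delayed and has each task return a dictionary keyed by 'features', 'f_names', and 'task', so heterogeneous outputs can be gathered after the parallel run. A minimal sketch of that pattern follows; it is not part of the commit, and the two toy extractor functions and the sample documents are hypothetical stand-ins for the real ones.

# Sketch of the joblib pattern adopted by this commit (toy extractors, not the repo's code)
import numpy as np
from joblib import Parallel, delayed

def _toy_lengths(documents):
    # one feature per document: its length in characters
    F = np.array([[len(d)] for d in documents], dtype=float)
    return {'features': F, 'f_names': ['length'], 'task': 'lengths'}

def _toy_uppercase(documents):
    # one feature per document: how many uppercase characters it contains
    F = np.array([[sum(c.isupper() for c in d)] for d in documents], dtype=float)
    return {'features': F, 'f_names': ['uppercase'], 'task': 'uppercase'}

docs = ['Nel mezzo del cammin di nostra vita', 'Et in Arcadia ego']
tasks = [(_toy_lengths, {'documents': docs}), (_toy_uppercase, {'documents': docs})]
outs = Parallel(n_jobs=-1)(delayed(task)(**params) for task, params in tasks)
X = np.hstack([out['features'] for out in outs])            # document-by-feature matrix
names = [name for out in outs for name in out['f_names']]   # aligned feature names

Tagging each output with a 'task' key is what lets the dense and sparse extractors share a single Parallel call in _transform_parallel below, since the caller can decide per task how to stack the result.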

View File

@@ -1,3 +1,4 @@
+import util._hide_sklearn_warnings
 from sklearn.linear_model import LogisticRegression
 from data.dante_loader import load_latin_corpus, list_authors
 from data.features import *
@@ -14,6 +15,7 @@ AUTHORS_CORPUS_II = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'Boncompagn
 def main():
+    log = open(args.log, 'wt')
     discarded = 0
     f1_scores = []
     counters = []
@@ -30,6 +32,7 @@ def main():
         files = np.asarray(pos_files + neg_files)
         if len(positive) < 2:
             discarded += 1
+            print(f'discarding analysis for {author} which has only {len(positive)} documents')
             continue
         n_full_docs = len(positive) + len(negative)
@@ -53,13 +56,14 @@ def main():
         Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
         print('Fitting the Verificator')
-        av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
+        av = AuthorshipVerificator(nfolds=10)
-        av.fit(Xtr, ytr, groups)
+        av.fit(Xtr, ytr)
         if args.unknown:
             print(f'Checking for the hypothesis that {author} was the author of {args.unknown}')
             ep, ep_fragments = feature_extractor.transform(ep_text, return_fragments=True, window_size=3)
-            av.predict_proba(ep, args.unknown)
+            pred, _ = av.predict_proba(ep)
+            tee(f'{args.unknown}: Posterior probability for {author} is {pred:.3f}', log)
         if args.loo:
             print('Validating the Verificator (Leave-One-Out)')
@@ -68,7 +72,7 @@ def main():
             )
             f1_scores.append(f1_from_counters(tp, fp, fn, tn))
             counters.append((tp, fp, fn, tn))
-            print(f'F1 for {author} = {f1_scores[-1]:.3f}')
+            tee(f'F1 for {author} = {f1_scores[-1]:.3f}', log)
     if args.loo:
         print(f'Computing macro- and micro-averages (discarded {discarded}/{len(args.authors)})')
@@ -78,26 +82,35 @@ def main():
         macro_f1 = f1_scores.mean()
         micro_f1 = f1_from_counters(*counters.sum(axis=0).tolist())
-        print(f'Macro-F1 = {macro_f1:.3f}')
+        tee(f'LOO Macro-F1 = {macro_f1:.3f}', log)
-        print(f'Micro-F1 = {micro_f1:.3f}')
+        tee(f'LOO Micro-F1 = {micro_f1:.3f}', log)
         print()
+    log.close()
+def tee(msg, log):
+    print(msg)
+    log.write(f'{msg}\n')
+    log.flush()
 if __name__ == '__main__':
     import os
     # Training settings
     parser = argparse.ArgumentParser(description='Authorship verification for Epistola XIII')
-    parser.add_argument('corpuspath', type=str, metavar='PATH',
+    parser.add_argument('corpuspath', type=str, metavar='CORPUSPATH',
                         help=f'Path to the directory containing the corpus (documents must be named '
-                             f'<author>_<texname>.txt')
+                             f'<author>_<texname>.txt)')
-    parser.add_argument('positive', type=str, default="Dante",
+    parser.add_argument('positive', type=str, default="Dante", metavar='AUTHOR',
                         help= f'Positive author for the hypothesis (default "Dante"); set to "ALL" to check '
                              f'every author')
     parser.add_argument('--loo', default=False, action='store_true',
                         help='submit each binary classifier to leave-one-out validation')
     parser.add_argument('--unknown', type=str, metavar='PATH', default=None,
                         help='path to the file of unknown paternity (default None)')
+    parser.add_argument('--log', type=str, metavar='PATH', default='./results.txt',
+                        help='path to the log file where to write the results (default ./results.txt)')
     args = parser.parse_args()
@@ -110,6 +123,7 @@ if __name__ == '__main__':
         args.authors = [args.positive]
     assert args.unknown or args.loo, 'error: nor an unknown document, nor LOO have been requested. Nothing to do.'
-    assert args.unknown is None or os.path.exists(args.unknown), 'unknown file does not exist'
+    assert os.path.exists(args.corpuspath), f'corpus path {args.corpuspath} does not exist'
+    assert args.unknown is None or os.path.exists(args.unknown), '"unknown file" does not exist'
     main()

View File

@@ -6,7 +6,7 @@ from sklearn.feature_selection import chi2
 from sklearn.preprocessing import normalize
 from scipy.sparse import hstack, csr_matrix, issparse
 from nltk.corpus import stopwords
-from sklearn.externals.joblib import Parallel, delayed
+from joblib import Parallel, delayed
 latin_function_words = ['et', 'in', 'de', 'ad', 'non', 'ut', 'cum', 'per', 'a', 'sed', 'que', 'quia', 'ex', 'sic',
@@ -119,7 +119,7 @@ def _features_function_words_freq(documents, lang):
     """
     Extract features as the frequency (L1x1000) of the function words used in the documents
     :param documents: a list where each element is the text (string) of a document
-    :return: a np.array of shape (D,F) where D is len(documents) and F is len(function_words)
+    :return: a dictionary containing the resulting features, feature names, and taskname
     """
     features = []
     function_words = get_function_words(lang)
@@ -134,7 +134,7 @@ def _features_function_words_freq(documents, lang):
     f_names = [f'funcw::{f}' for f in function_words]
     F = np.array(features)
     print(f'task function words (#features={F.shape[1]}) [Done]')
-    return F, f_names
+    return {'features': F, 'f_names':f_names, 'task': 'functionwords'}
 def _features_conjugations_freq(documents, lang):
@@ -142,7 +142,7 @@ def _features_conjugations_freq(documents, lang):
     Extract features as the frequency (L1x1000) of the conjugations used in the documents. The method is heuristic, and
     actually searches for suffixes contained in the conjugation list.
     :param documents: a list where each element is the text (string) of a document
-    :return: a np.array of shape (D,F) where D is len(documents) and F is len(conjugations)
+    :return: a dictionary containing the resulting features, feature names, and taskname
     """
     features = []
     conjugations = get_conjugations(lang)
@@ -162,7 +162,7 @@ def _features_conjugations_freq(documents, lang):
     f_names = [f'conj::{f}' for f in conjugations]
     F = np.array(features)
     print(f'task conjugation features (#features={F.shape[1]}) [Done]')
-    return F, f_names
+    return {'features': F, 'f_names':f_names, 'task': 'conjugations'}
 def _features_Mendenhall(documents, upto=23):
@@ -170,7 +170,7 @@ def _features_Mendenhall(documents, upto=23):
     Extract features as the frequency (L1x1000) of the words' lengths used in the documents,
     following the idea behind Mendenhall's Characteristic Curve of Composition
     :param documents: a list where each element is the text (string) of a document
-    :return: a np.array of shape (D,F) where D is len(documents) and F is len(range of lengths considered)
+    :return: a dictionary containing the resulting features, feature names, and taskname
     """
     features = []
     for text in documents:
@@ -185,7 +185,7 @@ def _features_Mendenhall(documents, upto=23):
     f_names = [f'mendenhall::{c}' for c in range(1,upto)]
     F = np.array(features)
     print(f'task Mendenhall features (#features={F.shape[1]}) [Done]')
-    return F, f_names
+    return {'features': F, 'f_names':f_names, 'task': 'Mendenhall'}
 def _features_sentenceLengths(documents, downto=3, upto=70):
@@ -194,7 +194,7 @@ def _features_sentenceLengths(documents, downto=3, upto=70):
     :param documents: a list where each element is the text (string) of a document
     :param downto: minimal length considered
     :param upto: maximum length considered
-    :return: a np.array of shape (D,F) where D is len(documents) and F is len(range of lengths considered)
+    :return: a dictionary containing the resulting features, feature names, and taskname
     """
     features = []
     for text in documents:
@@ -212,15 +212,14 @@ def _features_sentenceLengths(documents, downto=3, upto=70):
     f_names = [f'sentlength::{c}' for c in range(downto, upto)]
     F = np.array(features)
     print(f'task sentence lengths (#features={F.shape[1]}) [Done]')
-    return F, f_names
+    return {'features': F, 'f_names':f_names, 'task': 'sentlength'}
 def _features_word_ngrams(documents, vectorizer=None, selector=None, y=None, feat_sel_ratio=1., min_df=1, ngrams=(1, 1)):
     """
     Extract features as tfidf matrix extracted from the documents
     :param documents: a list where each element is the text (string) of a document
-    :return: a tuple M,V, where M is an np.array of shape (D,F), with D being the len(documents) and F the number of
-             distinct words; and V is the TfidfVectorizer already fit
+    :return: a dictionary containing the resulting features, feature names, taskname, the vectorizer and the selector
     """
     if vectorizer is None:
         vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=min_df, ngram_range=ngrams)
@@ -238,7 +237,13 @@ def _features_word_ngrams(documents, vectorizer=None, selector=None, y=None, fea
         f_names = [f_names[i] for i in selector.get_support(indices=True)]
     print(f'task ngrams and feature selection (#features={features.shape[1]}) [Done]')
-    return features, f_names, vectorizer, selector
+    return {
+        'features': features,
+        'f_names': f_names,
+        'task': '_wngrams_task',
+        'vectorizer': vectorizer,
+        'selector': selector
+    }
 def _features_char_ngrams(documents, vectorizer=None, selector=None, y=None, feat_sel_ratio=1., min_df=10, preserve_punctuation=True, ngrams=[4, 5]):
@@ -253,24 +258,27 @@ def _features_char_ngrams(documents, vectorizer=None, selector=None, y=None, fea
     :param vectorizer: the tfidf_vectorizer to use if already fit; if None, a new one will be instantiated and fit
     :param min_df: minumum number of occurrences needed for the ngram to be taken
     :param preserve_punctuation: whether or not to preserve punctuation marks
-    :return: see _features_tfidf
+    :return: a dictionary containing the resulting features, feature names, taskname, the vectorizer and the selector
     """
     doc_ngrams = ngrams_extractor(documents, ngrams, preserve_punctuation)
-    return _features_word_ngrams(
+    outs = _features_word_ngrams(
         doc_ngrams,
         vectorizer=vectorizer,
         selector=selector, y=y, feat_sel_ratio=feat_sel_ratio,
         min_df=min_df
     )
+    outs['task'] = '_cngrams_task'
+    return outs
 def ngrams_extractor(documents, ns=[4, 5], preserve_punctuation=True):
-    if not isinstance(ns, list): ns=[ns]
+    if not isinstance(ns, list):
+        ns=[ns]
     ns = sorted(np.unique(ns).tolist())
     list_ngrams = []
     for doc in documents:
-        if preserve_punctuation == False:
+        if not preserve_punctuation:
             doc = ' '.join(tokenize(doc))
         doc_ngrams = []
         for ni in ns:
@@ -287,7 +295,7 @@ def _feature_selection(X, y, tfidf_feat_selection_ratio):
     :param X: a document by (sparse) features matrix
     :param y: the supervised ndarray containing the class labels
     :param tfidf_feat_selection_ratio: a proportion of features to be taken
-    :return: the reduced matrix and the feature selector fit
+    :return: the feature selector fit
     """
     nF = X.shape[1]
     num_feats = int(tfidf_feat_selection_ratio * nF)
@@ -321,7 +329,7 @@ class FeatureExtractor:
                  window_size=5,
                  verbose=True):
         """
-        Applies stlystic feature extraction. Features include:
+        Applies stilystic feature extraction. Features include:
         :param function_words_freq: add the frequency of function words as features
         :param conjugations_freq: add the frequency of regular conjugations as features
         :param features_Mendenhall: add the frequencies of the words' lengths as features
@@ -437,113 +445,31 @@ class FeatureExtractor:
             self.feature_names = []
         self.feature_names.extend(feat_names)
-    def _transform(self, documents, y=None, fit=False):
-        # initialize the document-by-feature vector
-        X = np.empty((len(documents), 0))
-        # dense feature extraction functions
-        if self.function_words_freq:
-            F, f_names = _features_function_words_freq(documents, self.function_words_freq)
-            X = self._addfeatures(X, F, f_names if fit else None)
-            self._print(f'adding function words features: {X.shape[1]} features')
-        if self.conjugations_freq:
-            F, f_names = _features_conjugations_freq(documents, self.conjugations_freq)
-            X = self._addfeatures(X, F, f_names if fit else None)
-            self._print(f'adding conjugation features: {X.shape[1]} features')
-        if self.features_Mendenhall:
-            F, f_names = _features_Mendenhall(documents)
-            X = self._addfeatures(X, F, f_names if fit else None)
-            self._print(f'adding Mendenhall words features: {X.shape[1]} features')
-        if self.features_sentenceLengths:
-            F, f_names = _features_sentenceLengths(documents)
-            X = self._addfeatures(X, F, f_names if fit else None)
-            self._print(f'adding sentence lengths features: {X.shape[1]} features')
-        # sparse feature extraction functions
-        if self.wngrams:
-            if fit:
-                X_features, self.wngrams_vectorizer = _features_word_ngrams(documents, ngrams=self.wngrams_range)
-                index2word = {i: w for w, i in self.wngrams_vectorizer.vocabulary_.items()}
-                f_names = [f'tfidf::{index2word[i]}' for i in range(len(index2word))]
-            else:
-                X_features, _ = _features_word_ngrams(documents, self.wngrams_vectorizer)
-                f_names = None
-            if self.feature_selection_ratio < 1.:
-                if self.verbose: print('feature selection')
-                if fit:
-                    X_features, self.feat_sel_tfidf = _feature_selection(X_features, y, self.feature_selection_ratio)
-                    f_names = [f_names[i] for i in self.feat_sel_tfidf.get_support(indices=True)]
-                else:
-                    X_features = self.feat_sel_tfidf.transform(X_features)
-            X = self._addfeatures(_tocsr(X), X_features, f_names)
-            self._print(f'adding tfidf words features: {X.shape[1]} features')
-        if self.cngrams:
-            if fit:
-                X_features, self.cngrams_vectorizer = _features_char_ngrams(
-                    documents, self.cngrams_range, preserve_punctuation=self.preserve_punctuation
-                )
-                index2word = {i: w for w, i in self.cngrams_vectorizer.vocabulary_.items()}
-                f_names = [f'ngram::{index2word[i]}' for i in range(len(index2word))]
-            else:
-                X_features, _ = _features_char_ngrams(
-                    documents, self.cngrams_range, vectorizer=self.cngrams_vectorizer,
-                    preserve_punctuation=self.preserve_punctuation
-                )
-                f_names = None
-            if self.feature_selection_ratio < 1.:
-                if self.verbose: print('feature selection')
-                if fit:
-                    X_features, self.cngrams_selector = _feature_selection(X_features, y, self.feature_selection_ratio)
-                    f_names = [f_names[i] for i in self.cngrams_selector.get_support(indices=True)]
-                else:
-                    X_features = self.cngrams_selector.transform(X_features)
-            X = self._addfeatures(_tocsr(X), X_features, f_names)
-            self._print(f'adding ngrams character features: {X.shape[1]} features')
-        if fit:
-            self.feature_names = np.asarray(self.feature_names)
-        self._print(f'X shape (#documents,#features): {X.shape}')
-        return X
     def _transform_parallel(self, documents, y=None, fit=False, n_jobs=-1):
         # initialize the document-by-feature vector
         X = np.empty((len(documents), 0))
         tasks = []
         # dense feature extraction functions
         if self.function_words_freq:
-            tasks.append((_features_function_words_freq, (documents, self.function_words_freq)))
+            tasks.append((_features_function_words_freq, {'documents': documents, 'lang': self.function_words_freq}))
         if self.conjugations_freq:
-            tasks.append((_features_conjugations_freq, (documents, self.conjugations_freq)))
+            tasks.append((_features_conjugations_freq, {'documents': documents, 'lang': self.conjugations_freq}))
         if self.features_Mendenhall:
-            tasks.append((_features_Mendenhall, (documents, 23)))
+            tasks.append((_features_Mendenhall, {'documents': documents, 'upto': 23}))
         if self.features_sentenceLengths:
-            tasks.append((_features_sentenceLengths, (documents, 3, 70)))
+            tasks.append((_features_sentenceLengths, {'documents': documents, 'downto': 3, 'upto': 70}))
-        self._print('extracting dense features in parallel')
-        outs = Parallel(n_jobs=n_jobs)(delayed(task)(*params) for task, params in tasks)
-        for F, feat_names in outs:
-            X = self._addfeatures(X, F, feat_names if fit else None)
         # sparse feature extraction functions
-        tasks = []
         if self.wngrams:
             if not fit and self.wngrams_vectorizer is None:
                 raise ValueError('transform called before fit')
-            params={
+            params = {
                 'documents': documents,
                 'vectorizer': self.wngrams_vectorizer,
                 'selector': self.wngrams_selector,
@@ -557,7 +483,7 @@ class FeatureExtractor:
             if not fit and self.cngrams_vectorizer is None:
                 raise ValueError('transform called before fit')
-            params={
+            params = {
                 'documents': documents,
                 'vectorizer': self.cngrams_vectorizer,
                 'selector': self.cngrams_selector,
@@ -568,15 +494,22 @@ class FeatureExtractor:
             }
             tasks.append((_features_char_ngrams, params))
-        self._print('extracting sparse features in parallel')
+        self._print('extracting features in parallel')
         outs = Parallel(n_jobs=n_jobs)(delayed(task)(**params) for task, params in tasks)
-        for F, feat_names, vectorizer, selector in outs:
-            X = self._addfeatures(_tocsr(X), F, feat_names if fit else None)
-            if fit:
-                if self.wngrams and self.wngrams_vectorizer is None:
-                    self.wngrams_vectorizer, self.wngrams_selector = vectorizer, selector
-                elif self.cngrams and self.cngrams_vectorizer is None:
-                    self.cngrams_vectorizer, self.cngrams_selector = vectorizer, selector
+        # gather the tasks' outputs
+        for out in outs:
+            taskname = out['task']
+            if taskname not in {'_wngrams_task', '_cngrams_task'}:
+                X = self._addfeatures(X, out['features'], out['f_names'] if fit else None)
+            else:
+                X = self._addfeatures(_tocsr(X), out['features'], out['f_names'] if fit else None)
+                if fit:
+                    vectorizer, selector = out['vectorizer'], out['selector']
+                    if taskname == '_wngrams_task' and self.wngrams_vectorizer is None:
+                        self.wngrams_vectorizer, self.wngrams_selector = vectorizer, selector
+                    elif taskname == '_cngrams_task' and self.cngrams_vectorizer is None:
+                        self.cngrams_vectorizer, self.cngrams_selector = vectorizer, selector
         if fit:
             self.feature_names = np.asarray(self.feature_names)

View File

@@ -1,6 +1,6 @@
 from sklearn.metrics import make_scorer
 from sklearn.model_selection import GridSearchCV, LeaveOneOut, LeaveOneGroupOut, cross_val_score, StratifiedKFold
-from sklearn.linear_model import LogisticRegression
+from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
 from sklearn.svm import *
 from data.features import *
 from util.evaluation import f1, get_counters
@@ -10,29 +10,21 @@ class AuthorshipVerificator:
     def __init__(self, nfolds=10,
                  params={'C': np.logspace(-4, +4, 9), 'class_weight': ['balanced', None]},
-                 estimator=SVC,
                  author_name=None):
         self.nfolds = nfolds
         self.params = params
         self.author_name = author_name if author_name else 'this author'
-        if estimator is SVC:
-            self.params['kernel'] = ['linear', 'rbf']
-            self.probability = True
-            self.classifier = estimator(probability=self.probability)
-        elif estimator is LinearSVC:
-            self.probability = False
-            self.classifier = estimator()
-        elif estimator is LogisticRegression:
-            self.probability = True
-            self.classifier = LogisticRegression()
+        self.classifier = LogisticRegression()
-    def fit(self,X,y,groups=None):
+    def fit(self, X, y):
-        if not isinstance(y,np.ndarray): y=np.array(y)
+        y = np.asarray(y)
         positive_examples = y.sum()
         if positive_examples >= self.nfolds:
             print('optimizing {}'.format(self.classifier.__class__.__name__))
             folds = list(StratifiedKFold(n_splits=self.nfolds).split(X, y))
-            self.estimator = GridSearchCV(self.classifier, param_grid=self.params, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
+            self.estimator = GridSearchCV(
+                self.classifier, param_grid=self.params, cv=folds, scoring=make_scorer(f1), n_jobs=-1
+            )
         else:
             self.estimator = self.classifier
@@ -46,7 +38,6 @@ class AuthorshipVerificator:
         return self
     def leave_one_out(self, X, y, files, groups=None, test_lowest_index_only=True, counters=False):
        if groups is None:
            print('Computing LOO without groups')
            folds = list(LeaveOneOut().split(X, y))
@@ -59,8 +50,8 @@ class AuthorshipVerificator:
            folds = [(train, np.min(test, keepdims=True)) for train, test in folds]
        scores = cross_val_score(self.estimator, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
-        missclassified = '\n'.join(files[scores==0].tolist())
+        missclassified = '\n'.join(files[scores == 0].tolist())
-        print(scores)
+        print('missclassified texts:')
         print(missclassified)
         if counters and test_lowest_index_only:
@@ -73,26 +64,24 @@ class AuthorshipVerificator:
         else:
             return scores.mean(), scores.std()
-    def predict(self, test, epistola_name=''):
+    def predict(self, test):
         pred = self.estimator.predict(test)
         full_doc_prediction = pred[0]
-        print('{} is from the same author: {}'.format(epistola_name, 'Yes' if full_doc_prediction == 1 else 'No'))
         if len(pred) > 1:
             fragment_predictions = pred[1:]
             print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
             return full_doc_prediction, fragment_predictions
-        return full_doc_prediction, None
+        return full_doc_prediction
-    def predict_proba(self, test, epistola_name=''):
+    def predict_proba(self, test):
         assert hasattr(self, 'predict_proba'), 'the classifier is not calibrated'
         pred = self.estimator.predict_proba(test)
         full_doc_prediction = pred[0,1]
-        print(f'{epistola_name} is from {self.author_name} with Probability {full_doc_prediction:.3f}')
         if len(pred) > 1:
             fragment_predictions = pred[1:,1]
             print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
             return full_doc_prediction, fragment_predictions
-        return full_doc_prediction, None
+        return full_doc_prediction, []

View File

@@ -0,0 +1,4 @@
+def warn(*args, **kwargs):
+    pass
+import warnings
+warnings.warn = warn
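For context, a usage sketch of this new module (the path util/_hide_sklearn_warnings.py is an assumption inferred from the import added at the top of the main script): importing it purely for its side effect replaces warnings.warn with a no-op before the sklearn modules are imported, so any warning routed through warnings.warn is suppressed.

# hypothetical usage; the module path is assumed from the import in the main script
import util._hide_sklearn_warnings  # noqa: F401 -- imported only for its side effect
from sklearn.linear_model import LogisticRegression  # warnings issued via warnings.warn are now silenced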