feature extraction fully parallelized; result log file added; cleaning
This commit is contained in:
parent a3893c77fe
commit 843cfbe8fe
@@ -1,3 +1,4 @@
import util._hide_sklearn_warnings
from sklearn.linear_model import LogisticRegression
from data.dante_loader import load_latin_corpus, list_authors
from data.features import *
@@ -14,6 +15,7 @@ AUTHORS_CORPUS_II = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'Boncompagn


def main():
    log = open(args.log, 'wt')
    discarded = 0
    f1_scores = []
    counters = []
@@ -30,6 +32,7 @@ def main():
    files = np.asarray(pos_files + neg_files)
    if len(positive) < 2:
        discarded += 1
        print(f'discarding analysis for {author} which has only {len(positive)} documents')
        continue

    n_full_docs = len(positive) + len(negative)
@@ -53,13 +56,14 @@ def main():
    Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)

    print('Fitting the Verificator')
    av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
    av.fit(Xtr, ytr, groups)
    av = AuthorshipVerificator(nfolds=10)
    av.fit(Xtr, ytr)

    if args.unknown:
        print(f'Checking for the hypothesis that {author} was the author of {args.unknown}')
        ep, ep_fragments = feature_extractor.transform(ep_text, return_fragments=True, window_size=3)
        av.predict_proba(ep, args.unknown)
        pred, _ = av.predict_proba(ep)
        tee(f'{args.unknown}: Posterior probability for {author} is {pred:.3f}', log)

    if args.loo:
        print('Validating the Verificator (Leave-One-Out)')
@@ -68,7 +72,7 @@ def main():
        )
        f1_scores.append(f1_from_counters(tp, fp, fn, tn))
        counters.append((tp, fp, fn, tn))
        print(f'F1 for {author} = {f1_scores[-1]:.3f}')
        tee(f'F1 for {author} = {f1_scores[-1]:.3f}', log)

    if args.loo:
        print(f'Computing macro- and micro-averages (discarded {discarded}/{len(args.authors)})')
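Side note on the evaluation: the per-author counters collected here feed the macro/micro averaging computed in the next hunk. A minimal, self-contained sketch of that computation follows; the f1_from_counters stand-in is written here for illustration only and is not the project's util.evaluation implementation (which may treat degenerate cases differently).

import numpy as np

def f1_from_counters(tp, fp, fn, tn):
    # standard F1 from confusion-matrix counters; one common convention returns 1
    # when there is nothing to retrieve and nothing was retrieved
    if tp == 0 and fp == 0 and fn == 0:
        return 1.0
    return 2 * tp / (2 * tp + fp + fn)

counters = np.asarray([(3, 1, 0, 10), (2, 0, 1, 12)])              # one (tp, fp, fn, tn) row per author
macro_f1 = np.mean([f1_from_counters(*c) for c in counters])       # mean of the per-author F1 scores
micro_f1 = f1_from_counters(*counters.sum(axis=0).tolist())        # F1 of the pooled counters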
@@ -78,26 +82,35 @@ def main():
        macro_f1 = f1_scores.mean()
        micro_f1 = f1_from_counters(*counters.sum(axis=0).tolist())

        print(f'Macro-F1 = {macro_f1:.3f}')
        print(f'Micro-F1 = {micro_f1:.3f}')
        tee(f'LOO Macro-F1 = {macro_f1:.3f}', log)
        tee(f'LOO Micro-F1 = {micro_f1:.3f}', log)
        print()

    log.close()

def tee(msg, log):
    print(msg)
    log.write(f'{msg}\n')
    log.flush()


if __name__ == '__main__':
    import os

    # Training settings
    parser = argparse.ArgumentParser(description='Authorship verification for Epistola XIII')
    parser.add_argument('corpuspath', type=str, metavar='PATH',
    parser.add_argument('corpuspath', type=str, metavar='CORPUSPATH',
                        help=f'Path to the directory containing the corpus (documents must be named '
                             f'<author>_<texname>.txt')
    parser.add_argument('positive', type=str, default="Dante",
                             f'<author>_<texname>.txt)')
    parser.add_argument('positive', type=str, default="Dante", metavar='AUTHOR',
                        help= f'Positive author for the hypothesis (default "Dante"); set to "ALL" to check '
                              f'every author')
    parser.add_argument('--loo', default=False, action='store_true',
                        help='submit each binary classifier to leave-one-out validation')
    parser.add_argument('--unknown', type=str, metavar='PATH', default=None,
                        help='path to the file of unknown paternity (default None)')
    parser.add_argument('--log', type=str, metavar='PATH', default='./results.txt',
                        help='path to the log file where to write the results (default ./results.txt)')

    args = parser.parse_args()

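Usage note (the script's file name is not shown in this diff; main.py is used below purely as a placeholder, and the paths are illustrative): with the new --log option, the two typical invocations implied by the arguments would look like

python main.py /path/to/corpus ALL --loo --log ./results.txt
python main.py /path/to/corpus Dante --unknown /path/to/EpistolaXIII.txt --log ./results.txt

where the corpus directory contains one file per document named <author>_<texname>.txt, as the corpuspath help string states.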
@@ -110,6 +123,7 @@ if __name__ == '__main__':
        args.authors = [args.positive]

    assert args.unknown or args.loo, 'error: nor an unknown document, nor LOO have been requested. Nothing to do.'
    assert args.unknown is None or os.path.exists(args.unknown), 'unknown file does not exist'
    assert os.path.exists(args.corpuspath), f'corpus path {args.corpuspath} does not exist'
    assert args.unknown is None or os.path.exists(args.unknown), '"unknown file" does not exist'

    main()
@@ -6,7 +6,7 @@ from sklearn.feature_selection import chi2
from sklearn.preprocessing import normalize
from scipy.sparse import hstack, csr_matrix, issparse
from nltk.corpus import stopwords
from sklearn.externals.joblib import Parallel, delayed
from joblib import Parallel, delayed


latin_function_words = ['et', 'in', 'de', 'ad', 'non', 'ut', 'cum', 'per', 'a', 'sed', 'que', 'quia', 'ex', 'sic',
@@ -119,7 +119,7 @@ def _features_function_words_freq(documents, lang):
    """
    Extract features as the frequency (L1x1000) of the function words used in the documents
    :param documents: a list where each element is the text (string) of a document
    :return: a np.array of shape (D,F) where D is len(documents) and F is len(function_words)
    :return: a dictionary containing the resulting features, feature names, and taskname
    """
    features = []
    function_words = get_function_words(lang)
@@ -134,7 +134,7 @@ def _features_function_words_freq(documents, lang):
    f_names = [f'funcw::{f}' for f in function_words]
    F = np.array(features)
    print(f'task function words (#features={F.shape[1]}) [Done]')
    return F, f_names
    return {'features': F, 'f_names':f_names, 'task': 'functionwords'}


def _features_conjugations_freq(documents, lang):
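For clarity, a toy sketch of the "frequency (L1x1000)" encoding these dense feature functions share: count the items of interest per document, L1-normalize, and scale by 1000. The whitespace tokenizer, the tiny word list, and normalizing over the selected words only are simplifications for illustration; get_function_words(lang) and the project's own tokenization are not reproduced here.

import numpy as np

def function_word_freqs(documents, function_words):
    # raw counts per document, then L1 normalization scaled by 1000
    F = []
    for text in documents:
        tokens = text.lower().split()
        counts = np.array([tokens.count(w) for w in function_words], dtype=float)
        total = counts.sum()
        F.append(1000 * counts / total if total > 0 else counts)
    return np.vstack(F)

toy_docs = ['et in de ad non ut', 'cum per a sed que quia ex sic et et']
print(function_word_freqs(toy_docs, ['et', 'in', 'de', 'ad']))   # shape (2, 4)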
@@ -142,7 +142,7 @@ def _features_conjugations_freq(documents, lang):
    Extract features as the frequency (L1x1000) of the conjugations used in the documents. The method is heuristic, and
    actually searches for suffixes contained in the conjugation list.
    :param documents: a list where each element is the text (string) of a document
    :return: a np.array of shape (D,F) where D is len(documents) and F is len(conjugations)
    :return: a dictionary containing the resulting features, feature names, and taskname
    """
    features = []
    conjugations = get_conjugations(lang)
@@ -162,7 +162,7 @@ def _features_conjugations_freq(documents, lang):
    f_names = [f'conj::{f}' for f in conjugations]
    F = np.array(features)
    print(f'task conjugation features (#features={F.shape[1]}) [Done]')
    return F, f_names
    return {'features': F, 'f_names':f_names, 'task': 'conjugations'}


def _features_Mendenhall(documents, upto=23):
@@ -170,7 +170,7 @@ def _features_Mendenhall(documents, upto=23):
    Extract features as the frequency (L1x1000) of the words' lengths used in the documents,
    following the idea behind Mendenhall's Characteristic Curve of Composition
    :param documents: a list where each element is the text (string) of a document
    :return: a np.array of shape (D,F) where D is len(documents) and F is len(range of lengths considered)
    :return: a dictionary containing the resulting features, feature names, and taskname
    """
    features = []
    for text in documents:
@@ -185,7 +185,7 @@ def _features_Mendenhall(documents, upto=23):
    f_names = [f'mendenhall::{c}' for c in range(1,upto)]
    F = np.array(features)
    print(f'task Mendenhall features (#features={F.shape[1]}) [Done]')
    return F, f_names
    return {'features': F, 'f_names':f_names, 'task': 'Mendenhall'}


def _features_sentenceLengths(documents, downto=3, upto=70):
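Analogously, a toy sketch of the word-length histogram behind Mendenhall's characteristic curve, again with a plain whitespace split standing in for the project's tokenize():

import numpy as np

def mendenhall_curve(text, upto=23):
    # histogram of word lengths 1..upto-1, L1-normalized and scaled by 1000
    lengths = [len(w) for w in text.split()]
    counts = np.array([lengths.count(l) for l in range(1, upto)], dtype=float)
    return 1000 * counts / counts.sum()

print(mendenhall_curve('vulgarem locutionem appellamus eam qua infantes assuefiunt'))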
@@ -194,7 +194,7 @@ def _features_sentenceLengths(documents, downto=3, upto=70):
    :param documents: a list where each element is the text (string) of a document
    :param downto: minimal length considered
    :param upto: maximum length considered
    :return: a np.array of shape (D,F) where D is len(documents) and F is len(range of lengths considered)
    :return: a dictionary containing the resulting features, feature names, and taskname
    """
    features = []
    for text in documents:
@@ -212,15 +212,14 @@ def _features_sentenceLengths(documents, downto=3, upto=70):
    f_names = [f'sentlength::{c}' for c in range(downto, upto)]
    F = np.array(features)
    print(f'task sentence lengths (#features={F.shape[1]}) [Done]')
    return F, f_names
    return {'features': F, 'f_names':f_names, 'task': 'sentlength'}


def _features_word_ngrams(documents, vectorizer=None, selector=None, y=None, feat_sel_ratio=1., min_df=1, ngrams=(1, 1)):
    """
    Extract features as tfidf matrix extracted from the documents
    :param documents: a list where each element is the text (string) of a document
    :return: a tuple M,V, where M is an np.array of shape (D,F), with D being the len(documents) and F the number of
             distinct words; and V is the TfidfVectorizer already fit
    :return: a dictionary containing the resulting features, feature names, taskname, the vectorizer and the selector
    """
    if vectorizer is None:
        vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=min_df, ngram_range=ngrams)
@@ -238,7 +237,13 @@ def _features_word_ngrams(documents, vectorizer=None, selector=None, y=None, fea
        f_names = [f_names[i] for i in selector.get_support(indices=True)]

    print(f'task ngrams and feature selection (#features={features.shape[1]}) [Done]')
    return features, f_names, vectorizer, selector
    return {
        'features': features,
        'f_names': f_names,
        'task': '_wngrams_task',
        'vectorizer': vectorizer,
        'selector': selector
    }


def _features_char_ngrams(documents, vectorizer=None, selector=None, y=None, feat_sel_ratio=1., min_df=10, preserve_punctuation=True, ngrams=[4, 5]):
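A standalone sketch of the tf-idf plus chi-square selection pattern that _features_word_ngrams wraps, mirroring the dictionary contract introduced by this commit; the function below is illustrative and not the project's code (it reuses the vocabulary_-based naming trick seen elsewhere in this diff):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

def tfidf_with_chi2(documents, y, feat_sel_ratio=0.5):
    vectorizer = TfidfVectorizer(sublinear_tf=True)
    X = vectorizer.fit_transform(documents)
    index2word = {i: w for w, i in vectorizer.vocabulary_.items()}
    f_names = [index2word[i] for i in range(len(index2word))]
    # keep the top-k n-grams ranked by chi-square score against the labels
    k = max(1, int(feat_sel_ratio * X.shape[1]))
    selector = SelectKBest(chi2, k=k).fit(X, y)
    f_names = [f_names[i] for i in selector.get_support(indices=True)]
    return {'features': selector.transform(X), 'f_names': f_names,
            'task': '_wngrams_task', 'vectorizer': vectorizer, 'selector': selector}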
@@ -253,24 +258,27 @@ def _features_char_ngrams(documents, vectorizer=None, selector=None, y=None, fea
    :param vectorizer: the tfidf_vectorizer to use if already fit; if None, a new one will be instantiated and fit
    :param min_df: minumum number of occurrences needed for the ngram to be taken
    :param preserve_punctuation: whether or not to preserve punctuation marks
    :return: see _features_tfidf
    :return: a dictionary containing the resulting features, feature names, taskname, the vectorizer and the selector
    """
    doc_ngrams = ngrams_extractor(documents, ngrams, preserve_punctuation)
    return _features_word_ngrams(
    outs = _features_word_ngrams(
        doc_ngrams,
        vectorizer=vectorizer,
        selector=selector, y=y, feat_sel_ratio=feat_sel_ratio,
        min_df=min_df
    )
    outs['task'] = '_cngrams_task'
    return outs


def ngrams_extractor(documents, ns=[4, 5], preserve_punctuation=True):
    if not isinstance(ns, list): ns=[ns]
    if not isinstance(ns, list):
        ns=[ns]
    ns = sorted(np.unique(ns).tolist())

    list_ngrams = []
    for doc in documents:
        if preserve_punctuation == False:
        if not preserve_punctuation:
            doc = ' '.join(tokenize(doc))
        doc_ngrams = []
        for ni in ns:
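The character n-gram route relies on slicing the raw character stream and handing the resulting grams to the same word-level machinery; below is a simplified stand-in for what ngrams_extractor produces per document (the real extractor also handles punctuation stripping and multiple sizes at once, and presumably joins the grams back into a single string for the vectorizer, which is not shown in this hunk):

def char_ngrams(doc, ns=(4, 5)):
    # slide a window of each requested size over the characters of the document
    grams = []
    for n in ns:
        grams.extend(doc[i:i + n] for i in range(len(doc) - n + 1))
    return grams

print(char_ngrams('epistola', ns=(4,)))   # ['epis', 'pist', 'isto', 'stol', 'tola']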
@@ -287,7 +295,7 @@ def _feature_selection(X, y, tfidf_feat_selection_ratio):
    :param X: a document by (sparse) features matrix
    :param y: the supervised ndarray containing the class labels
    :param tfidf_feat_selection_ratio: a proportion of features to be taken
    :return: the reduced matrix and the feature selector fit
    :return: the feature selector fit
    """
    nF = X.shape[1]
    num_feats = int(tfidf_feat_selection_ratio * nF)
@@ -321,7 +329,7 @@ class FeatureExtractor:
                 window_size=5,
                 verbose=True):
        """
        Applies stlystic feature extraction. Features include:
        Applies stylistic feature extraction. Features include:
        :param function_words_freq: add the frequency of function words as features
        :param conjugations_freq: add the frequency of regular conjugations as features
        :param features_Mendenhall: add the frequencies of the words' lengths as features
@@ -437,113 +445,31 @@ class FeatureExtractor:
        self.feature_names = []
        self.feature_names.extend(feat_names)

    def _transform(self, documents, y=None, fit=False):
        # initialize the document-by-feature vector
        X = np.empty((len(documents), 0))

        # dense feature extraction functions
        if self.function_words_freq:
            F, f_names = _features_function_words_freq(documents, self.function_words_freq)
            X = self._addfeatures(X, F, f_names if fit else None)
            self._print(f'adding function words features: {X.shape[1]} features')

        if self.conjugations_freq:
            F, f_names = _features_conjugations_freq(documents, self.conjugations_freq)
            X = self._addfeatures(X, F, f_names if fit else None)
            self._print(f'adding conjugation features: {X.shape[1]} features')

        if self.features_Mendenhall:
            F, f_names = _features_Mendenhall(documents)
            X = self._addfeatures(X, F, f_names if fit else None)
            self._print(f'adding Mendenhall words features: {X.shape[1]} features')

        if self.features_sentenceLengths:
            F, f_names = _features_sentenceLengths(documents)
            X = self._addfeatures(X, F, f_names if fit else None)
            self._print(f'adding sentence lengths features: {X.shape[1]} features')

        # sparse feature extraction functions
        if self.wngrams:
            if fit:
                X_features, self.wngrams_vectorizer = _features_word_ngrams(documents, ngrams=self.wngrams_range)
                index2word = {i: w for w, i in self.wngrams_vectorizer.vocabulary_.items()}
                f_names = [f'tfidf::{index2word[i]}' for i in range(len(index2word))]
            else:
                X_features, _ = _features_word_ngrams(documents, self.wngrams_vectorizer)
                f_names = None

            if self.feature_selection_ratio < 1.:
                if self.verbose: print('feature selection')
                if fit:
                    X_features, self.feat_sel_tfidf = _feature_selection(X_features, y, self.feature_selection_ratio)
                    f_names = [f_names[i] for i in self.feat_sel_tfidf.get_support(indices=True)]
                else:
                    X_features = self.feat_sel_tfidf.transform(X_features)
            X = self._addfeatures(_tocsr(X), X_features, f_names)
            self._print(f'adding tfidf words features: {X.shape[1]} features')

        if self.cngrams:
            if fit:
                X_features, self.cngrams_vectorizer = _features_char_ngrams(
                    documents, self.cngrams_range, preserve_punctuation=self.preserve_punctuation
                )
                index2word = {i: w for w, i in self.cngrams_vectorizer.vocabulary_.items()}
                f_names = [f'ngram::{index2word[i]}' for i in range(len(index2word))]
            else:
                X_features, _ = _features_char_ngrams(
                    documents, self.cngrams_range, vectorizer=self.cngrams_vectorizer,
                    preserve_punctuation=self.preserve_punctuation
                )
                f_names = None

            if self.feature_selection_ratio < 1.:
                if self.verbose: print('feature selection')
                if fit:
                    X_features, self.cngrams_selector = _feature_selection(X_features, y, self.feature_selection_ratio)
                    f_names = [f_names[i] for i in self.cngrams_selector.get_support(indices=True)]
                else:
                    X_features = self.cngrams_selector.transform(X_features)

            X = self._addfeatures(_tocsr(X), X_features, f_names)
            self._print(f'adding ngrams character features: {X.shape[1]} features')

        if fit:
            self.feature_names = np.asarray(self.feature_names)

        self._print(f'X shape (#documents,#features): {X.shape}')

        return X

    def _transform_parallel(self, documents, y=None, fit=False, n_jobs=-1):
        # initialize the document-by-feature vector
        X = np.empty((len(documents), 0))

        tasks = []

        # dense feature extraction functions
        if self.function_words_freq:
            tasks.append((_features_function_words_freq, (documents, self.function_words_freq)))
            tasks.append((_features_function_words_freq, {'documents': documents, 'lang': self.function_words_freq}))

        if self.conjugations_freq:
            tasks.append((_features_conjugations_freq, (documents, self.conjugations_freq)))
            tasks.append((_features_conjugations_freq, {'documents': documents, 'lang': self.conjugations_freq}))

        if self.features_Mendenhall:
            tasks.append((_features_Mendenhall, (documents, 23)))
            tasks.append((_features_Mendenhall, {'documents': documents, 'upto': 23}))

        if self.features_sentenceLengths:
            tasks.append((_features_sentenceLengths, (documents, 3, 70)))

        self._print('extracting dense features in parallel')
        outs = Parallel(n_jobs=n_jobs)(delayed(task)(*params) for task, params in tasks)
        for F, feat_names in outs:
            X = self._addfeatures(X, F, feat_names if fit else None)
            tasks.append((_features_sentenceLengths, {'documents': documents, 'downto': 3, 'upto': 70}))

        # sparse feature extraction functions
        tasks = []
        if self.wngrams:
            if not fit and self.wngrams_vectorizer is None:
                raise ValueError('transform called before fit')

            params={
            params = {
                'documents': documents,
                'vectorizer': self.wngrams_vectorizer,
                'selector': self.wngrams_selector,
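The core of the parallelization is the switch from positional-tuple tasks to (function, kwargs) pairs whose outputs are dictionaries tagged with a 'task' name. A minimal, self-contained sketch of that pattern with joblib (the feature function below is a stand-in, not one of the project's extractors):

import numpy as np
from joblib import Parallel, delayed

def dense_demo_task(documents, scale=1.0):
    # stand-in feature function: returns a tagged dictionary like the real ones
    F = np.full((len(documents), 2), scale)
    return {'features': F, 'f_names': ['a', 'b'], 'task': 'dense_demo'}

documents = ['doc one', 'doc two', 'doc three']
tasks = [(dense_demo_task, {'documents': documents, 'scale': 1.0}),
         (dense_demo_task, {'documents': documents, 'scale': 2.0})]

# run every extractor in its own worker, then gather the tagged outputs in order
outs = Parallel(n_jobs=-1)(delayed(task)(**params) for task, params in tasks)
X = np.hstack([out['features'] for out in outs])
print(X.shape)   # (3, 4)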
@@ -557,7 +483,7 @@ class FeatureExtractor:
            if not fit and self.cngrams_vectorizer is None:
                raise ValueError('transform called before fit')

            params={
            params = {
                'documents': documents,
                'vectorizer': self.cngrams_vectorizer,
                'selector': self.cngrams_selector,
@@ -568,15 +494,22 @@ class FeatureExtractor:
            }
            tasks.append((_features_char_ngrams, params))

        self._print('extracting sparse features in parallel')
        self._print('extracting features in parallel')
        outs = Parallel(n_jobs=n_jobs)(delayed(task)(**params) for task, params in tasks)
        for F, feat_names, vectorizer, selector in outs:
            X = self._addfeatures(_tocsr(X), F, feat_names if fit else None)
            if fit:
                if self.wngrams and self.wngrams_vectorizer is None:
                    self.wngrams_vectorizer, self.wngrams_selector = vectorizer, selector
                elif self.cngrams and self.cngrams_vectorizer is None:
                    self.cngrams_vectorizer, self.cngrams_selector = vectorizer, selector

        # gather the tasks' outputs
        for out in outs:
            taskname = out['task']
            if taskname not in {'_wngrams_task', '_cngrams_task'}:
                X = self._addfeatures(X, out['features'], out['f_names'] if fit else None)
            else:
                X = self._addfeatures(_tocsr(X), out['features'], out['f_names'] if fit else None)
                if fit:
                    vectorizer, selector = out['vectorizer'], out['selector']
                    if taskname == '_wngrams_task' and self.wngrams_vectorizer is None:
                        self.wngrams_vectorizer, self.wngrams_selector = vectorizer, selector
                    elif taskname == '_cngrams_task' and self.cngrams_vectorizer is None:
                        self.cngrams_vectorizer, self.cngrams_selector = vectorizer, selector

        if fit:
            self.feature_names = np.asarray(self.feature_names)
src/model.py (37 changed lines)
@@ -1,6 +1,6 @@
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, LeaveOneOut, LeaveOneGroupOut, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import *
from data.features import *
from util.evaluation import f1, get_counters
@@ -10,29 +10,21 @@ class AuthorshipVerificator:

    def __init__(self, nfolds=10,
                 params={'C': np.logspace(-4, +4, 9), 'class_weight': ['balanced', None]},
                 estimator=SVC,
                 author_name=None):
        self.nfolds = nfolds
        self.params = params
        self.author_name = author_name if author_name else 'this author'
        if estimator is SVC:
            self.params['kernel'] = ['linear', 'rbf']
            self.probability = True
            self.classifier = estimator(probability=self.probability)
        elif estimator is LinearSVC:
            self.probability = False
            self.classifier = estimator()
        elif estimator is LogisticRegression:
            self.probability = True
            self.classifier = LogisticRegression()
        self.classifier = LogisticRegression()

    def fit(self,X,y,groups=None):
        if not isinstance(y,np.ndarray): y=np.array(y)
    def fit(self, X, y):
        y = np.asarray(y)
        positive_examples = y.sum()
        if positive_examples >= self.nfolds:
            print('optimizing {}'.format(self.classifier.__class__.__name__))
            folds = list(StratifiedKFold(n_splits=self.nfolds).split(X, y))
            self.estimator = GridSearchCV(self.classifier, param_grid=self.params, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
            self.estimator = GridSearchCV(
                self.classifier, param_grid=self.params, cv=folds, scoring=make_scorer(f1), n_jobs=-1
            )
        else:
            self.estimator = self.classifier

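The fit above hands GridSearchCV a materialized list of (train, test) index pairs from StratifiedKFold rather than an integer fold count. A standalone sketch of that pattern on synthetic data (the grid matches the verificator's default params; everything else is illustrative):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedKFold

X, y = make_classification(n_samples=200, n_features=20, random_state=0)
folds = list(StratifiedKFold(n_splits=10).split(X, y))   # explicit (train, test) index pairs
grid = GridSearchCV(LogisticRegression(max_iter=1000),
                    param_grid={'C': np.logspace(-4, +4, 9), 'class_weight': ['balanced', None]},
                    cv=folds, scoring=make_scorer(f1_score), n_jobs=-1)
grid.fit(X, y)
print(grid.best_params_)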
@@ -46,7 +38,6 @@ class AuthorshipVerificator:
        return self

    def leave_one_out(self, X, y, files, groups=None, test_lowest_index_only=True, counters=False):

        if groups is None:
            print('Computing LOO without groups')
            folds = list(LeaveOneOut().split(X, y))
@@ -59,8 +50,8 @@ class AuthorshipVerificator:
            folds = [(train, np.min(test, keepdims=True)) for train, test in folds]

        scores = cross_val_score(self.estimator, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
        missclassified = '\n'.join(files[scores==0].tolist())
        print(scores)
        missclassified = '\n'.join(files[scores == 0].tolist())
        print('missclassified texts:')
        print(missclassified)

        if counters and test_lowest_index_only:
@@ -73,26 +64,24 @@ class AuthorshipVerificator:
        else:
            return scores.mean(), scores.std()

    def predict(self, test, epistola_name=''):
    def predict(self, test):
        pred = self.estimator.predict(test)
        full_doc_prediction = pred[0]
        print('{} is from the same author: {}'.format(epistola_name, 'Yes' if full_doc_prediction == 1 else 'No'))
        if len(pred) > 1:
            fragment_predictions = pred[1:]
            print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
            return full_doc_prediction, fragment_predictions
        return full_doc_prediction, None
        return full_doc_prediction

    def predict_proba(self, test, epistola_name=''):
    def predict_proba(self, test):
        assert hasattr(self, 'predict_proba'), 'the classifier is not calibrated'
        pred = self.estimator.predict_proba(test)
        full_doc_prediction = pred[0,1]
        print(f'{epistola_name} is from {self.author_name} with Probability {full_doc_prediction:.3f}')
        if len(pred) > 1:
            fragment_predictions = pred[1:,1]
            print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
            return full_doc_prediction, fragment_predictions
        return full_doc_prediction, None
        return full_doc_prediction, []
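Note on the changed return contract: predict_proba now always returns a pair, the posterior for the full document plus the per-fragment posteriors (an empty list when no fragments follow the document), which is what lets the caller in the main script unpack it as pred, _ = av.predict_proba(ep). A usage line with illustrative variable names:

full_prob, fragment_probs = av.predict_proba(ep)   # ep: full document followed by its fragments, if any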
@@ -0,0 +1,4 @@
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
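This new module (util/_hide_sklearn_warnings.py, as implied by the import added at the top of the main script) silences warnings by replacing warnings.warn with a no-op. The patch suppresses everything routed through warnings.warn process-wide, not only scikit-learn messages, and it only takes effect for warnings emitted after it is installed, which is why the import is the first line of the script:

# order matters: install the patch before importing the noisy libraries
import util._hide_sklearn_warnings                    # replaces warnings.warn with a no-op
from sklearn.linear_model import LogisticRegression   # convergence/deprecation warnings are now silent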