This commit is contained in:
Alejandro Moreo Fernandez 2018-11-29 17:36:34 +01:00
parent e35f6c2e71
commit 893cc31225
8 changed files with 391 additions and 164 deletions

40
src/dante_eval.py Normal file
View File

@ -0,0 +1,40 @@
from sklearn.linear_model import LogisticRegression
from data.dante_loader import load_texts
from data.features import *
from model import AuthorshipVerificator
from sklearn.svm import LinearSVC, SVC
# DONE: ngrams should contain punctuation marks according to Sapkota et al. [39] in the PAN 2015 overview
# (More recently, it was shown that character
# n-grams corresponding to word affixes and including punctuation marks are the most
# significant features in cross-topic authorship attribution [57].)
# TODO: split policies: understand overlapping in cross-validation
# Corpus folder; load_texts returns the positive and negative training texts
# plus the raw texts of the two parts of Epistola XIII.
path = '../testi'
positive, negative, ep1_text, ep2_text = load_texts(path)

# Feature extractor configuration, kept in one place for readability.
extractor_settings = dict(
    function_words_freq='latin',
    features_Mendenhall=True,
    tfidf=False,
    tfidf_feat_selection_ratio=0.1,
    ngrams=True,
    ns=[3,4,5],
    split_documents=True,
    split_policy=split_by_sentences,
    window_size=3,
    normalize_features=True,
    verbose=True,
)
feature_extractor = FeatureExtractor(**extractor_settings)

# The extractor is fitted on the training texts only; both Epistolas are
# then projected with the already fitted extractor.
Xtr,ytr = feature_extractor.fit(positive, negative)
ep1 = feature_extractor.transform(ep1_text)
ep2 = feature_extractor.transform(ep2_text)

print('Fitting the Verificator')
av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
av.fit(Xtr,ytr)

print('Predicting the Epistolas')
for epistola_name, epistola in (('Epistola 1', ep1), ('Epistola 2', ep2)):
    # hard decision first, then the probabilistic view
    av.predict(epistola, epistola_name)
    av.predict_proba(epistola, epistola_name)

29
src/data/dante_loader.py Normal file
View File

@ -0,0 +1,29 @@
import os
from os.path import join
# ------------------------------------------------------------------------
# document loading routine
# ------------------------------------------------------------------------
def _read_text(file_path):
    """Return the whole content of the UTF-8 encoded text file at file_path."""
    # the context manager guarantees that the handle is closed, also on errors
    with open(file_path, encoding="utf8") as handle:
        return handle.read()


def load_texts(path, positive_author='Dante'):
    """
    Load the training documents and the two Epistola XIII test documents from a folder.

    Every file whose name does not start with 'EpistolaXIII_' is a training document; its
    author is the part of the file name (without '.txt') that precedes the first underscore.
    :param path: folder containing the corpus files
    :param positive_author: author whose documents are returned as positive examples
    :return: a tuple (positive, negative, ep1_text, ep2_text) where positive and negative are
        lists of texts (strings), and ep1_text and ep2_text are the contents of
        'EpistolaXIII_1.txt' and 'EpistolaXIII_2.txt'
    :raises IndexError: if a training file name contains no underscore
    """
    # load the training data (all documents but Epistolas 1 and 2)
    positive, negative = [], []
    for filename in os.listdir(path):
        if filename.startswith('EpistolaXIII_'):
            continue
        author, separator, _ = filename.replace('.txt', '').partition('_')
        if not separator:
            # same exception type the former `split('_')[1]` lookup produced
            raise IndexError('unexpected file name (expected <author>_<title>.txt): {}'.format(filename))
        text = _read_text(join(path, filename))
        if author == positive_author:
            positive.append(text)
        else:
            negative.append(text)

    # load the test data (Epistolas 1 and 2)
    ep1_text = _read_text(join(path, 'EpistolaXIII_1.txt'))
    ep2_text = _read_text(join(path, 'EpistolaXIII_2.txt'))

    return positive, negative, ep1_text, ep2_text

View File

@ -1,56 +1,31 @@
import nltk
import re
import numpy as np
import os
from os.path import join
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import normalize
from scipy.sparse import hstack, csr_matrix, issparse
from collections import Counter
from nltk.corpus import stopwords
# Latin function words; the docstring of _features_function_words_freq refers to this list
# as the source of the function-word frequency features.
function_words = ['et', 'in', 'de', 'ad', 'ut', 'cum', 'non', 'per', 'a', 'que', 'ex','sed',
'quia', 'nam', 'sic', 'si', 'ab', 'etiam', 'idest', 'nec', 'vel', 'atque',
'scilicet', 'sicut', 'hec', 'vero', 'tamen', 'dum', 'propter', 'pro', 'enim',
'ita', 'autem', 'inter', 'unde', 'sub', 'tam', 'ibi', 'ideo', 'ergo', 'post',
'iam', 'seu', 'inde', 'tantum', 'sive', 'quomodo', 'ubi', 'ac', 'ob', 'igitur',
'tunc', 'nisi', 'quasi', 'quantum', 'aut', 'usque', 'bene', 'ne', 'ante',
'nunc', 'magis', 'sine', 'circa', 'apud', 'contra', 'adhuc', 'satis', 'semper',
'super', 'adeo', 'tandem', 'tanquam', 'quoniam', 'quin', 'quemadmodum', 'supra']
# NOTE(review): no use of nfolds is visible in this excerpt -- presumably the number of
# cross-validation folds; confirm before relying on it.
nfolds = 5
# ------------------------------------------------------------------------
# document loading routine
# ------------------------------------------------------------------------
def _load_texts(path):
# load the training data (all documents but Epistolas 1 and 2)
documents = []
authors = []
ndocs=0
for file in os.listdir(path):
if file.startswith('EpistolaXIII_'): continue
file_clean = file.replace('.txt','')
author, textname = file_clean.split('_')[0],file_clean.split('_')[1]
text = open(join(path,file), encoding= "utf8").read()
documents.append(text)
authors.append(author)
ndocs+=1
# load the test data (Epistolas 1 and 2)
ep1_text = open(join(path, 'EpistolaXIII_1.txt'), encoding="utf8").read()
ep2_text = open(join(path, 'EpistolaXIII_2.txt'), encoding="utf8").read()
return documents, authors, ep1_text, ep2_text
# Latin function words; this is the list get_function_words returns for lang == 'latin'.
latin_function_words = ['et', 'in', 'de', 'ad', 'ut', 'cum', 'non', 'per', 'a', 'que', 'ex','sed',
'quia', 'nam', 'sic', 'si', 'ab', 'etiam', 'idest', 'nec', 'vel', 'atque',
'scilicet', 'sicut', 'hec', 'vero', 'tamen', 'dum', 'propter', 'pro', 'enim',
'ita', 'autem', 'inter', 'unde', 'sub', 'tam', 'ibi', 'ideo', 'ergo', 'post',
'iam', 'seu', 'inde', 'tantum', 'sive', 'quomodo', 'ubi', 'ac', 'ob', 'igitur',
'tunc', 'nisi', 'quasi', 'quantum', 'aut', 'usque', 'bene', 'ne', 'ante',
'nunc', 'magis', 'sine', 'circa', 'apud', 'contra', 'adhuc', 'satis', 'semper',
'super', 'adeo', 'tandem', 'tanquam', 'quoniam', 'quin', 'quemadmodum', 'supra']
def get_function_words(lang):
    """
    Return the list of function words of a language.
    :param lang: 'latin' (the hand-curated list of this module), or 'english' / 'spanish'
        (the NLTK stopword list of that language)
    :return: a list of words
    :raises ValueError: if lang is none of the supported languages
    """
    if lang == 'latin':
        return latin_function_words
    if lang not in ('english', 'spanish'):
        raise ValueError('{} not in scope!'.format(lang))
    return stopwords.words(lang)
# ------------------------------------------------------------------------
# split policies
@ -78,8 +53,13 @@ def split_by_sentences(text):
def windows(text_fragments, window_size):
    """
    Group consecutive text fragments into non-overlapping windows.
    :param text_fragments: a list of strings
    :param window_size: number of consecutive fragments that are joined (separated by a blank)
        into each window
    :return: a list of strings; the last window holds fewer than window_size fragments when
        len(text_fragments) is not a multiple of window_size
    """
    # Stepping by window_size yields exactly ceil(len/window_size) batches, so the former
    # sliding-window loop (which produced overlapping fragments) is not needed any more.
    return [' '.join(text_fragments[offset:offset + window_size])
            for offset in range(0, len(text_fragments), window_size)]
def splitter(documents, authors=None, split_policy=split_by_sentences, window_size=1):
@ -100,14 +80,14 @@ def splitter(documents, authors=None, split_policy=split_by_sentences, window_si
# ------------------------------------------------------------------------
# feature extraction methods
# ------------------------------------------------------------------------
# TODO: implement other feature extraction methods
def _features_function_words_freq(documents):
def _features_function_words_freq(documents, lang):
"""
Extract features as the frequency (x1000) of the function words used in the documents
:param documents: a list where each element is the text (string) of a document
:return: a np.array of shape (D,F) where D is len(documents) and F is len(function_words)
"""
features = []
function_words = get_function_words(lang)
for text in documents:
unmod_tokens = nltk.word_tokenize(text)
@ -160,9 +140,9 @@ def _features_tfidf(documents, tfidf_vectorizer=None, min_df = 1):
return features, tfidf_vectorizer
def _features_ngrams(documents, ns=[4, 5], ngrams_vectorizer=None, min_df = 5):
    """
    Extract tf-idf weighted features from the character n-grams of the documents.
    :param documents: a list where each element is the text (string) of a document
    :param ns: the n-gram lengths, forwarded to ngrams_extractor
    :param ngrams_vectorizer: a vectorizer returned by a previous call (to represent new
        documents in the same feature space), or None; forwarded to _features_tfidf
    :param min_df: forwarded to _features_tfidf
    :return: whatever _features_tfidf returns (callers unpack it as features, vectorizer)
    """
    doc_ngrams = ngrams_extractor(documents, ns)
    return _features_tfidf(doc_ngrams, tfidf_vectorizer=ngrams_vectorizer, min_df = min_df)
def ngrams_extractor(documents, ns=[4, 5]):
@ -171,7 +151,7 @@ def ngrams_extractor(documents, ns=[4, 5]):
list_ngrams = []
for doc in documents:
doc = re.sub(r'[^\w\s]','', doc.strip())
# doc = re.sub(r'[^\w\s]','', doc.strip())
doc_ngrams = []
for ni in ns:
doc_ngrams.extend([doc[i:i + ni].replace(' ','_') for i in range(len(doc) - ni + 1)])
@ -181,23 +161,21 @@ def ngrams_extractor(documents, ns=[4, 5]):
return list_ngrams
def _feature_selection(X, y, tfidf_feat_selection_ratio):
    """
    Keep the fraction of features with the highest chi-square score.
    :param X: document-by-feature matrix
    :param y: labels of the documents in X
    :param tfidf_feat_selection_ratio: fraction of the columns of X to keep
    :return: a tuple (X, feature_selector) with the reduced matrix and the fitted SelectKBest,
        which is to be applied (via its transform method) to any document represented later
    """
    nF = X.shape[1]
    num_feats = int(tfidf_feat_selection_ratio * nF)
    feature_selector = SelectKBest(chi2, k=num_feats)
    X = feature_selector.fit_transform(X, y)
    return X, feature_selector
def _tocsr(X):
return X if issparse(X) else csr_matrix(X)
class DocumentLoader:
class FeatureExtractor:
def __init__(self,
function_words_freq=True,
function_words_freq=None,
features_Mendenhall=True,
tfidf=False,
tfidf_feat_selection_ratio=1.,
@ -240,87 +218,123 @@ class DocumentLoader:
self.verbose = verbose
def load_documents(self, path):
documents, authors, ep1_text, ep2_text = _load_texts(path)
ep1,ep2 = [ep1_text],[ep2_text]
n_original_docs=len(documents)
def fit(self, positives, negatives):
documents = positives + negatives
authors = [1]*len(positives) + [0]*len(negatives)
n_original_docs = len(documents)
if self.split_documents:
doc_fragments, authors_fragments = splitter(documents, authors, split_policy=self.split_policy, window_size=self.window_size)
doc_fragments, authors_fragments = splitter(documents, authors,
split_policy=self.split_policy,
window_size=self.window_size)
documents.extend(doc_fragments)
authors.extend(authors_fragments)
ep1.extend(splitter(ep1, split_policy=self.split_policy))
ep2.extend(splitter(ep2, split_policy=self.split_policy))
self._print('splitting documents: {} documents'.format(len(doc_fragments)))
# represent the target vector
y = np.array([(1 if author == "Dante" else 0) for author in authors])
y = np.array(authors)
# initialize the document-by-feature vector
X = np.empty((len(documents), 0))
EP1 = np.empty((len(ep1), 0))
EP2 = np.empty((len(ep2), 0))
# dense feature extraction functions
if self.function_words_freq:
X = self._addfeatures(X, _features_function_words_freq(documents))
EP1 = self._addfeatures(EP1, _features_function_words_freq(ep1))
EP2 = self._addfeatures(EP2, _features_function_words_freq(ep2))
X = self._addfeatures(X, _features_function_words_freq(documents, self.function_words_freq))
self._print('adding function words features: {} features'.format(X.shape[1]))
if self.features_Mendenhall:
X = self._addfeatures(X, _features_Mendenhall(documents))
EP1 = self._addfeatures(EP1, _features_Mendenhall(ep1))
EP2 = self._addfeatures(EP2, _features_Mendenhall(ep2))
self._print('adding Mendenhall words features: {} features'.format(X.shape[1]))
# sparse feature extraction functions
if self.tfidf:
X_features, vectorizer = _features_tfidf(documents)
ep1_features, _ = _features_tfidf(ep1, vectorizer)
ep2_features, _ = _features_tfidf(ep2, vectorizer)
self.tfidf_vectorizer = vectorizer
if self.tfidf_feat_selection_ratio < 1.:
if self.verbose: print('feature selection')
X_features, ep1_features, ep2_features = \
_feature_selection(X_features, y, ep1_features, ep2_features, self.tfidf_feat_selection_ratio)
X_features, feat_sel = _feature_selection(X_features, y, self.tfidf_feat_selection_ratio)
self.feat_sel_tfidf = feat_sel
X = self._addfeatures(_tocsr(X), X_features)
EP1 = self._addfeatures(_tocsr(EP1), ep1_features)
EP2 = self._addfeatures(_tocsr(EP2), ep2_features)
X = self._addfeatures(_tocsr(X), X_features)
self._print('adding tfidf words features: {} features'.format(X.shape[1]))
if self.ngrams:
X_features, vectorizer = _features_ngrams(documents, self.ns, min_df=5*self.window_size)
ep1_features, _ = _features_ngrams(ep1, self.ns, tfidf_vectorizer=vectorizer, min_df=5*self.window_size)
ep2_features, _ = _features_ngrams(ep2, self.ns, tfidf_vectorizer=vectorizer, min_df=5*self.window_size)
X_features, vectorizer = _features_ngrams(documents, self.ns, min_df=5 * self.window_size)
self.ngrams_vectorizer = vectorizer
if self.tfidf_feat_selection_ratio < 1.:
if self.verbose: print('feature selection')
X_features, ep1_features, ep2_features = \
_feature_selection(X_features, y, ep1_features, ep2_features, self.tfidf_feat_selection_ratio)
X_features, feat_sel = _feature_selection(X_features, y, self.tfidf_feat_selection_ratio)
self.feat_sel_ngrams = feat_sel
X = self._addfeatures(_tocsr(X), X_features)
EP1 = self._addfeatures(_tocsr(EP1), ep1_features)
EP2 = self._addfeatures(_tocsr(EP2), ep2_features)
X = self._addfeatures(_tocsr(X), X_features)
self._print('adding ngrams words features: {} features'.format(X.shape[1]))
# print summary
if self.verbose:
print('load_documents: function_words_freq={} features_Mendenhall={} tfidf={}, split_documents={}, split_policy={}'
.format(self.function_words_freq, self.features_Mendenhall, self.tfidf, self.split_documents,
self.split_policy.__name__))
print(
'load_documents: function_words_freq={} features_Mendenhall={} tfidf={}, split_documents={}, split_policy={}'
.format(self.function_words_freq, self.features_Mendenhall, self.tfidf, self.split_documents,
self.split_policy.__name__))
print('number of training (full) documents: {}'.format(n_original_docs))
print('X shape (#documents,#features): {}'.format(X.shape))
print('y prevalence: {:.2f}%'.format(y.mean()*100))
print('Epistola 1 shape:', EP1.shape)
print('Epistola 2 shape:', EP2.shape)
print('y prevalence: {:.2f}%'.format(y.mean() * 100))
print()
return X, y, EP1, EP2
return X, y
def transform(self, test):
test = [test]
if self.split_documents:
test.extend(splitter(test, split_policy=self.split_policy))
# initialize the document-by-feature vector
TEST = np.empty((len(test), 0))
# dense feature extraction functions
if self.function_words_freq:
TEST = self._addfeatures(TEST, _features_function_words_freq(test, self.function_words_freq))
self._print('adding function words features: {} features'.format(TEST.shape[1]))
if self.features_Mendenhall:
TEST = self._addfeatures(TEST, _features_Mendenhall(test))
self._print('adding Mendenhall words features: {} features'.format(TEST.shape[1]))
# sparse feature extraction functions
if self.tfidf:
ep1_features, _ = _features_tfidf(test, self.tfidf_vectorizer)
if self.tfidf_feat_selection_ratio < 1.:
if self.verbose: print('feature selection')
ep1_features = self.feat_sel_tfidf.transform(ep1_features)
TEST = self._addfeatures(_tocsr(TEST), ep1_features)
self._print('adding tfidf words features: {} features'.format(TEST.shape[1]))
if self.ngrams:
ep1_features, _ = _features_ngrams(test, self.ns, ngrams_vectorizer=self.ngrams_vectorizer, min_df=5 * self.window_size)
if self.tfidf_feat_selection_ratio < 1.:
if self.verbose: print('feature selection')
ep1_features = self.feat_sel_ngrams.transform(ep1_features)
TEST = self._addfeatures(_tocsr(TEST), ep1_features)
self._print('adding ngrams words features: {} features'.format(TEST.shape[1]))
# print summary
if self.verbose:
print(
'load_documents: function_words_freq={} features_Mendenhall={} tfidf={}, split_documents={}, split_policy={}'
.format(self.function_words_freq, self.features_Mendenhall, self.tfidf, self.split_documents,
self.split_policy.__name__))
print('Epistola 1 shape:', TEST.shape)
print()
return TEST
def _addfeatures(self, X, F):
# plt.matshow(F[:25])

51
src/data/pan2015.py Normal file
View File

@ -0,0 +1,51 @@
import itertools
import os
from os.path import join, isdir
# default base folder of the PAN 2015 corpus (default value of base_path in fetch_PAN2015)
PATH_PAN2015 = '../pan2015'
# names of the training and test dataset folders that fetch_PAN2015 joins to the base folder
PAN2015_TRAIN = 'pan15-authorship-verification-training-dataset-2015-04-19'
PAN2015_TEST = 'pan15-authorship-verification-test-dataset2-2015-04-19'
class Pan2015:
    """Plain container pairing the problems of a PAN 2015 corpus with their solutions."""

    def __init__(self, problem, solution):
        # problem: the verification problems; solution: their ground truth
        self.problem, self.solution = problem, solution
def _read_file(file_path):
    """Return the text content of file_path; the file is closed before returning."""
    with open(file_path, 'rt') as handle:
        return handle.read()


def fetch_PAN2015(corpus, lang, base_path = PATH_PAN2015):
    """
    Load the verification problems of one language from the PAN 2015 corpus.

    Every folder of the requested dataset whose name contains lang is read: its 'truth.txt'
    gives one "<problem> <decision>" pair per line (decision 'Y' means same author), and each
    of its sub-folders is one problem made of an 'unknown.txt' plus the known documents.
    :param corpus: either 'train' or 'test'
    :param lang: string that the name of a dataset folder has to contain to be loaded
    :param base_path: folder containing the training and test datasets
    :return: a Pan2015 whose problem attribute maps each problem name to a dict with keys
        'known' (list of texts) and 'unknown' (a text), and whose solution attribute maps
        each problem name to 1 (same author) or 0
    """
    assert corpus in ['train','test'],'unexpected corpus request'

    corpus_path = join(base_path, PAN2015_TRAIN if corpus=='train' else PAN2015_TEST)
    print(corpus_path)

    request = {}
    truth = {}
    for dir_name in os.listdir(corpus_path):
        dir_path = join(corpus_path, dir_name)
        if not (isdir(dir_path) and lang in dir_name):
            continue

        # accumulate (instead of overwriting) so that truth stays aligned with request
        # when more than one folder matches lang
        with open(join(dir_path, 'truth.txt'), 'rt') as truth_file:
            for line in truth_file:
                fields = line.split()
                if not fields:
                    continue  # tolerate blank lines
                problem, decision = fields
                truth[problem] = 1 if decision == 'Y' else 0

        for problem_name in os.listdir(dir_path):
            problem_dir = join(dir_path, problem_name)
            if not isdir(problem_dir):
                continue
            known = []
            request[problem_name] = {'known': known}
            for doc_name in os.listdir(problem_dir):
                text = _read_file(join(problem_dir, doc_name))
                if doc_name == 'unknown.txt':
                    request[problem_name]['unknown'] = text
                else:
                    known.append(text)

    return Pan2015(request, truth)
def TaskGenerator(request_dict):
    """
    Iterate over the verification problems, in alphabetical order of their names.

    The known documents of a problem are its positive examples; the known documents of all
    the other problems (also in alphabetical order) are its negative examples.
    :param request_dict: object with attributes problem (problem name -> dict with keys
        'known' and 'unknown') and solution (problem name -> ground truth)
    :return: generator of tuples (problem name, positives, negatives, unknown text, solution)
    """
    by_name = request_dict.problem
    names = sorted(by_name.keys())
    for target in names:
        positives = by_name[target]['known']
        negatives = [doc
                     for other in names if other != target
                     for doc in by_name[other]['known']]
        test = by_name[target]['unknown']
        yield target, positives, negatives, test, request_dict.solution[target]

View File

@ -1,69 +0,0 @@
import disable_sklearn_warnings
from sklearn.svm import *
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import f1_score, make_scorer
from verification import *
# TODO: other split policies
# TODO: understand normalization
# TODO: wrap into an Estimator
# TODO: check versions (numpy, scipy, sklearn)
# Estimator class used by the script; swap the two lines below to try LinearSVC instead.
SVM = SVC
# SVM = LinearSVC
# number of folds given to GridSearchCV, and minimum number of positive examples
# required to run the grid search at all
nfolds = 10
# hyper-parameter grid explored by GridSearchCV
params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced',None]}
# the kernel is added to the grid, and the probabilistic report enabled, only for SVC
if SVM is SVC:
params['kernel']=['linear','rbf']
probability = True
else:
probability = False
path = '../testi'
# build the training matrix and the representations of the two Epistolas in one pass
reader = DocumentLoader(function_words_freq=True, features_Mendenhall=True,
tfidf=True, tfidf_feat_selection_ratio=0.1,
ngrams=True, ns=[3,4,5],
split_documents=True, split_policy=split_by_sentences, normalize_features=True, window_size=1, verbose=True)
Xtr,ytr,ep1,ep2 = reader.load_documents(path)
# learn a SVM
# NOTE(review): the estimator is built without probability=True although the probabilistic
# report in predictEpistola is enabled for SVC -- confirm that predict_proba is available.
#svm = SVM(probability=probability)
svm = SVM()
# the grid search is only run when there are more positive examples than folds
positive_examples = ytr.sum()
if positive_examples>nfolds:
print('optimizing {}'.format(svm.__class__.__name__))
svm = GridSearchCV(svm, param_grid=params, cv=nfolds, scoring=make_scorer(f1_score), n_jobs=-1)
svm.fit(Xtr, ytr)
if isinstance(svm, GridSearchCV):
print('Best params: {}'.format(svm.best_params_))
# evaluation of results
# NOTE(review): best_score_ is read from svm here, which presumably only works when the grid
# search branch above was taken -- the original nesting of these lines is not recoverable
# from this view; confirm.
print('computing the cross-val score')
# f1scores = cross_val_score(svm, Xtr, ytr, cv=nfolds, n_jobs=-1, scoring=make_scorer(f1_score))
f1scores = svm.best_score_
f1_mean, f1_std = f1scores.mean(), f1scores.std()
print('F1-measure={:.3f} (+-{:.3f})\n'.format(f1_mean, f1_std))
# final test
def predictEpistola(ep, epistola_name):
    """
    Print the decision of the trained classifier (the module-level svm) for one Epistola.

    The first row of ep is reported as the full document and the remaining rows, if any,
    as its fragments.
    :param ep: document-by-feature matrix of the Epistola
    :param epistola_name: name used in the printed report
    """
    pred = svm.predict(ep)
    full_doc_prediction = pred[0]
    print('{} is from Dante: {}'.format(epistola_name, 'Yes' if full_doc_prediction == 1 else 'No'))
    # The former test `len(pred>0)` measured the length of a boolean array and thus was
    # always true; fragments only exist beyond the first (full document) row.
    if len(pred) > 1:
        fragment_predictions = pred[1:]
        print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
    if SVM is SVC and probability:
        prob = svm.predict_proba(ep)[:,1]
        np.set_printoptions(precision=2, linewidth=200)
        if len(prob) > 1:
            print('probabilistic view: full={:.3f}, fragments average {:.3f}, fragments={}'.format(prob[0], prob[1:].mean(), prob[1:]))
        else:
            # no fragments: averaging an empty slice would only yield nan
            print('probabilistic view: full={:.3f}'.format(prob[0]))
# final report: print the verdict for each of the two Epistolas
print('Predicting the Epistolas')
predictEpistola(ep1, 'Epistola 1')
predictEpistola(ep2, 'Epistola 2')

77
src/model.py Normal file
View File

@ -0,0 +1,77 @@
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from util import disable_sklearn_warnings
from sklearn.linear_model import LogisticRegression
from sklearn.svm import *
from data.features import *
class RandomVerificator:
    """Baseline verificator that ignores its training data and answers at random."""

    def __init__(self):
        """Nothing to configure."""

    def fit(self,positives,negatives):
        """Accept the training documents without using them."""
        return None

    def predict(self,test):
        """Return a random float drawn by np.random.rand(), whatever the test document."""
        random_score = np.random.rand()
        return random_score
class AuthorshipVerificator:
    """
    Binary classifier deciding whether a document was written by the positive author.

    The hyper-parameters are optimized by grid search (F1, cross-validation) whenever there
    are at least nfolds positive examples; otherwise the estimator is trained with its
    default hyper-parameters.
    """

    def __init__(self, nfolds=10,
                 params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced',None]},
                 estimator=SVC):
        """
        :param nfolds: number of cross-validation folds of the grid search
        :param params: hyper-parameter grid given to GridSearchCV
        :param estimator: one of the classes SVC, LinearSVC, or LogisticRegression
        :raises ValueError: if estimator is none of the supported classes
        """
        self.nfolds = nfolds
        # Work on a copy: the default dict is one object shared by every call, so adding
        # 'kernel' to it in place would leak into the grid of later instances.
        self.params = dict(params)
        if estimator is SVC:
            self.params['kernel'] = ['linear', 'rbf']
            self.probability = True
            self.svm = estimator(probability=self.probability)
        elif estimator is LinearSVC:
            self.probability = False
            self.svm = estimator()
        elif estimator is LogisticRegression:
            self.probability = True
            self.svm = LogisticRegression()
        else:
            # fail here rather than with an AttributeError on self.svm later in fit
            raise ValueError('unsupported estimator {}; expected SVC, LinearSVC or LogisticRegression'
                             .format(estimator))

    def fit(self,X,y):
        """
        Train the verificator.
        :param X: document-by-feature matrix
        :param y: binary labels (1 for the positive author), as a np.array or a sequence
        :return: self
        """
        if not isinstance(y,np.ndarray): y=np.array(y)
        positive_examples = y.sum()
        if positive_examples >= self.nfolds:
            print('optimizing {}'.format(self.svm.__class__.__name__))
            self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=self.nfolds, scoring=make_scorer(f1_score), n_jobs=-1)
        else:
            self.estimator = self.svm

        self.estimator.fit(X, y)

        # the cross-validation report only exists when the grid search was run
        if isinstance(self.estimator, GridSearchCV):
            print('Best params: {}'.format(self.estimator.best_params_))
            print('computing the cross-val score')
            f1_mean = self.estimator.best_score_
            # best_score_ is a single number (its std would always be 0); the deviation
            # across folds of the best configuration is kept in cv_results_
            f1_std = self.estimator.cv_results_['std_test_score'][self.estimator.best_index_]
            print('F1-measure={:.3f} (+-{:.3f})\n'.format(f1_mean, f1_std))

        return self

    def predict(self, test, epistola_name=''):
        """
        Predict (and print) the decision for a document and for its fragments.
        :param test: document-by-feature matrix; the first row is the full document and the
            remaining rows, if any, its fragments
        :param epistola_name: name used in the printed report
        :return: the prediction for the full document if test has a single row, otherwise a
            tuple (full document prediction, array of fragment predictions)
        """
        pred = self.estimator.predict(test)
        full_doc_prediction = pred[0]
        print('{} is from the same author: {}'.format(epistola_name, 'Yes' if full_doc_prediction == 1 else 'No'))
        if len(pred) > 1:
            fragment_predictions = pred[1:]
            print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
            return full_doc_prediction, fragment_predictions
        return full_doc_prediction

    def predict_proba(self, test, epistola_name=''):
        """
        Same as predict, but reporting the probability of the positive class (column 1 of the
        estimator's predict_proba output).
        :param test: document-by-feature matrix; the first row is the full document and the
            remaining rows, if any, its fragments
        :param epistola_name: name used in the printed report
        :return: the probability for the full document if test has a single row, otherwise a
            tuple (full document probability, array of fragment probabilities)
        :raises AssertionError: if the estimator was configured without probability support
        """
        assert self.probability, 'svm is not calibrated'
        pred = self.estimator.predict_proba(test)
        full_doc_prediction = pred[0,1]
        print('{} is from the same author: {}'.format(epistola_name, full_doc_prediction))
        if len(pred) > 1:
            fragment_predictions = pred[1:,1]
            print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
            return full_doc_prediction, fragment_predictions
        return full_doc_prediction

85
src/pan2015_eval.py Normal file
View File

@ -0,0 +1,85 @@
from joblib import Parallel
from joblib import delayed
from sklearn.linear_model import LogisticRegression
from util import disable_sklearn_warnings
from sklearn.svm import LinearSVC, SVC
from data.features import FeatureExtractor
from data.pan2015 import fetch_PAN2015, TaskGenerator
from model import AuthorshipVerificator
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score
def evaluation(y_pred, y_prob, y_true):
    """
    Print accuracy, F1, AUC, and their PAN-style combination, and return the latter.
    :param y_pred: sequence of binary decisions
    :param y_prob: sequence of scores used to compute the area under the ROC curve
    :param y_true: sequence of true binary labels
    :return: accuracy multiplied by AUC
    """
    predicted, scores, expected = (np.array(values) for values in (y_pred, y_prob, y_true))

    acc = (predicted == expected).mean()
    f1 = f1_score(expected, predicted)
    auc = roc_auc_score(expected, scores)
    pan_eval = acc * auc

    report = (
        ('Accuracy = {:.3f}', acc),
        ('F1 = {:.3f}', f1),
        ('AUC = {:.3f}', auc),
        ('Acc*AUC = {:.3f}', pan_eval),
    )
    for template, value in report:
        print(template.format(value))
    print('true:', y_true)
    print('pred:', y_pred)

    return pan_eval
def doall(problem,pos,neg,test,truth):
    """
    Solve one verification problem: extract features, train, and predict the unknown document.

    Reads the module-level variable lang to choose the function words.
    :param problem: name of the problem (used for logging, and returned untouched)
    :param pos: list of texts of the positive examples
    :param neg: list of texts of the negative examples
    :param test: text of the unknown document
    :param truth: ground truth of the problem (returned untouched)
    :return: a tuple (problem, probability, prediction, truth); probability equals the
        prediction when the verificator does not support probabilities
    """
    print('[Start]{}'.format(problem))

    extractor = FeatureExtractor(function_words_freq=lang,
                                 features_Mendenhall=True,
                                 tfidf=False, tfidf_feat_selection_ratio=0.1,
                                 ngrams=True, ns=[4, 5],
                                 split_documents=False,
                                 normalize_features=True,
                                 verbose=True)
    verificator = AuthorshipVerificator(nfolds=3, estimator=LogisticRegression)

    train_matrix, train_labels = extractor.fit(pos, neg)
    test_matrix = extractor.transform(test)
    verificator.fit(train_matrix, train_labels)

    prediction = verificator.predict(test_matrix)
    probability = verificator.predict_proba(test_matrix) if verificator.probability else prediction

    print('[End]{}'.format(problem))
    return problem, probability, prediction, truth
# print('{}-->{:.3f} decision={}'.format(problem, probability, prediction))
# print('pred={} truth={}'.format(prediction, truth))
#
# y_prob.append(probability)
# y_pred.append(prediction)
# y_true.append(truth)
#
# acc_auc = evaluation(y_pred, y_prob, y_true)
if __name__ == '__main__':
    # lang is also read by doall as a module-level variable
    split, lang = 'test', 'spanish'
    request = fetch_PAN2015(split, lang=lang)

    with open('results_ngrams.csv', 'wt') as fo:
        # one job per verification problem; each task is the 5-tuple doall expects
        jobs = (delayed(doall)(*task) for task in TaskGenerator(request))
        outcomes = Parallel(n_jobs=-1)(jobs)

        y_pred, y_prob, y_true = [], [], []
        for problem, probability, prediction, truth in outcomes:
            fo.write('{} {:.3f}\n'.format(problem, probability))
            y_pred += [prediction]
            y_prob += [probability]
            y_true += [truth]

        acc_auc = evaluation(y_pred, y_prob, y_true)
        print('ACC * AUC = {:.3f}'.format(acc_auc))

    print('done')