# dante-verification/src/pan2015_eval.py
#
# Listing metadata from the original page: 78 lines, 2.8 KiB, Python.

from joblib import Parallel
from joblib import delayed
from sklearn.linear_model import LogisticRegression
from util import disable_sklearn_warnings
from sklearn.svm import LinearSVC, SVC
from data.features import FeatureExtractor
from data.pan2015 import fetch_PAN2015, TaskGenerator
from model import AuthorshipVerificator
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score
def evaluation(y_pred, y_prob, y_true):
    """Print accuracy, F1, AUC and accuracy*AUC for a set of predictions.

    The three sequences are converted to numpy arrays before scoring.
    Accuracy is the mean of the element-wise equality between predictions
    and ground truth; F1 and AUC come from scikit-learn's ``f1_score`` and
    ``roc_auc_score``. The raw ``y_true`` and ``y_pred`` inputs are echoed
    after the metrics.

    Args:
        y_pred: predicted labels, one per problem.
        y_prob: scores used to rank problems for the AUC computation.
        y_true: ground-truth labels, one per problem.

    Returns:
        The product accuracy * AUC.
    """
    predicted = np.array(y_pred)
    scores = np.array(y_prob)
    expected = np.array(y_true)

    accuracy = (predicted == expected).mean()
    f1_value = f1_score(expected, predicted)
    auc_value = roc_auc_score(expected, scores)
    pan_eval = accuracy * auc_value

    # All metrics are computed before anything is printed, so a failing
    # scorer produces no partial report.
    report = (
        ('Accuracy = {:.3f}', accuracy),
        ('F1 = {:.3f}', f1_value),
        ('AUC = {:.3f}', auc_value),
        ('Acc*AUC = {:.3f}', pan_eval),
    )
    for template, value in report:
        print(template.format(value))

    print('true:', y_true)
    print('pred:', y_pred)
    return pan_eval
def doall(problem, pos, neg, test, truth, language=None):
    """Train a verifier for one problem and predict its test document(s).

    A fresh ``FeatureExtractor`` is fitted on the positive and negative
    documents, an ``AuthorshipVerificator`` wrapping ``LinearSVC`` is
    trained on the resulting features, and the transformed ``test`` input
    is classified.

    Args:
        problem: identifier of the problem; only printed and passed through.
        pos: positive training documents, forwarded to
            ``FeatureExtractor.fit_transform``.
        neg: negative training documents, forwarded to
            ``FeatureExtractor.fit_transform``.
        test: unseen document(s), forwarded to ``FeatureExtractor.transform``.
        truth: ground-truth label; passed through untouched.
        language: value given to the extractor's ``function_words_freq`` and
            ``conjugations_freq`` options. When ``None`` (the default), the
            module-level ``lang`` is used, which keeps existing five-argument
            callers working.

    Returns:
        The tuple ``(problem, probability, prediction, truth)``.

    Raises:
        NameError: if ``language`` is ``None`` and no module-level ``lang``
            has been defined (e.g. the module was imported rather than run).
    """
    print('[Start]{}'.format(problem))

    if language is None:
        # Read the global through a plain name reference (not globals()) so
        # that by-value function pickling, as joblib workers use, still
        # sees the dependency.
        language = lang

    feature_extractor = FeatureExtractor(
        function_words_freq=language,
        conjugations_freq=language,
        features_Mendenhall=True,
        wordngrams=False, tfidf_feat_selection_ratio=0.1,
        charngrams=True, n_charngrams=[3, 4, 5],
        split_documents=False,
        normalize_features=True,
        verbose=True,
    )
    # NOTE: LogisticRegression was the alternative estimator tried here.
    method = AuthorshipVerificator(nfolds=3, estimator=LinearSVC)

    X, y = feature_extractor.fit_transform(pos, neg)
    # Keep the raw `test` argument intact instead of rebinding it.
    test_features = feature_extractor.transform(test)
    method.fit(X, y)

    prediction = method.predict(test_features)
    if method.probability:
        probability = method.predict_proba(test_features)
    else:
        # No probabilistic output available: the hard prediction doubles as
        # the score, so downstream ranking metrics only see those values.
        probability = prediction

    print('[End]{}'.format(problem))
    return problem, probability, prediction, truth
if __name__ == '__main__':
    # Script entry point: solve every PAN2015 problem of the chosen split
    # and language in parallel, dump one "<problem> <score>" line per
    # problem to results_ngrams.csv, then print the aggregate metrics.
    split = 'train'
    # `lang` must stay a module-level name: doall() reads it when building
    # its feature extractor.
    lang = 'spanish'
    request = fetch_PAN2015(split, lang=lang)

    # Run the parallel job BEFORE opening the output file. Opening with 'wt'
    # truncates immediately, so a failure during the run would otherwise
    # wipe the previous results and leave an empty file behind.
    outcomes = Parallel(n_jobs=-1)(
        delayed(doall)(problem, pos, neg, test, truth)
        for problem, pos, neg, test, truth in TaskGenerator(request)
    )

    y_pred, y_prob, y_true = [], [], []
    with open('results_ngrams.csv', 'wt') as fo:
        for problem, probability, prediction, truth in outcomes:
            fo.write('{} {}\n'.format(problem, probability))
            y_pred.append(prediction)
            y_prob.append(probability)
            y_true.append(truth)

    acc_auc = evaluation(y_pred, y_prob, y_true)
    print('ACC * AUC = {:.3f}'.format(acc_auc))
    print('done')