78 lines
2.8 KiB
Python
78 lines
2.8 KiB
Python
from joblib import Parallel
|
|
from joblib import delayed
|
|
from sklearn.linear_model import LogisticRegression
|
|
from util import disable_sklearn_warnings
|
|
from sklearn.svm import LinearSVC, SVC
|
|
from data.features import FeatureExtractor
|
|
from data.pan2015 import fetch_PAN2015, TaskGenerator
|
|
from model import AuthorshipVerificator
|
|
import numpy as np
|
|
from sklearn.metrics import f1_score, roc_auc_score
|
|
|
|
def evaluation(y_pred, y_prob, y_true):
    """Print summary metrics for a batch of predictions and return Acc*AUC.

    Args:
        y_pred: sequence of predicted labels, one per problem.
        y_prob: sequence of scores handed to ``roc_auc_score``.
        y_true: sequence of ground-truth labels.

    Returns:
        The product of the accuracy and the ROC AUC.
    """
    # Convert once so the element-wise comparison below works on arrays.
    predicted, scores, expected = (
        np.array(values) for values in (y_pred, y_prob, y_true)
    )

    accuracy = (predicted == expected).mean()
    f_measure = f1_score(expected, predicted)
    roc_auc = roc_auc_score(expected, scores)
    acc_times_auc = accuracy * roc_auc

    report = (
        ('Accuracy = {:.3f}', accuracy),
        ('F1 = {:.3f}', f_measure),
        ('AUC = {:.3f}', roc_auc),
        ('Acc*AUC = {:.3f}', acc_times_auc),
    )
    for template, value in report:
        print(template.format(value))

    # The raw inputs (not the array copies) are echoed for inspection.
    print('true:', y_true)
    print('pred:', y_pred)

    return acc_times_auc
|
|
|
|
|
|
def doall(problem, pos, neg, test, truth):
    """Fit a verifier on one problem and predict its test input.

    Args:
        problem: identifier of the problem; echoed in the log lines and
            returned unchanged.
        pos: first argument to ``FeatureExtractor.fit_transform``
            (presumably the positive examples -- confirm against the
            task generator).
        neg: second argument to ``FeatureExtractor.fit_transform``
            (presumably the negative examples -- confirm).
        test: input handed to ``FeatureExtractor.transform``.
        truth: ground-truth value; returned unchanged.

    Returns:
        A ``(problem, probability, prediction, truth)`` tuple. When the
        verifier's ``probability`` attribute is falsy, ``probability`` is
        the same object as ``prediction``.
    """
    print('[Start]{}'.format(problem))

    # NOTE: `lang` is not a parameter; it is looked up in the module
    # globals at call time (it is assigned in the __main__ section).
    extractor = FeatureExtractor(
        function_words_freq=lang,
        conjugations_freq=lang,
        features_Mendenhall=True,
        wordngrams=False,
        tfidf_feat_selection_ratio=0.1,
        charngrams=True,
        n_charngrams=[3, 4, 5],
        split_documents=False,
        normalize_features=True,
        verbose=True,
    )

    # Alternative estimator kept for reference: estimator=LogisticRegression
    verifier = AuthorshipVerificator(nfolds=3, estimator=LinearSVC)

    X, y = extractor.fit_transform(pos, neg)
    test_features = extractor.transform(test)

    verifier.fit(X, y)
    prediction = verifier.predict(test_features)
    # Fall back to the hard prediction when no probabilities are available.
    probability = (
        verifier.predict_proba(test_features)
        if verifier.probability
        else prediction
    )

    print('[End]{}'.format(problem))
    return problem, probability, prediction, truth
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Experiment configuration. `lang` must keep this name: doall() reads
    # it from the module globals.
    split = 'train'
    lang = 'spanish'
    request = fetch_PAN2015(split, lang=lang)

    with open('results_ngrams.csv', 'wt') as fo:
        # One delayed doall() call per problem yielded by the task generator,
        # dispatched over all available cores.
        jobs = (
            delayed(doall)(problem, pos, neg, test, truth)
            for problem, pos, neg, test, truth in TaskGenerator(request)
        )
        outcomes = Parallel(n_jobs=-1)(jobs)

        # Persist one "<problem> <probability>" line per outcome.
        for name, score, _, _ in outcomes:
            fo.write('{} {}\n'.format(name, score))

        # Each outcome is (problem, probability, prediction, truth).
        y_pred = [outcome[2] for outcome in outcomes]
        y_prob = [outcome[1] for outcome in outcomes]
        y_true = [outcome[3] for outcome in outcomes]

        acc_auc = evaluation(y_pred, y_prob, y_true)
        print('ACC * AUC = {:.3f}'.format(acc_auc))

    print('done')