diff --git a/src/dante_eval.py b/src/dante_eval.py
new file mode 100644
index 0000000..0eec487
--- /dev/null
+++ b/src/dante_eval.py
@@ -0,0 +1,40 @@
+from sklearn.linear_model import LogisticRegression
+from data.dante_loader import load_texts
+from data.features import *
+from model import AuthorshipVerificator
+from sklearn.svm import LinearSVC, SVC
+
+# DONE: ngrams should contain punctuation marks according to Sapkota et al. [39] in the PAN 2015 overview
+# (More recently, it was shown that character
+# n-grams corresponding to word affixes and including punctuation marks are the most
+# significant features in cross-topic authorship attribution [57].)
+# TODO: split policies: understand overlapping in cross-validation
+
+
+
+path = '../testi'
+
+positive, negative, ep1_text, ep2_text = load_texts(path)
+
+feature_extractor = FeatureExtractor(function_words_freq='latin', features_Mendenhall=True,
+                                     tfidf=False, tfidf_feat_selection_ratio=0.1,
+                                     ngrams=True, ns=[3, 4, 5],
+                                     split_documents=True,
+                                     split_policy=split_by_sentences,
+                                     window_size=3,
+                                     normalize_features=True, verbose=True)
+
+Xtr, ytr = feature_extractor.fit(positive, negative)
+ep1 = feature_extractor.transform(ep1_text)
+ep2 = feature_extractor.transform(ep2_text)
+
+print('Fitting the Verificator')
+av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
+av.fit(Xtr, ytr)
+
+print('Predicting the Epistolas')
+av.predict(ep1, 'Epistola 1')
+av.predict_proba(ep1, 'Epistola 1')
+
+av.predict(ep2, 'Epistola 2')
+av.predict_proba(ep2, 'Epistola 2')
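The driver only touches the classifier in one place, so swapping estimators is a one-line change. A minimal sketch of the SVM variant, assuming the AuthorshipVerificator interface defined in src/model.py further down (variable names reuse those of dante_eval.py):

# hypothetical variant of the driver above: a grid-searched SVC instead of
# LogisticRegression (AuthorshipVerificator constructs SVC with probability=True,
# so predict_proba stays available)
from sklearn.svm import SVC

av_svm = AuthorshipVerificator(nfolds=10, estimator=SVC)
av_svm.fit(Xtr, ytr)                     # grid-searches C, kernel and class_weight
av_svm.predict_proba(ep1, 'Epistola 1')  # P(positive author) for full text and fragments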
diff --git a/src/data/dante_loader.py b/src/data/dante_loader.py
new file mode 100644
index 0000000..26e1432
--- /dev/null
+++ b/src/data/dante_loader.py
@@ -0,0 +1,29 @@
+import os
+from os.path import join
+
+# ------------------------------------------------------------------------
+# document loading routine
+# ------------------------------------------------------------------------
+def load_texts(path, positive_author='Dante'):
+    # load the training data (all documents but Epistolas 1 and 2)
+    positive, negative = [], []
+    for file in os.listdir(path):
+        if file.startswith('EpistolaXIII_'): continue
+        author = file.replace('.txt', '').split('_')[0]
+        text = open(join(path, file), encoding="utf8").read()
+
+        if author == positive_author:
+            positive.append(text)
+        else:
+            negative.append(text)
+
+    # load the test data (Epistolas 1 and 2)
+    ep1_text = open(join(path, 'EpistolaXIII_1.txt'), encoding="utf8").read()
+    ep2_text = open(join(path, 'EpistolaXIII_2.txt'), encoding="utf8").read()
+
+    return positive, negative, ep1_text, ep2_text
\ No newline at end of file
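load_texts derives the label from the Author_Title.txt filename convention and holds the two disputed epistles out of training. A sketch of the directory layout it expects (the titles here are hypothetical placeholders):

# hypothetical content of '../testi':
#   Dante_Monarchia.txt       -> positive (prefix before '_' equals positive_author)
#   Boccaccio_Decameron.txt   -> negative (any other author prefix)
#   EpistolaXIII_1.txt        -> skipped during training, returned as ep1_text
#   EpistolaXIII_2.txt        -> skipped during training, returned as ep2_text
positive, negative, ep1_text, ep2_text = load_texts('../testi', positive_author='Dante')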
diff --git a/src/verification.py b/src/data/features.py
similarity index 64%
rename from src/verification.py
rename to src/data/features.py
index 99186e5..06ac20e 100644
--- a/src/verification.py
+++ b/src/data/features.py
@@ -1,56 +1,31 @@
 import nltk
-import re
 import numpy as np
-import os
-from os.path import join
-from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.feature_selection import SelectKBest
 from sklearn.feature_selection import chi2
-from sklearn.metrics import f1_score
-from sklearn.metrics import make_scorer
-from sklearn.model_selection import GridSearchCV
-from sklearn.model_selection import cross_val_score
 from sklearn.preprocessing import normalize
 from scipy.sparse import hstack, csr_matrix, issparse
 from collections import Counter
+from nltk.corpus import stopwords
 
-function_words = ['et', 'in', 'de', 'ad', 'ut', 'cum', 'non', 'per', 'a', 'que', 'ex', 'sed',
-                  'quia', 'nam', 'sic', 'si', 'ab', 'etiam', 'idest', 'nec', 'vel', 'atque',
-                  'scilicet', 'sicut', 'hec', 'vero', 'tamen', 'dum', 'propter', 'pro', 'enim',
-                  'ita', 'autem', 'inter', 'unde', 'sub', 'tam', 'ibi', 'ideo', 'ergo', 'post',
-                  'iam', 'seu', 'inde', 'tantum', 'sive', 'quomodo', 'ubi', 'ac', 'ob', 'igitur',
-                  'tunc', 'nisi', 'quasi', 'quantum', 'aut', 'usque', 'bene', 'ne', 'ante',
-                  'nunc', 'magis', 'sine', 'circa', 'apud', 'contra', 'adhuc', 'satis', 'semper',
-                  'super', 'adeo', 'tandem', 'tanquam', 'quoniam', 'quin', 'quemadmodum', 'supra']
-nfolds = 5
-
-# ------------------------------------------------------------------------
-# document loading routine
-# ------------------------------------------------------------------------
-def _load_texts(path):
-    # load the training data (all documents but Epistolas 1 and 2)
-    documents = []
-    authors = []
-    ndocs=0
-    for file in os.listdir(path):
-        if file.startswith('EpistolaXIII_'): continue
-        file_clean = file.replace('.txt','')
-        author, textname = file_clean.split('_')[0],file_clean.split('_')[1]
-        text = open(join(path,file), encoding= "utf8").read()
-
-        documents.append(text)
-        authors.append(author)
-        ndocs+=1
-
-    # load the test data (Epistolas 1 and 2)
-    ep1_text = open(join(path, 'EpistolaXIII_1.txt'), encoding="utf8").read()
-    ep2_text = open(join(path, 'EpistolaXIII_2.txt'), encoding="utf8").read()
-
-    return documents, authors, ep1_text, ep2_text
+latin_function_words = ['et', 'in', 'de', 'ad', 'ut', 'cum', 'non', 'per', 'a', 'que', 'ex', 'sed',
+                        'quia', 'nam', 'sic', 'si', 'ab', 'etiam', 'idest', 'nec', 'vel', 'atque',
+                        'scilicet', 'sicut', 'hec', 'vero', 'tamen', 'dum', 'propter', 'pro', 'enim',
+                        'ita', 'autem', 'inter', 'unde', 'sub', 'tam', 'ibi', 'ideo', 'ergo', 'post',
+                        'iam', 'seu', 'inde', 'tantum', 'sive', 'quomodo', 'ubi', 'ac', 'ob', 'igitur',
+                        'tunc', 'nisi', 'quasi', 'quantum', 'aut', 'usque', 'bene', 'ne', 'ante',
+                        'nunc', 'magis', 'sine', 'circa', 'apud', 'contra', 'adhuc', 'satis', 'semper',
+                        'super', 'adeo', 'tandem', 'tanquam', 'quoniam', 'quin', 'quemadmodum', 'supra']
+
+
+def get_function_words(lang):
+    if lang == 'latin':
+        return latin_function_words
+    elif lang in ['english', 'spanish']:
+        return stopwords.words(lang)
+    else:
+        raise ValueError('{} not in scope!'.format(lang))
 
 # ------------------------------------------------------------------------
 # split policies
@@ -78,8 +53,13 @@ def split_by_sentences(text):
 
 def windows(text_fragments, window_size):
     new_fragments = []
-    for i in range(len(text_fragments)-window_size+1):
-        new_fragments.append(' '.join(text_fragments[i:i+window_size]))
+    # tile the fragments into consecutive, non-overlapping batches (the last one may be shorter)
+    nbatches = len(text_fragments) // window_size
+    if len(text_fragments) % window_size > 0:
+        nbatches += 1
+    for i in range(nbatches):
+        offset = i * window_size
+        new_fragments.append(' '.join(text_fragments[offset:offset + window_size]))
    return new_fragments
 
 def splitter(documents, authors=None, split_policy=split_by_sentences, window_size=1):
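The rewritten windows policy therefore tiles the fragments into consecutive batches instead of sliding a one-step window, which avoids the fragment overlap flagged in the TODO note of dante_eval.py. A quick check of the batching arithmetic:

# 7 fragments with window_size=3: nbatches = 7 // 3 (+1 for the 7 % 3 = 1 remainder) = 3
fragments = ['s1', 's2', 's3', 's4', 's5', 's6', 's7']
print(windows(fragments, window_size=3))
# ['s1 s2 s3', 's4 s5 s6', 's7']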
@@ -100,14 +80,14 @@
 # ------------------------------------------------------------------------
 # feature extraction methods
 # ------------------------------------------------------------------------
-# TODO: implement other feature extraction methods
-def _features_function_words_freq(documents):
+def _features_function_words_freq(documents, lang):
     """
     Extract features as the frequency (x1000) of the function words used in the documents
     :param documents: a list where each element is the text (string) of a document
     :return: a np.array of shape (D,F) where D is len(documents) and F is len(function_words)
     """
     features = []
+    function_words = get_function_words(lang)
     for text in documents:
         unmod_tokens = nltk.word_tokenize(text)
@@ -160,9 +140,9 @@
     return features, tfidf_vectorizer
 
-def _features_ngrams(documents, ns=[4, 5], tfidf_vectorizer=None, min_df = 5):
+def _features_ngrams(documents, ns=[4, 5], ngrams_vectorizer=None, min_df=5):
     doc_ngrams = ngrams_extractor(documents, ns)
-    return _features_tfidf(doc_ngrams, tfidf_vectorizer=tfidf_vectorizer, min_df = min_df)
+    return _features_tfidf(doc_ngrams, tfidf_vectorizer=ngrams_vectorizer, min_df=min_df)
 
 
 def ngrams_extractor(documents, ns=[4, 5]):
@@ -171,7 +151,7 @@
 
     list_ngrams = []
     for doc in documents:
-        doc = re.sub(r'[^\w\s]','', doc.strip())
+        # punctuation is deliberately kept in the character n-grams (see the DONE note in dante_eval.py)
         doc_ngrams = []
         for ni in ns:
             doc_ngrams.extend([doc[i:i + ni].replace(' ', '_') for i in range(len(doc) - ni + 1)])
@@ -181,23 +161,21 @@
 
-def _feature_selection(X, y, EP1, EP2, tfidf_feat_selection_ratio):
+def _feature_selection(X, y, tfidf_feat_selection_ratio):
     nF = X.shape[1]
     num_feats = int(tfidf_feat_selection_ratio * nF)
     feature_selector = SelectKBest(chi2, k=num_feats)
     X = feature_selector.fit_transform(X, y)
-    EP1 = feature_selector.transform(EP1)
-    EP2 = feature_selector.transform(EP2)
-    return X,EP1,EP2
-
+    return X, feature_selector
 
 def _tocsr(X):
     return X if issparse(X) else csr_matrix(X)
 
-class DocumentLoader:
+
+class FeatureExtractor:
     def __init__(self,
-                 function_words_freq=True,
+                 function_words_freq=None,
                  features_Mendenhall=True,
                  tfidf=False,
                  tfidf_feat_selection_ratio=1.,
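_feature_selection now returns the fitted selector instead of eagerly transforming the two Epistolas, so the chi-squared mask learned on training data can be reapplied to any later document. A minimal sketch of the intended reuse (variable names are illustrative):

# fit the chi-squared mask on training features only, then reuse it on test features
X_reduced, selector = _feature_selection(X_features, y, tfidf_feat_selection_ratio=0.1)
test_reduced = selector.transform(test_features)  # same retained columns as X_reduced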
@@ -240,87 +218,123 @@ class DocumentLoader:
         self.verbose = verbose
 
-    def load_documents(self, path):
-        documents, authors, ep1_text, ep2_text = _load_texts(path)
-        ep1,ep2 = [ep1_text],[ep2_text]
-        n_original_docs=len(documents)
+    def fit(self, positives, negatives):
+        documents = positives + negatives
+        authors = [1]*len(positives) + [0]*len(negatives)
+        n_original_docs = len(documents)
 
         if self.split_documents:
-            doc_fragments, authors_fragments = splitter(documents, authors, split_policy=self.split_policy, window_size=self.window_size)
+            doc_fragments, authors_fragments = splitter(documents, authors,
+                                                        split_policy=self.split_policy,
+                                                        window_size=self.window_size)
             documents.extend(doc_fragments)
             authors.extend(authors_fragments)
-
-            ep1.extend(splitter(ep1, split_policy=self.split_policy))
-            ep2.extend(splitter(ep2, split_policy=self.split_policy))
             self._print('splitting documents: {} documents'.format(len(doc_fragments)))
 
         # represent the target vector
-        y = np.array([(1 if author == "Dante" else 0) for author in authors])
+        y = np.array(authors)
 
         # initialize the document-by-feature vector
         X = np.empty((len(documents), 0))
-        EP1 = np.empty((len(ep1), 0))
-        EP2 = np.empty((len(ep2), 0))
 
         # dense feature extraction functions
         if self.function_words_freq:
-            X = self._addfeatures(X, _features_function_words_freq(documents))
-            EP1 = self._addfeatures(EP1, _features_function_words_freq(ep1))
-            EP2 = self._addfeatures(EP2, _features_function_words_freq(ep2))
+            X = self._addfeatures(X, _features_function_words_freq(documents, self.function_words_freq))
            self._print('adding function words features: {} features'.format(X.shape[1]))
 
         if self.features_Mendenhall:
             X = self._addfeatures(X, _features_Mendenhall(documents))
-            EP1 = self._addfeatures(EP1, _features_Mendenhall(ep1))
-            EP2 = self._addfeatures(EP2, _features_Mendenhall(ep2))
             self._print('adding Mendenhall words features: {} features'.format(X.shape[1]))
 
-
         # sparse feature extraction functions
         if self.tfidf:
             X_features, vectorizer = _features_tfidf(documents)
-            ep1_features, _ = _features_tfidf(ep1, vectorizer)
-            ep2_features, _ = _features_tfidf(ep2, vectorizer)
+            self.tfidf_vectorizer = vectorizer
 
             if self.tfidf_feat_selection_ratio < 1.:
                 if self.verbose: print('feature selection')
-                X_features, ep1_features, ep2_features = \
-                    _feature_selection(X_features, y, ep1_features, ep2_features, self.tfidf_feat_selection_ratio)
+                X_features, feat_sel = _feature_selection(X_features, y, self.tfidf_feat_selection_ratio)
+                self.feat_sel_tfidf = feat_sel
 
-            X = self._addfeatures(_tocsr(X), X_features)
-            EP1 = self._addfeatures(_tocsr(EP1), ep1_features)
-            EP2 = self._addfeatures(_tocsr(EP2), ep2_features)
+            X = self._addfeatures(_tocsr(X), X_features)
 
             self._print('adding tfidf words features: {} features'.format(X.shape[1]))
 
         if self.ngrams:
-            X_features, vectorizer = _features_ngrams(documents, self.ns, min_df=5*self.window_size)
-            ep1_features, _ = _features_ngrams(ep1, self.ns, tfidf_vectorizer=vectorizer, min_df=5*self.window_size)
-            ep2_features, _ = _features_ngrams(ep2, self.ns, tfidf_vectorizer=vectorizer, min_df=5*self.window_size)
+            X_features, vectorizer = _features_ngrams(documents, self.ns, min_df=5 * self.window_size)
+            self.ngrams_vectorizer = vectorizer
 
             if self.tfidf_feat_selection_ratio < 1.:
                 if self.verbose: print('feature selection')
-                X_features, ep1_features, ep2_features = \
-                    _feature_selection(X_features, y, ep1_features, ep2_features, self.tfidf_feat_selection_ratio)
+                X_features, feat_sel = _feature_selection(X_features, y, self.tfidf_feat_selection_ratio)
+                self.feat_sel_ngrams = feat_sel
 
-            X = self._addfeatures(_tocsr(X), X_features)
-            EP1 = self._addfeatures(_tocsr(EP1), ep1_features)
-            EP2 = self._addfeatures(_tocsr(EP2), ep2_features)
+            X = self._addfeatures(_tocsr(X), X_features)
 
             self._print('adding ngrams words features: {} features'.format(X.shape[1]))
 
-
         # print summary
         if self.verbose:
-            print('load_documents: function_words_freq={} features_Mendenhall={} tfidf={}, split_documents={}, split_policy={}'
-                  .format(self.function_words_freq, self.features_Mendenhall, self.tfidf, self.split_documents,
-                          self.split_policy.__name__))
+            print('fit: function_words_freq={} features_Mendenhall={} tfidf={}, split_documents={}, split_policy={}'
+                  .format(self.function_words_freq, self.features_Mendenhall, self.tfidf, self.split_documents,
+                          self.split_policy.__name__))
             print('number of training (full) documents: {}'.format(n_original_docs))
             print('X shape (#documents,#features): {}'.format(X.shape))
-            print('y prevalence: {:.2f}%'.format(y.mean()*100))
-            print('Epistola 1 shape:', EP1.shape)
-            print('Epistola 2 shape:', EP2.shape)
+            print('y prevalence: {:.2f}%'.format(y.mean() * 100))
             print()
 
-        return X, y, EP1, EP2
+        return X, y
+
+    def transform(self, test):
+        test = [test]
+
+        if self.split_documents:
+            test.extend(splitter(test, split_policy=self.split_policy))
+
+        # initialize the document-by-feature vector
+        TEST = np.empty((len(test), 0))
+
+        # dense feature extraction functions
+        if self.function_words_freq:
+            TEST = self._addfeatures(TEST, _features_function_words_freq(test, self.function_words_freq))
+            self._print('adding function words features: {} features'.format(TEST.shape[1]))
+
+        if self.features_Mendenhall:
+            TEST = self._addfeatures(TEST, _features_Mendenhall(test))
+            self._print('adding Mendenhall words features: {} features'.format(TEST.shape[1]))
+
+        # sparse feature extraction functions (reusing the vectorizers and selectors fitted in fit)
+        if self.tfidf:
+            test_features, _ = _features_tfidf(test, self.tfidf_vectorizer)
+
+            if self.tfidf_feat_selection_ratio < 1.:
+                if self.verbose: print('feature selection')
+                test_features = self.feat_sel_tfidf.transform(test_features)
+
+            TEST = self._addfeatures(_tocsr(TEST), test_features)
+            self._print('adding tfidf words features: {} features'.format(TEST.shape[1]))
+
+        if self.ngrams:
+            test_features, _ = _features_ngrams(test, self.ns, ngrams_vectorizer=self.ngrams_vectorizer, min_df=5 * self.window_size)
+
+            if self.tfidf_feat_selection_ratio < 1.:
+                if self.verbose: print('feature selection')
+                test_features = self.feat_sel_ngrams.transform(test_features)
+
+            TEST = self._addfeatures(_tocsr(TEST), test_features)
+            self._print('adding ngrams words features: {} features'.format(TEST.shape[1]))
+
+        # print summary
+        if self.verbose:
+            print('transform: function_words_freq={} features_Mendenhall={} tfidf={}, split_documents={}, split_policy={}'
+                  .format(self.function_words_freq, self.features_Mendenhall, self.tfidf, self.split_documents,
+                          self.split_policy.__name__))
+            print('test shape:', TEST.shape)
+            print()
+
+        return TEST
+
 
     def _addfeatures(self, X, F):
         # plt.matshow(F[:25])
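With this split, FeatureExtractor follows the standard fit/transform contract: fit learns the vocabularies and feature selectors on the labelled corpus and stores them on the instance, and transform reapplies them to one unseen text. A minimal sketch of the contract (corpus variables as in dante_eval.py):

fe = FeatureExtractor(function_words_freq='latin', ngrams=True, ns=[3, 4, 5],
                      split_documents=False)
X, y = fe.fit(positive, negative)  # X: (n_docs, n_features); y: 1 = positive author
T = fe.transform(ep1_text)         # T: (1, n_features), same columns as X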
diff --git a/src/data/pan2015.py b/src/data/pan2015.py
new file mode 100644
index 0000000..5e18d47
--- /dev/null
+++ b/src/data/pan2015.py
@@ -0,0 +1,51 @@
+import itertools
+import os
+from os.path import join, isdir
+
+PATH_PAN2015 = '../pan2015'
+PAN2015_TRAIN = 'pan15-authorship-verification-training-dataset-2015-04-19'
+PAN2015_TEST = 'pan15-authorship-verification-test-dataset2-2015-04-19'
+
+class Pan2015:
+    def __init__(self, problem, solution):
+        self.problem = problem
+        self.solution = solution
+
+def fetch_PAN2015(corpus, lang, base_path=PATH_PAN2015):
+    assert corpus in ['train', 'test'], 'unexpected corpus request'
+
+    corpus_path = join(base_path, PAN2015_TRAIN if corpus == 'train' else PAN2015_TEST)
+
+    print(corpus_path)
+    request = {}
+    truth = {}
+    for dirname in os.listdir(corpus_path):
+        dir_path = join(corpus_path, dirname)
+        if isdir(dir_path) and lang in dirname:
+            # truth.txt lists one '<problem> Y/N' decision per line
+            truth = [x.split() for x in open(join(dir_path, 'truth.txt'), 'rt').readlines()]
+            truth = {problem: 1 if decision == 'Y' else 0 for problem, decision in truth}
+            for problem_name in os.listdir(dir_path):
+                problem_dir = join(dir_path, problem_name)
+                if isdir(problem_dir):
+                    request[problem_name] = {}
+                    request[problem_name]['known'] = []
+                    for doc_name in os.listdir(problem_dir):
+                        doc_path = join(problem_dir, doc_name)
+                        if 'unknown.txt' == doc_name:
+                            request[problem_name]['unknown'] = open(doc_path, 'rt').read()
+                        else:
+                            request[problem_name]['known'].append(open(doc_path, 'rt').read())
+
+    return Pan2015(request, truth)
+
+def TaskGenerator(request_dict):
+    pan_problems = request_dict.problem
+    problems = sorted(pan_problems.keys())
+    for i, problem_i in enumerate(problems):
+        positives = pan_problems[problem_i]['known']
+        negatives = list(itertools.chain.from_iterable(
+            [pan_problems[problem_j]['known'] for j, problem_j in enumerate(problems) if i != j]))
+        test = pan_problems[problem_i]['unknown']
+        yield problem_i, positives, negatives, test, request_dict.solution[problem_i]
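TaskGenerator recasts each PAN problem as one-vs-rest verification: the known texts of the current problem are the positives, and the known texts of all other problems in the same language partition serve as negatives. Schematically, for a hypothetical three-problem partition:

# problems {'SP001', 'SP002', 'SP003'}; the first yielded task is:
#   problem   = 'SP001'
#   positives = known texts of SP001
#   negatives = known texts of SP002 + SP003
#   test      = SP001/unknown.txt
#   truth     = 1 if truth.txt marks SP001 with 'Y', else 0
for problem, positives, negatives, test, truth in TaskGenerator(fetch_PAN2015('train', lang='spanish')):
    print(problem, len(positives), len(negatives), truth)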
diff --git a/src/main.py b/src/main.py
deleted file mode 100644
index ed9c514..0000000
--- a/src/main.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import disable_sklearn_warnings
-from sklearn.svm import *
-from sklearn.model_selection import cross_val_score, GridSearchCV
-from sklearn.metrics import f1_score, make_scorer
-from verification import *
-
-# TODO: other split policies
-# TODO: understand normalization
-# TODO: wrap into an Estimator
-# TODO: check versions (numpy, scipy, sklearn)
-
-
-SVM = SVC
-# SVM = LinearSVC
-
-nfolds = 10
-params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced',None]}
-if SVM is SVC:
-    params['kernel']=['linear','rbf']
-    probability = True
-else:
-    probability = False
-
-path = '../testi'
-
-reader = DocumentLoader(function_words_freq=True, features_Mendenhall=True,
-                        tfidf=True, tfidf_feat_selection_ratio=0.1,
-                        ngrams=True, ns=[3,4,5],
-                        split_documents=True, split_policy=split_by_sentences, normalize_features=True, window_size=1, verbose=True)
-
-Xtr,ytr,ep1,ep2 = reader.load_documents(path)
-
-# learn a SVM
-#svm = SVM(probability=probability)
-svm = SVM()
-
-positive_examples = ytr.sum()
-if positive_examples>nfolds:
-    print('optimizing {}'.format(svm.__class__.__name__))
-    svm = GridSearchCV(svm, param_grid=params, cv=nfolds, scoring=make_scorer(f1_score), n_jobs=-1)
-
-svm.fit(Xtr, ytr)
-
-if isinstance(svm, GridSearchCV):
-    print('Best params: {}'.format(svm.best_params_))
-
-# evaluation of results
-print('computing the cross-val score')
-# f1scores = cross_val_score(svm, Xtr, ytr, cv=nfolds, n_jobs=-1, scoring=make_scorer(f1_score))
-f1scores = svm.best_score_
-f1_mean, f1_std = f1scores.mean(), f1scores.std()
-print('F1-measure={:.3f} (+-{:.3f})\n'.format(f1_mean, f1_std))
-
-# final test
-def predictEpistola(ep, epistola_name):
-    pred = svm.predict(ep)
-    full_doc_prediction = pred[0]
-    print('{} is from Dante: {}'.format(epistola_name, 'Yes' if full_doc_prediction == 1 else 'No'))
-    if len(pred>0):
-        fragment_predictions= pred[1:]
-        print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
-    if SVM is SVC and probability:
-        prob = svm.predict_proba(ep)[:,1]
-        np.set_printoptions(precision=2, linewidth=200)
-        print('probabilistic view: full={:.3f}, fragments average {:.3f}, fragments={}'.format(prob[0], prob[1:].mean(), prob[1:]))
-
-print('Predicting the Epistolas')
-predictEpistola(ep1, 'Epistola 1')
-predictEpistola(ep2, 'Epistola 2')
diff --git a/src/model.py b/src/model.py
new file mode 100644
index 0000000..a0d6ddb
--- /dev/null
+++ b/src/model.py
@@ -0,0 +1,77 @@
+from sklearn.metrics import f1_score
+from sklearn.metrics import make_scorer
+from sklearn.model_selection import GridSearchCV
+
+from util import disable_sklearn_warnings
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import *
+from data.features import *
+
+class RandomVerificator:
+    def __init__(self): pass
+    def fit(self, positives, negatives):
+        pass
+    def predict(self, test):
+        return np.random.rand()
+
+class AuthorshipVerificator:
+
+    def __init__(self, nfolds=10,
+                 params={'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight': ['balanced', None]},
+                 estimator=SVC):
+        self.nfolds = nfolds
+        self.params = dict(params)  # copy: mutating the shared default dict would leak across instances
+        if estimator is SVC:
+            self.params['kernel'] = ['linear', 'rbf']
+            self.probability = True
+            self.svm = estimator(probability=self.probability)
+        elif estimator is LinearSVC:
+            self.probability = False
+            self.svm = estimator()
+        elif estimator is LogisticRegression:
+            self.probability = True
+            self.svm = LogisticRegression()
+
+    def fit(self, X, y):
+        if not isinstance(y, np.ndarray): y = np.array(y)
+        positive_examples = y.sum()
+        if positive_examples >= self.nfolds:
+            print('optimizing {}'.format(self.svm.__class__.__name__))
+            self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=self.nfolds,
+                                          scoring=make_scorer(f1_score), n_jobs=-1)
+        else:
+            self.estimator = self.svm
+
+        self.estimator.fit(X, y)
+
+        if isinstance(self.estimator, GridSearchCV):
+            print('Best params: {}'.format(self.estimator.best_params_))
+            # best_score_ is already the mean cross-validated F1 of the best configuration
+            f1_mean = self.estimator.best_score_
+            print('F1-measure={:.3f} (cross-validated)\n'.format(f1_mean))
+
+        return self
+
+    def predict(self, test, epistola_name=''):
+        pred = self.estimator.predict(test)
+        full_doc_prediction = pred[0]
+        print('{} is from the same author: {}'.format(epistola_name, 'Yes' if full_doc_prediction == 1 else 'No'))
+        if len(pred) > 1:
+            fragment_predictions = pred[1:]
+            print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
+            return full_doc_prediction, fragment_predictions
+        return full_doc_prediction
+
+    def predict_proba(self, test, epistola_name=''):
+        assert self.probability, 'the classifier is not calibrated for probability estimates'
+        pred = self.estimator.predict_proba(test)
+        full_doc_prediction = pred[0, 1]
+        print('probability that {} is from the same author: {:.3f}'.format(epistola_name, full_doc_prediction))
+        if len(pred) > 1:
+            fragment_predictions = pred[1:, 1]
+            print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
+            return full_doc_prediction, fragment_predictions
+        return full_doc_prediction
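A minimal end-to-end sketch of the verificator API above, on toy data (shapes and values are purely illustrative):

import numpy as np
from sklearn.linear_model import LogisticRegression
from model import AuthorshipVerificator

X = np.random.rand(20, 5)         # 20 toy documents, 5 features each
y = np.array([1] * 8 + [0] * 12)  # 8 positives >= nfolds, so the grid search kicks in

av = AuthorshipVerificator(nfolds=3, estimator=LogisticRegression)
av.fit(X, y)
label = av.predict(X[:1], 'toy document')       # hard decision for a single document
prob = av.predict_proba(X[:1], 'toy document')  # probability of the positive class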
diff --git a/src/pan2015_eval.py b/src/pan2015_eval.py
new file mode 100644
index 0000000..6dcb4c9
--- /dev/null
+++ b/src/pan2015_eval.py
@@ -0,0 +1,85 @@
+from joblib import Parallel
+from joblib import delayed
+from sklearn.linear_model import LogisticRegression
+from util import disable_sklearn_warnings
+from sklearn.svm import LinearSVC, SVC
+from data.features import FeatureExtractor
+from data.pan2015 import fetch_PAN2015, TaskGenerator
+from model import AuthorshipVerificator
+import numpy as np
+from sklearn.metrics import f1_score, roc_auc_score
+
+def evaluation(y_pred, y_prob, y_true):
+    y_pred_array = np.array(y_pred)
+    y_prob_array = np.array(y_prob)
+    y_true_array = np.array(y_true)
+
+    acc = (y_pred_array == y_true_array).mean()
+    f1 = f1_score(y_true_array, y_pred_array)
+    auc = roc_auc_score(y_true_array, y_prob_array)
+    pan_eval = acc * auc
+
+    print('Accuracy = {:.3f}'.format(acc))
+    print('F1 = {:.3f}'.format(f1))
+    print('AUC = {:.3f}'.format(auc))
+    print('Acc*AUC = {:.3f}'.format(pan_eval))
+    print('true:', y_true)
+    print('pred:', y_pred)
+
+    return pan_eval
+
+
+def doall(problem, pos, neg, test, truth, lang):
+    # lang is passed explicitly so the task is self-contained when dispatched by joblib
+    print('[Start]{}'.format(problem))
+    feature_extractor = FeatureExtractor(function_words_freq=lang,
+                                         features_Mendenhall=True,
+                                         tfidf=False, tfidf_feat_selection_ratio=0.1,
+                                         ngrams=True, ns=[4, 5],
+                                         split_documents=False,
+                                         normalize_features=True,
+                                         verbose=True)
+
+    method = AuthorshipVerificator(nfolds=3, estimator=LogisticRegression)
+
+    X, y = feature_extractor.fit(pos, neg)
+    test = feature_extractor.transform(test)
+
+    method.fit(X, y)
+    prediction = method.predict(test)
+    if method.probability:
+        probability = method.predict_proba(test)
+    else:
+        probability = prediction
+
+    print('[End]{}'.format(problem))
+    return problem, probability, prediction, truth
+
+
+if __name__ == '__main__':
+    split = 'test'
+    lang = 'spanish'
+    request = fetch_PAN2015(split, lang=lang)
+
+    with open('results_ngrams.csv', 'wt') as fo:
+        outcomes = Parallel(n_jobs=-1)(delayed(doall)(problem, pos, neg, test, truth, lang)
+                                       for problem, pos, neg, test, truth in TaskGenerator(request))
+        y_pred, y_prob, y_true = [], [], []
+        for problem, probability, prediction, truth in outcomes:
+            fo.write('{} {:.3f}\n'.format(problem, probability))
+            y_pred.append(prediction)
+            y_prob.append(probability)
+            y_true.append(truth)
+        acc_auc = evaluation(y_pred, y_prob, y_true)
+        print('ACC * AUC = {:.3f}'.format(acc_auc))
+
+    print('done')
\ No newline at end of file
diff --git a/src/disable_sklearn_warnings.py b/src/util/disable_sklearn_warnings.py
similarity index 100%
rename from src/disable_sklearn_warnings.py
rename to src/util/disable_sklearn_warnings.py
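The final PAN score is the product of accuracy (over the hard decisions) and ROC AUC (over the scores), both computed across all problems of the run. A small worked example of evaluation() on toy labels:

y_true = [1, 0, 1, 0]
y_pred = [1, 0, 0, 0]          # 3 of 4 decisions correct: accuracy = 0.75
y_prob = [0.9, 0.2, 0.4, 0.6]  # 3 of 4 positive/negative pairs ranked correctly: AUC = 0.75
evaluation(y_pred, y_prob, y_true)  # Acc*AUC = 0.75 * 0.75 = 0.5625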