pan 2015

2018-11-29 17:36:34 +01:00 · 2018-11-29 17:36:34 +01:00 · 893cc31225
parent e35f6c2e71
commit 893cc31225
8 changed files with 391 additions and 164 deletions
--- a/src/dante_eval.py
+++ b/src/dante_eval.py
@ -0,0 +1,40 @@
+from sklearn.linear_model import LogisticRegression
+from data.dante_loader import load_texts
+from data.features import *
+from model import AuthorshipVerificator
+from sklearn.svm import LinearSVC, SVC
+
+# DONE: character n-grams keep punctuation marks, following Sapkota et al. [39] as cited in the
+# PAN 2015 overview: "More recently, it was shown that character n-grams corresponding to word
+# affixes and including punctuation marks are the most significant features in cross-topic
+# authorship attribution [57]."
+# TODO: split policies: understand how overlapping fragments interact with cross-validation
+
+
+
+# Directory with the corpus. load_texts expects training files named '<author>_<title>.txt'
+# plus the two test files 'EpistolaXIII_1.txt' and 'EpistolaXIII_2.txt'.
+path = '../testi'
+
+# positive/negative: lists of training texts (positive = texts whose author prefix is 'Dante',
+# the default of load_texts); ep1_text/ep2_text: the two texts to be verified.
+positive, negative, ep1_text, ep2_text = load_texts(path)
+
+# Latin function words, Mendenhall features and character n-grams (n = 3, 4, 5); plain tfidf
+# is disabled. Documents are split by sentences with a window size of 3.
+feature_extractor = FeatureExtractor(function_words_freq='latin', features_Mendenhall=True,
+                                     tfidf=False, tfidf_feat_selection_ratio=0.1,
+                                     ngrams=True, ns=[3,4,5],
+                                     split_documents=True,
+                                     split_policy=split_by_sentences,
+                                     window_size=3,
+                                     normalize_features=True,  verbose=True)
+
+# Fit the extractor on the training texts, then represent each Epistola with the fitted extractor.
+Xtr,ytr = feature_extractor.fit(positive, negative)
+ep1 = feature_extractor.transform(ep1_text)
+ep2 = feature_extractor.transform(ep2_text)
+
+print('Fitting the Verificator')
+av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
+av.fit(Xtr,ytr)
+
+# predict/predict_proba print their outcome themselves; the returned values are not used here.
+print('Predicting the Epistolas')
+av.predict(ep1, 'Epistola 1')
+av.predict_proba(ep1, 'Epistola 1')
+
+av.predict(ep2, 'Epistola 2')
+av.predict_proba(ep2, 'Epistola 2')
--- a/src/data/dante_loader.py
+++ b/src/data/dante_loader.py
@ -0,0 +1,29 @@
+import os
+from os.path import join
+
+# ------------------------------------------------------------------------
+# document loading routine
+# ------------------------------------------------------------------------
+def load_texts(path, positive_author='Dante'):
+    """
+    Load the training texts and the two Epistola XIII test texts stored in a directory.
+
+    Every training file is expected to be named '<author>_<title>.txt'. Files whose name starts
+    with 'EpistolaXIII_' are left out of the training data; 'EpistolaXIII_1.txt' and
+    'EpistolaXIII_2.txt' are read separately and returned as the test texts.
+
+    :param path: directory containing the text files (all of them are read as UTF-8)
+    :param positive_author: author prefix of the files that make up the positive examples
+    :return: a tuple (positive, negative, ep1_text, ep2_text), where positive and negative are
+        lists of strings (one per training file) and ep1_text, ep2_text are strings
+    :raises IndexError: if the name of a training file contains no '_' separator
+    """
+    def _read(filename):
+        # the context manager closes the handle as soon as the text has been read
+        with open(join(path, filename), encoding="utf8") as fin:
+            return fin.read()
+
+    # load the training data (all documents but Epistolas 1 and 2)
+    positive, negative = [], []
+    for filename in os.listdir(path):
+        if filename.startswith('EpistolaXIII_'):
+            continue
+
+        author, separator, _ = filename.replace('.txt', '').partition('_')
+        if not separator:
+            # same exception type the unguarded split('_')[1] used to raise, with a usable message
+            raise IndexError("unexpected file name '{}': expected '<author>_<title>.txt'".format(filename))
+
+        text = _read(filename)
+        if author == positive_author:
+            positive.append(text)
+        else:
+            negative.append(text)
+
+    # load the test data (Epistolas 1 and 2)
+    ep1_text = _read('EpistolaXIII_1.txt')
+    ep2_text = _read('EpistolaXIII_2.txt')
+
+    return positive, negative, ep1_text, ep2_text
--- a/src/data/features.py
+++ b/src/data/features.py
@ -1,56 +1,31 @@
 import nltk
-import re
 import numpy as np
-import os
-from os.path import join
-from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.feature_selection import SelectKBest
 from sklearn.feature_selection import chi2
-from sklearn.metrics import f1_score
-from sklearn.metrics import make_scorer
-from sklearn.model_selection import GridSearchCV
-from sklearn.model_selection import cross_val_score
 from sklearn.preprocessing import normalize
 from scipy.sparse import hstack, csr_matrix, issparse
 from collections import Counter
+from nltk.corpus import stopwords


-function_words = ['et', 'in', 'de', 'ad', 'ut', 'cum', 'non', 'per', 'a', 'que', 'ex','sed',
-                  'quia', 'nam', 'sic', 'si', 'ab', 'etiam', 'idest', 'nec', 'vel', 'atque',
-                  'scilicet', 'sicut', 'hec', 'vero', 'tamen', 'dum', 'propter', 'pro', 'enim',
-                  'ita', 'autem', 'inter', 'unde', 'sub', 'tam', 'ibi', 'ideo', 'ergo', 'post',
-                  'iam', 'seu', 'inde', 'tantum', 'sive', 'quomodo', 'ubi', 'ac', 'ob', 'igitur',
-                  'tunc', 'nisi', 'quasi', 'quantum', 'aut', 'usque', 'bene', 'ne', 'ante', 
-                  'nunc', 'magis', 'sine', 'circa', 'apud', 'contra', 'adhuc', 'satis', 'semper',
-                  'super', 'adeo', 'tandem', 'tanquam', 'quoniam', 'quin', 'quemadmodum', 'supra']

-nfolds = 5
-
-# ------------------------------------------------------------------------
-# document loading routine
-# ------------------------------------------------------------------------
-def _load_texts(path):
-    # load the training data (all documents but Epistolas 1 and 2)
-    documents = []
-    authors   = []
-    ndocs=0
-    for file in os.listdir(path):
-        if file.startswith('EpistolaXIII_'): continue
-        file_clean = file.replace('.txt','')
-        author, textname = file_clean.split('_')[0],file_clean.split('_')[1]
-        text = open(join(path,file), encoding= "utf8").read()
-
-        documents.append(text)
-        authors.append(author)
-        ndocs+=1
-
-    # load the test data (Epistolas 1 and 2)
-    ep1_text = open(join(path, 'EpistolaXIII_1.txt'), encoding="utf8").read()
-    ep2_text = open(join(path, 'EpistolaXIII_2.txt'), encoding="utf8").read()
-
-    return documents, authors, ep1_text, ep2_text
+latin_function_words = ['et', 'in', 'de', 'ad', 'ut', 'cum', 'non', 'per', 'a', 'que', 'ex','sed',
+      'quia', 'nam', 'sic', 'si', 'ab', 'etiam', 'idest', 'nec', 'vel', 'atque',
+      'scilicet', 'sicut', 'hec', 'vero', 'tamen', 'dum', 'propter', 'pro', 'enim',
+      'ita', 'autem', 'inter', 'unde', 'sub', 'tam', 'ibi', 'ideo', 'ergo', 'post',
+      'iam', 'seu', 'inde', 'tantum', 'sive', 'quomodo', 'ubi', 'ac', 'ob', 'igitur',
+      'tunc', 'nisi', 'quasi', 'quantum', 'aut', 'usque', 'bene', 'ne', 'ante',
+      'nunc', 'magis', 'sine', 'circa', 'apud', 'contra', 'adhuc', 'satis', 'semper',
+      'super', 'adeo', 'tandem', 'tanquam', 'quoniam', 'quin', 'quemadmodum', 'supra']

+def get_function_words(lang):
+    """
+    Return the list of function words used for a language.
+    :param lang: 'latin' (module-level list) or 'english'/'spanish' (NLTK stopword lists)
+    :return: a list of words
+    :raises ValueError: if the language is none of the above
+    """
+    if lang == 'latin':
+        return latin_function_words
+    if lang in ('english', 'spanish'):
+        return stopwords.words(lang)
+    raise ValueError('{} not in scope!'.format(lang))

 # ------------------------------------------------------------------------
 # split policies
@ -78,8 +53,13 @@ def split_by_sentences(text):

 def windows(text_fragments, window_size):
    new_fragments = []
-    for i in range(len(text_fragments)-window_size+1):
-        new_fragments.append(' '.join(text_fragments[i:i+window_size]))
+    nbatches = len(text_fragments) // window_size
+    if len(text_fragments) % window_size > 0:
+        nbatches+=1
+    # for i in range(len(text_fragments)-window_size+1):
+    for i in range(nbatches):
+        offset = i*window_size
+        new_fragments.append(' '.join(text_fragments[offset:offset+window_size]))
    return new_fragments

 def splitter(documents, authors=None, split_policy=split_by_sentences, window_size=1):
@ -100,14 +80,14 @@ def splitter(documents, authors=None, split_policy=split_by_sentences, window_si
 # ------------------------------------------------------------------------
 # feature extraction methods
 # ------------------------------------------------------------------------
-# TODO: implement other feature extraction methods
-def _features_function_words_freq(documents):
+def _features_function_words_freq(documents, lang):
    """
    Extract features as the frequency (x1000) of the function words used in the documents
    :param documents: a list where each element is the text (string) of a document
    :return: a np.array of shape (D,F) where D is len(documents) and F is len(function_words)
    """
    features = []
+    function_words = get_function_words(lang)

    for text in documents:
        unmod_tokens = nltk.word_tokenize(text)
@ -160,9 +140,9 @@ def _features_tfidf(documents, tfidf_vectorizer=None, min_df = 1):
    return features, tfidf_vectorizer


-def _features_ngrams(documents, ns=[4, 5], tfidf_vectorizer=None, min_df = 5):
+def _features_ngrams(documents, ns=[4, 5], ngrams_vectorizer=None, min_df = 5):
    doc_ngrams = ngrams_extractor(documents, ns)
-    return _features_tfidf(doc_ngrams, tfidf_vectorizer=tfidf_vectorizer, min_df = min_df)
+    return _features_tfidf(doc_ngrams, tfidf_vectorizer=ngrams_vectorizer, min_df = min_df)


 def ngrams_extractor(documents, ns=[4, 5]):
@ -171,7 +151,7 @@ def ngrams_extractor(documents, ns=[4, 5]):

    list_ngrams = []
    for doc in documents:
-        doc = re.sub(r'[^\w\s]','', doc.strip())
+        # doc = re.sub(r'[^\w\s]','', doc.strip())
        doc_ngrams = []
        for ni in ns:
            doc_ngrams.extend([doc[i:i + ni].replace(' ','_') for i in range(len(doc) - ni + 1)])
@ -181,23 +161,21 @@ def ngrams_extractor(documents, ns=[4, 5]):
    return list_ngrams


-def _feature_selection(X, y, EP1, EP2, tfidf_feat_selection_ratio):
+def _feature_selection(X, y, tfidf_feat_selection_ratio):
    nF = X.shape[1]
    num_feats = int(tfidf_feat_selection_ratio * nF)
    feature_selector = SelectKBest(chi2, k=num_feats)
    X = feature_selector.fit_transform(X, y)
-    EP1 = feature_selector.transform(EP1)
-    EP2 = feature_selector.transform(EP2)
-    return X,EP1,EP2
-
+    return X, feature_selector

 def _tocsr(X):
    return X if issparse(X) else csr_matrix(X)

-class DocumentLoader:
+
+class FeatureExtractor:

    def __init__(self,
-                 function_words_freq=True,
+                 function_words_freq=None,
                 features_Mendenhall=True,
                 tfidf=False,
                 tfidf_feat_selection_ratio=1.,
@ -240,87 +218,123 @@ class DocumentLoader:
        self.verbose = verbose


-    def load_documents(self, path):
-        documents, authors, ep1_text, ep2_text = _load_texts(path)
-        ep1,ep2 = [ep1_text],[ep2_text]
-        n_original_docs=len(documents)
+    def fit(self, positives, negatives):
+        documents = positives + negatives
+        authors = [1]*len(positives) + [0]*len(negatives)
+        n_original_docs = len(documents)

        if self.split_documents:
-            doc_fragments, authors_fragments = splitter(documents, authors, split_policy=self.split_policy, window_size=self.window_size)
+            doc_fragments, authors_fragments = splitter(documents, authors,
+                                                        split_policy=self.split_policy,
+                                                        window_size=self.window_size)
            documents.extend(doc_fragments)
            authors.extend(authors_fragments)
-
-            ep1.extend(splitter(ep1, split_policy=self.split_policy))
-            ep2.extend(splitter(ep2, split_policy=self.split_policy))
            self._print('splitting documents: {} documents'.format(len(doc_fragments)))

        # represent the target vector
-        y = np.array([(1 if author == "Dante" else 0) for author in authors])
+        y = np.array(authors)

        # initialize the document-by-feature vector
        X = np.empty((len(documents), 0))
-        EP1 = np.empty((len(ep1), 0))
-        EP2 = np.empty((len(ep2), 0))

        # dense feature extraction functions
        if self.function_words_freq:
-            X = self._addfeatures(X, _features_function_words_freq(documents))
-            EP1 = self._addfeatures(EP1, _features_function_words_freq(ep1))
-            EP2 = self._addfeatures(EP2, _features_function_words_freq(ep2))
+            X = self._addfeatures(X, _features_function_words_freq(documents, self.function_words_freq))
            self._print('adding function words features: {} features'.format(X.shape[1]))

        if self.features_Mendenhall:
            X = self._addfeatures(X, _features_Mendenhall(documents))
-            EP1 = self._addfeatures(EP1, _features_Mendenhall(ep1))
-            EP2 = self._addfeatures(EP2, _features_Mendenhall(ep2))
            self._print('adding Mendenhall words features: {} features'.format(X.shape[1]))

-
        # sparse feature extraction functions
        if self.tfidf:
            X_features, vectorizer = _features_tfidf(documents)
-            ep1_features, _ = _features_tfidf(ep1, vectorizer)
-            ep2_features, _ = _features_tfidf(ep2, vectorizer)
+            self.tfidf_vectorizer = vectorizer

            if self.tfidf_feat_selection_ratio < 1.:
                if self.verbose: print('feature selection')
-                X_features, ep1_features, ep2_features = \
-                    _feature_selection(X_features, y, ep1_features, ep2_features, self.tfidf_feat_selection_ratio)
+                X_features, feat_sel = _feature_selection(X_features, y, self.tfidf_feat_selection_ratio)
+                self.feat_sel_tfidf = feat_sel

-            X   = self._addfeatures(_tocsr(X), X_features)
-            EP1 = self._addfeatures(_tocsr(EP1), ep1_features)
-            EP2 = self._addfeatures(_tocsr(EP2), ep2_features)
+            X = self._addfeatures(_tocsr(X), X_features)
            self._print('adding tfidf words features: {} features'.format(X.shape[1]))

        if self.ngrams:
-            X_features, vectorizer = _features_ngrams(documents, self.ns, min_df=5*self.window_size)
-            ep1_features, _ = _features_ngrams(ep1, self.ns, tfidf_vectorizer=vectorizer, min_df=5*self.window_size)
-            ep2_features, _ = _features_ngrams(ep2, self.ns, tfidf_vectorizer=vectorizer, min_df=5*self.window_size)
+            X_features, vectorizer = _features_ngrams(documents, self.ns, min_df=5 * self.window_size)
+            self.ngrams_vectorizer = vectorizer

            if self.tfidf_feat_selection_ratio < 1.:
                if self.verbose: print('feature selection')
-                X_features, ep1_features, ep2_features = \
-                    _feature_selection(X_features, y, ep1_features, ep2_features, self.tfidf_feat_selection_ratio)
+                X_features, feat_sel = _feature_selection(X_features, y, self.tfidf_feat_selection_ratio)
+                self.feat_sel_ngrams = feat_sel

-            X   = self._addfeatures(_tocsr(X), X_features)
-            EP1 = self._addfeatures(_tocsr(EP1), ep1_features)
-            EP2 = self._addfeatures(_tocsr(EP2), ep2_features)
+            X = self._addfeatures(_tocsr(X), X_features)
            self._print('adding ngrams words features: {} features'.format(X.shape[1]))

-
        # print summary
        if self.verbose:
-            print('load_documents: function_words_freq={} features_Mendenhall={} tfidf={}, split_documents={}, split_policy={}'
-                  .format(self.function_words_freq, self.features_Mendenhall, self.tfidf, self.split_documents,
-                          self.split_policy.__name__))
+            print(
+                'load_documents: function_words_freq={} features_Mendenhall={} tfidf={}, split_documents={}, split_policy={}'
+                .format(self.function_words_freq, self.features_Mendenhall, self.tfidf, self.split_documents,
+                        self.split_policy.__name__))
            print('number of training (full) documents: {}'.format(n_original_docs))
            print('X shape (#documents,#features): {}'.format(X.shape))
-            print('y prevalence: {:.2f}%'.format(y.mean()*100))
-            print('Epistola 1 shape:', EP1.shape)
-            print('Epistola 2 shape:', EP2.shape)
+            print('y prevalence: {:.2f}%'.format(y.mean() * 100))
            print()

-        return X, y, EP1, EP2
+        return X, y
+
+
+    def transform(self, test):
+        """
+        Represent a single test document with the feature extraction functions enabled in this
+        extractor. Must be called after fit, which stores the vectorizers (and, when feature
+        selection is active, the feature selectors) reused here.
+
+        The first row of the result corresponds to the full document; when split_documents is
+        set, the following rows correspond to the fragments produced by the split policy.
+
+        :param test: the text (string) of the document to represent
+        :return: the document-by-feature matrix of the full document followed by its fragments
+        """
+        test = [test]
+
+        if self.split_documents:
+            # use the same window size as in fit, so that test fragments have the same
+            # granularity as the training fragments
+            test.extend(splitter(test, split_policy=self.split_policy, window_size=self.window_size))
+
+        # initialize the document-by-feature vector
+        TEST = np.empty((len(test), 0))
+
+        # dense feature extraction functions
+        if self.function_words_freq:
+            TEST = self._addfeatures(TEST, _features_function_words_freq(test, self.function_words_freq))
+            self._print('adding function words features: {} features'.format(TEST.shape[1]))
+
+        if self.features_Mendenhall:
+            TEST = self._addfeatures(TEST, _features_Mendenhall(test))
+            self._print('adding Mendenhall words features: {} features'.format(TEST.shape[1]))
+
+        # sparse feature extraction functions (reusing what was fitted on the training data)
+        if self.tfidf:
+            test_features, _ = _features_tfidf(test, self.tfidf_vectorizer)
+
+            if self.tfidf_feat_selection_ratio < 1.:
+                if self.verbose: print('feature selection')
+                test_features = self.feat_sel_tfidf.transform(test_features)
+
+            TEST = self._addfeatures(_tocsr(TEST), test_features)
+            self._print('adding tfidf words features: {} features'.format(TEST.shape[1]))
+
+        if self.ngrams:
+            test_features, _ = _features_ngrams(test, self.ns, ngrams_vectorizer=self.ngrams_vectorizer, min_df=5 * self.window_size)
+
+            if self.tfidf_feat_selection_ratio < 1.:
+                if self.verbose: print('feature selection')
+                test_features = self.feat_sel_ngrams.transform(test_features)
+
+            TEST = self._addfeatures(_tocsr(TEST), test_features)
+            self._print('adding ngrams words features: {} features'.format(TEST.shape[1]))
+
+        # print summary
+        if self.verbose:
+            print(
+                'load_documents: function_words_freq={} features_Mendenhall={} tfidf={}, split_documents={}, split_policy={}'
+                .format(self.function_words_freq, self.features_Mendenhall, self.tfidf, self.split_documents,
+                        self.split_policy.__name__))
+            print('Epistola 1 shape:', TEST.shape)
+            print()
+
+        return TEST
+

    def _addfeatures(self, X, F):
        # plt.matshow(F[:25])
--- a/src/data/pan2015.py
+++ b/src/data/pan2015.py
@ -0,0 +1,51 @@
+import itertools
+import os
+from os.path import join, isdir
+
+PATH_PAN2015 = '../pan2015'
+PAN2015_TRAIN = 'pan15-authorship-verification-training-dataset-2015-04-19'
+PAN2015_TEST  = 'pan15-authorship-verification-test-dataset2-2015-04-19'
+
+class Pan2015:
+    """Plain container pairing the verification problems with their solutions."""
+
+    def __init__(self, problem, solution):
+        # both values are stored as given, without copying or validation
+        self.problem, self.solution = problem, solution
+
+def fetch_PAN2015(corpus, lang, base_path = PATH_PAN2015):
+    """
+    Read the PAN 2015 authorship verification problems of one language from disk.
+
+    Every directory of the requested corpus whose name contains `lang` is visited. Its
+    'truth.txt' lists one '<problem> <decision>' pair per line, and each of its subdirectories
+    is a problem containing a file 'unknown.txt' plus any number of known documents.
+
+    :param corpus: either 'train' or 'test'
+    :param lang: substring that selects the language directories (e.g., 'spanish')
+    :param base_path: directory containing the PAN 2015 training and test datasets
+    :return: a Pan2015 object whose `problem` maps each problem name to a dict with keys 'known'
+        (list of texts) and 'unknown' (text, present only if 'unknown.txt' exists), and whose
+        `solution` maps each problem name to 1 (decision 'Y') or 0 (any other decision)
+    """
+    assert corpus in ['train','test'],'unexpected corpus request'
+
+    corpus_path = join(base_path, PAN2015_TRAIN if corpus=='train' else PAN2015_TEST)
+    print(corpus_path)
+
+    def _read(file_path):
+        # the context manager closes the handle as soon as the text has been read
+        with open(file_path, 'rt') as fin:
+            return fin.read()
+
+    request = {}
+    truth = {}
+    for dir_name in os.listdir(corpus_path):
+        dir_path = join(corpus_path, dir_name)
+        if not (isdir(dir_path) and lang in dir_name):
+            continue
+
+        # merge (rather than overwrite) the solutions, so that they stay aligned with `request`
+        # when more than one directory matches the language
+        with open(join(dir_path, 'truth.txt'), 'rt') as fin:
+            for line in fin:
+                if not line.strip():
+                    continue  # tolerate blank lines (e.g., at the end of the file)
+                problem, decision = line.split()
+                truth[problem] = 1 if decision == 'Y' else 0
+
+        for problem_name in os.listdir(dir_path):
+            problem_dir = join(dir_path, problem_name)
+            if not isdir(problem_dir):
+                continue
+            entry = {'known': []}
+            for doc_name in os.listdir(problem_dir):
+                text = _read(join(problem_dir, doc_name))
+                if doc_name == 'unknown.txt':
+                    entry['unknown'] = text
+                else:
+                    entry['known'].append(text)
+            request[problem_name] = entry
+
+    return Pan2015(request, truth)
+
+def TaskGenerator(request_dict):
+    """
+    Generate one verification task per problem, visiting the problems in sorted name order.
+
+    :param request_dict: an object with attributes `problem` (dict: problem name -> dict with
+        keys 'known' and 'unknown') and `solution` (dict: problem name -> label)
+    :return: yields tuples (problem name, positives, negatives, test, solution), where positives
+        are the known documents of the problem, negatives are the known documents of all the
+        other problems, and test is the unknown document of the problem
+    """
+    problems = request_dict.problem
+    names = sorted(problems.keys())
+    for current in names:
+        positives = problems[current]['known']
+        negatives = []
+        for other in names:
+            # names are dict keys, hence unique: comparing names is comparing positions
+            if other != current:
+                negatives.extend(problems[other]['known'])
+        test = problems[current]['unknown']
+        yield current, positives, negatives, test, request_dict.solution[current]
+
+
+
--- a/src/main.py
+++ b/src/main.py
@ -1,69 +0,0 @@
-import disable_sklearn_warnings
-from sklearn.svm import *
-from sklearn.model_selection import cross_val_score, GridSearchCV
-from sklearn.metrics import f1_score, make_scorer
-from verification import *
-
-# TODO: other split policies
-# TODO: understand normalization
-# TODO: wrap into an Estimator
-# TODO: check versions (numpy, scipy, sklearn)
-
-
-SVM = SVC
-# SVM = LinearSVC
-
-nfolds = 10
-params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced',None]}
-if SVM is SVC:
-     params['kernel']=['linear','rbf']
-     probability = True
-else:
-    probability = False
-
-path = '../testi'
-
-reader = DocumentLoader(function_words_freq=True, features_Mendenhall=True,
-                       tfidf=True, tfidf_feat_selection_ratio=0.1,
-                       ngrams=True, ns=[3,4,5],
-                       split_documents=True, split_policy=split_by_sentences, normalize_features=True, window_size=1, verbose=True)
-
-Xtr,ytr,ep1,ep2 = reader.load_documents(path)
-
-# learn a SVM
-#svm = SVM(probability=probability)
-svm = SVM()
-
-positive_examples = ytr.sum()
-if positive_examples>nfolds:
-    print('optimizing {}'.format(svm.__class__.__name__))
-    svm = GridSearchCV(svm, param_grid=params, cv=nfolds, scoring=make_scorer(f1_score), n_jobs=-1)
-
-svm.fit(Xtr, ytr)
-
-if isinstance(svm, GridSearchCV):
-    print('Best params: {}'.format(svm.best_params_))
-
-# evaluation of results
-print('computing the cross-val score')
-# f1scores = cross_val_score(svm, Xtr, ytr, cv=nfolds, n_jobs=-1, scoring=make_scorer(f1_score))
-f1scores = svm.best_score_
-f1_mean, f1_std = f1scores.mean(), f1scores.std()
-print('F1-measure={:.3f} (+-{:.3f})\n'.format(f1_mean, f1_std))
-
-# final test
-def predictEpistola(ep, epistola_name):
-    pred = svm.predict(ep)
-    full_doc_prediction = pred[0]
-    print('{} is from Dante: {}'.format(epistola_name, 'Yes' if full_doc_prediction == 1 else 'No'))
-    if len(pred>0):
-        fragment_predictions= pred[1:]
-        print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
-        if SVM is SVC and probability:
-            prob = svm.predict_proba(ep)[:,1]
-            np.set_printoptions(precision=2, linewidth=200)
-            print('probabilistic view: full={:.3f}, fragments average {:.3f}, fragments={}'.format(prob[0], prob[1:].mean(), prob[1:]))
-
-print('Predicting the Epistolas')
-predictEpistola(ep1, 'Epistola 1')
-predictEpistola(ep2, 'Epistola 2')
--- a/src/model.py
+++ b/src/model.py
@ -0,0 +1,77 @@
+from sklearn.metrics import f1_score
+from sklearn.metrics import make_scorer
+from sklearn.model_selection import GridSearchCV
+
+from util import disable_sklearn_warnings
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import *
+from data.features import *
+
+class RandomVerificator:
+    """Baseline that learns nothing and answers every request with a random score."""
+
+    def __init__(self):
+        pass
+
+    def fit(self, positives, negatives):
+        # nothing to learn: both arguments are ignored
+        return None
+
+    def predict(self, test):
+        # the test document is ignored; the score is a single draw from np.random.rand()
+        return np.random.rand()
+
+class AuthorshipVerificator:
+    """
+    Binary classifier wrapper for authorship verification. The hyperparameters of the underlying
+    estimator are optimized via grid search (F1 scoring) whenever there are at least `nfolds`
+    positive examples; otherwise the estimator is trained with its default hyperparameters.
+    """
+
+    def __init__(self, nfolds=10,
+                 params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced',None]},
+                 estimator=SVC):
+        """
+        :param nfolds: number of cross-validation folds used by the grid search
+        :param params: the grid of hyperparameters to explore (never modified by this class)
+        :param estimator: one of the classes SVC, LinearSVC, or LogisticRegression
+        :raises ValueError: if the estimator is not one of the supported classes
+        """
+        self.nfolds = nfolds
+        # work on a copy: the default dict is shared by all calls, so adding 'kernel' to it in
+        # place would leak that entry into the grid of every instance created afterwards
+        self.params = dict(params)
+        if estimator is SVC:
+            self.params['kernel'] = ['linear', 'rbf']
+            self.probability = True
+            self.svm = estimator(probability=self.probability)
+        elif estimator is LinearSVC:
+            self.probability = False
+            self.svm = estimator()
+        elif estimator is LogisticRegression:
+            self.probability = True
+            self.svm = LogisticRegression()
+        else:
+            # fail here rather than with an AttributeError on self.svm at fit time
+            raise ValueError('unsupported estimator: {}'.format(estimator))
+
+    def fit(self,X,y):
+        """
+        Train the estimator, with grid search if there are enough positive examples.
+        :param X: the document-by-feature matrix of the training documents
+        :param y: the binary labels (1 for the positive class); lists are converted to np.ndarray
+        :return: self
+        """
+        if not isinstance(y,np.ndarray): y=np.array(y)
+        positive_examples = y.sum()
+        if positive_examples >= self.nfolds:
+            print('optimizing {}'.format(self.svm.__class__.__name__))
+            self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=self.nfolds, scoring=make_scorer(f1_score), n_jobs=-1)
+        else:
+            self.estimator = self.svm
+
+        self.estimator.fit(X, y)
+
+        if isinstance(self.estimator, GridSearchCV):
+            print('Best params: {}'.format(self.estimator.best_params_))
+            print('computing the cross-val score')
+            # best_score_ is a single number (the mean across folds), so its std is always 0;
+            # the std across folds of the best configuration is found in cv_results_
+            best_index = self.estimator.best_index_
+            f1_mean = self.estimator.cv_results_['mean_test_score'][best_index]
+            f1_std = self.estimator.cv_results_['std_test_score'][best_index]
+            print('F1-measure={:.3f} (+-{:.3f})\n'.format(f1_mean, f1_std))
+
+        return self
+
+    def predict(self, test, epistola_name=''):
+        """
+        Predict the label of a document, and of its fragments if any.
+        :param test: matrix whose first row is the full document and whose remaining rows
+            (if any) are its fragments
+        :param epistola_name: name of the document, used only in the printed report
+        :return: the prediction for the full document if `test` has a single row; otherwise a
+            tuple (prediction for the full document, array of predictions for the fragments)
+        """
+        pred = self.estimator.predict(test)
+        full_doc_prediction = pred[0]
+        print('{} is from the same author: {}'.format(epistola_name, 'Yes' if full_doc_prediction == 1 else 'No'))
+        if len(pred) > 1:
+            fragment_predictions = pred[1:]
+            print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
+            return full_doc_prediction, fragment_predictions
+        return full_doc_prediction
+
+    def predict_proba(self, test, epistola_name=''):
+        """
+        Same as predict, but returning the probability of the positive class (column 1 of the
+        estimator's predict_proba) instead of the label.
+        :raises AssertionError: if the estimator was configured without probability estimates
+        """
+        assert self.probability, 'svm is not calibrated'
+        pred = self.estimator.predict_proba(test)
+        full_doc_prediction = pred[0,1]
+        print('{} is from the same author: {}'.format(epistola_name, full_doc_prediction))
+        if len(pred) > 1:
+            fragment_predictions = pred[1:,1]
+            print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
+            return full_doc_prediction, fragment_predictions
+        return full_doc_prediction
+
+
+
--- a/src/pan2015_eval.py
+++ b/src/pan2015_eval.py
@ -0,0 +1,85 @@
+from joblib import Parallel
+from joblib import delayed
+from sklearn.linear_model import LogisticRegression
+from util import disable_sklearn_warnings
+from sklearn.svm import LinearSVC, SVC
+from data.features import FeatureExtractor
+from data.pan2015 import fetch_PAN2015, TaskGenerator
+from model import AuthorshipVerificator
+import numpy as np
+from sklearn.metrics import f1_score, roc_auc_score
+
+def evaluation(y_pred, y_prob, y_true):
+    """
+    Print accuracy, F1, AUC and their Acc*AUC product, followed by the true and predicted labels.
+    :param y_pred: the predicted labels, one per problem
+    :param y_prob: the scores used to compute the AUC, one per problem
+    :param y_true: the true labels, one per problem
+    :return: the product accuracy * AUC
+    """
+    predicted = np.array(y_pred)
+    scores = np.array(y_prob)
+    expected = np.array(y_true)
+
+    accuracy = (predicted == expected).mean()
+    f1_value = f1_score(expected, predicted)
+    auc_value = roc_auc_score(expected, scores)
+    acc_times_auc = accuracy * auc_value
+
+    print('Accuracy = {:.3f}'.format(accuracy))
+    print('F1 = {:.3f}'.format(f1_value))
+    print('AUC = {:.3f}'.format(auc_value))
+    print('Acc*AUC = {:.3f}'.format(acc_times_auc))
+    # the labels are printed as received, not as arrays
+    print('true:', y_true)
+    print('pred:', y_pred)
+
+    return acc_times_auc
+
+
+def doall(problem,pos,neg,test,truth):
+    """
+    Solve one verification problem: extract the features, train the verifier, and predict the
+    test document. The function words language is read from the module-level variable `lang`.
+
+    :param problem: name of the problem (used in the log messages and returned as is)
+    :param pos: list of texts of the positive class
+    :param neg: list of texts of the negative class
+    :param test: text of the document to verify
+    :param truth: the true label of the problem (returned as is)
+    :return: the tuple (problem, probability, prediction, truth); probability falls back to the
+        prediction when the verifier does not produce probabilities
+    """
+    print('[Start]{}'.format(problem))
+    extractor = FeatureExtractor(function_words_freq=lang,
+                                 features_Mendenhall=True,
+                                 tfidf=False, tfidf_feat_selection_ratio=0.1,
+                                 ngrams=True, ns=[4, 5],
+                                 split_documents=False,
+                                 normalize_features=True,
+                                 verbose=True)
+    verifier = AuthorshipVerificator(nfolds=3, estimator=LogisticRegression)
+
+    X, y = extractor.fit(pos, neg)
+    test_features = extractor.transform(test)
+
+    verifier.fit(X, y)
+    prediction = verifier.predict(test_features)
+    probability = verifier.predict_proba(test_features) if verifier.probability else prediction
+
+    print('[End]{}'.format(problem))
+    return problem, probability, prediction, truth
+
+    # print('{}-->{:.3f} decision={}'.format(problem, probability, prediction))
+    # print('pred={} truth={}'.format(prediction, truth))
+    #
+    # y_prob.append(probability)
+    # y_pred.append(prediction)
+    # y_true.append(truth)
+    #
+    # acc_auc = evaluation(y_pred, y_prob, y_true)
+
+
+
+# Script entry point: solves every PAN 2015 problem of the chosen split and language, writes
+# one '<problem> <probability>' line per problem to 'results_ngrams.csv', and prints the scores.
+if __name__ == '__main__':
+    split = 'test'
+    # NOTE: doall reads `lang` as a module-level variable, so this name must not change
+    lang = 'spanish'
+    request = fetch_PAN2015(split, lang=lang)
+
+    # the results file is opened before the problems are solved
+    with open('results_ngrams.csv', 'wt') as fo:
+        # one doall call per task yielded by TaskGenerator, dispatched through joblib
+        outcomes = Parallel(n_jobs=-1)(delayed(doall)(problem,pos,neg,test,truth) for problem,pos,neg,test,truth in TaskGenerator(request))
+        y_pred, y_prob, y_true = [], [], []
+        for problem, probability, prediction, truth in outcomes:
+            fo.write('{} {:.3f}\n'.format(problem, probability))
+            y_pred.append(prediction)
+            y_prob.append(probability)
+            y_true.append(truth)
+        acc_auc = evaluation(y_pred, y_prob, y_true)
+        print('ACC * AUC = {:.3f}'.format(acc_auc))
+
+
+    print('done')
--- a/src/util/disable_sklearn_warnings.py
+++ b/src/util/disable_sklearn_warnings.py