pan 2015
This commit is contained in:
parent
e35f6c2e71
commit
893cc31225
|
|
@ -0,0 +1,40 @@
|
|||
from sklearn.linear_model import LogisticRegression
|
||||
from data.dante_loader import load_texts
|
||||
from data.features import *
|
||||
from model import AuthorshipVerificator
|
||||
from sklearn.svm import LinearSVC, SVC
|
||||
|
||||
# DONE: ngrams should contain punctuation marks according to Sapkota et al. [39] in the PAN 2015 overview
|
||||
# (More recently, it was shown that character
|
||||
# n-grams corresponding to word affixes and including punctuation marks are the most
|
||||
# significant features in cross-topic authorship attribution [57].)
|
||||
# TODO: split policies: understand overlapping in cross-validation
|
||||
|
||||
|
||||
|
||||
# Directory holding the training documents and the two Epistola XIII parts.
path = '../testi'

positive, negative, ep1_text, ep2_text = load_texts(path)

# Feature configuration used for the Dante experiment.
_extractor_options = dict(
    function_words_freq='latin',
    features_Mendenhall=True,
    tfidf=False,
    tfidf_feat_selection_ratio=0.1,
    ngrams=True,
    ns=[3,4,5],
    split_documents=True,
    split_policy=split_by_sentences,
    window_size=3,
    normalize_features=True,
    verbose=True,
)
feature_extractor = FeatureExtractor(**_extractor_options)

# Fit the extractor on the training documents, then project both epistles.
Xtr, ytr = feature_extractor.fit(positive, negative)
ep1 = feature_extractor.transform(ep1_text)
ep2 = feature_extractor.transform(ep2_text)

print('Fitting the Verificator')
av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
av.fit(Xtr, ytr)

print('Predicting the Epistolas')
# For each epistle: hard decision first, then the probabilistic view.
for _ep_matrix, _ep_name in ((ep1, 'Epistola 1'), (ep2, 'Epistola 2')):
    av.predict(_ep_matrix, _ep_name)
    av.predict_proba(_ep_matrix, _ep_name)
|
||||
|
|
@ -0,0 +1,29 @@
|
|||
import os
|
||||
from os.path import join
|
||||
|
||||
# ------------------------------------------------------------------------
|
||||
# document loading routine
|
||||
# ------------------------------------------------------------------------
|
||||
def load_texts(path, positive_author='Dante'):
    """
    Load the training documents and the two test epistles stored in a directory.

    Every file whose name does not start with 'EpistolaXIII_' is a training document; its
    author is the part of the file name preceding the first underscore
    (e.g. 'Dante_Monarchia.txt').
    :param path: directory containing the corpus files
    :param positive_author: author whose documents are returned as positive examples
    :return: a tuple (positive, negative, ep1_text, ep2_text): two lists of document texts
        and the texts of 'EpistolaXIII_1.txt' and 'EpistolaXIII_2.txt'
    :raises ValueError: if a training file name does not follow the '<author>_<title>' pattern
    """
    def _read(filename):
        # context manager so the handle is closed as soon as the text is read
        with open(join(path, filename), encoding="utf8") as fin:
            return fin.read()

    # load the training data (all documents but Epistolas 1 and 2)
    positive, negative = [], []
    for file in os.listdir(path):
        if file.startswith('EpistolaXIII_'):
            continue

        name_parts = file.replace('.txt', '').split('_')
        if len(name_parts) < 2:
            # fail loudly instead of silently ingesting a stray file as a negative example
            raise ValueError('unexpected file name {!r}: expected <author>_<title>.txt'.format(file))
        author = name_parts[0]

        text = _read(file)
        if author == positive_author:
            positive.append(text)
        else:
            negative.append(text)

    # load the test data (Epistolas 1 and 2)
    ep1_text = _read('EpistolaXIII_1.txt')
    ep2_text = _read('EpistolaXIII_2.txt')

    return positive, negative, ep1_text, ep2_text
|
||||
|
|
@ -1,56 +1,31 @@
|
|||
import nltk
|
||||
import re
|
||||
import numpy as np
|
||||
import os
|
||||
from os.path import join
|
||||
from sklearn.base import BaseEstimator, ClassifierMixin
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.feature_selection import SelectKBest
|
||||
from sklearn.feature_selection import chi2
|
||||
from sklearn.metrics import f1_score
|
||||
from sklearn.metrics import make_scorer
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.model_selection import cross_val_score
|
||||
from sklearn.preprocessing import normalize
|
||||
from scipy.sparse import hstack, csr_matrix, issparse
|
||||
from collections import Counter
|
||||
from nltk.corpus import stopwords
|
||||
|
||||
|
||||
function_words = ['et', 'in', 'de', 'ad', 'ut', 'cum', 'non', 'per', 'a', 'que', 'ex','sed',
|
||||
'quia', 'nam', 'sic', 'si', 'ab', 'etiam', 'idest', 'nec', 'vel', 'atque',
|
||||
'scilicet', 'sicut', 'hec', 'vero', 'tamen', 'dum', 'propter', 'pro', 'enim',
|
||||
'ita', 'autem', 'inter', 'unde', 'sub', 'tam', 'ibi', 'ideo', 'ergo', 'post',
|
||||
'iam', 'seu', 'inde', 'tantum', 'sive', 'quomodo', 'ubi', 'ac', 'ob', 'igitur',
|
||||
'tunc', 'nisi', 'quasi', 'quantum', 'aut', 'usque', 'bene', 'ne', 'ante',
|
||||
'nunc', 'magis', 'sine', 'circa', 'apud', 'contra', 'adhuc', 'satis', 'semper',
|
||||
'super', 'adeo', 'tandem', 'tanquam', 'quoniam', 'quin', 'quemadmodum', 'supra']
|
||||
|
||||
nfolds = 5
|
||||
|
||||
# ------------------------------------------------------------------------
|
||||
# document loading routine
|
||||
# ------------------------------------------------------------------------
|
||||
def _load_texts(path):
|
||||
# load the training data (all documents but Epistolas 1 and 2)
|
||||
documents = []
|
||||
authors = []
|
||||
ndocs=0
|
||||
for file in os.listdir(path):
|
||||
if file.startswith('EpistolaXIII_'): continue
|
||||
file_clean = file.replace('.txt','')
|
||||
author, textname = file_clean.split('_')[0],file_clean.split('_')[1]
|
||||
text = open(join(path,file), encoding= "utf8").read()
|
||||
|
||||
documents.append(text)
|
||||
authors.append(author)
|
||||
ndocs+=1
|
||||
|
||||
# load the test data (Epistolas 1 and 2)
|
||||
ep1_text = open(join(path, 'EpistolaXIII_1.txt'), encoding="utf8").read()
|
||||
ep2_text = open(join(path, 'EpistolaXIII_2.txt'), encoding="utf8").read()
|
||||
|
||||
return documents, authors, ep1_text, ep2_text
|
||||
latin_function_words = ['et', 'in', 'de', 'ad', 'ut', 'cum', 'non', 'per', 'a', 'que', 'ex','sed',
|
||||
'quia', 'nam', 'sic', 'si', 'ab', 'etiam', 'idest', 'nec', 'vel', 'atque',
|
||||
'scilicet', 'sicut', 'hec', 'vero', 'tamen', 'dum', 'propter', 'pro', 'enim',
|
||||
'ita', 'autem', 'inter', 'unde', 'sub', 'tam', 'ibi', 'ideo', 'ergo', 'post',
|
||||
'iam', 'seu', 'inde', 'tantum', 'sive', 'quomodo', 'ubi', 'ac', 'ob', 'igitur',
|
||||
'tunc', 'nisi', 'quasi', 'quantum', 'aut', 'usque', 'bene', 'ne', 'ante',
|
||||
'nunc', 'magis', 'sine', 'circa', 'apud', 'contra', 'adhuc', 'satis', 'semper',
|
||||
'super', 'adeo', 'tandem', 'tanquam', 'quoniam', 'quin', 'quemadmodum', 'supra']
|
||||
|
||||
def get_function_words(lang):
    """
    Return the list of function words to be used for a language.
    :param lang: 'latin' (module-level list), or 'english' / 'spanish' (NLTK stopword lists)
    :return: a list of function words
    :raises ValueError: if the language is none of the above
    """
    if lang == 'latin':
        return latin_function_words
    if lang not in ['english', 'spanish']:
        raise ValueError('{} not in scope!'.format(lang))
    return stopwords.words(lang)
|
||||
|
||||
# ------------------------------------------------------------------------
|
||||
# split policies
|
||||
|
|
@ -78,8 +53,13 @@ def split_by_sentences(text):
|
|||
|
||||
def windows(text_fragments, window_size):
    """
    Group consecutive text fragments into non-overlapping windows.
    :param text_fragments: a list of strings (e.g., the sentences of a document)
    :param window_size: number of consecutive fragments joined (by a blank) into each window;
        the last window is shorter when len(text_fragments) is not a multiple of window_size
    :return: a list with the joined fragments, in the original order
    :raises ValueError: if window_size is smaller than 1
    """
    if window_size < 1:
        raise ValueError('window_size must be a positive integer, got {}'.format(window_size))

    # step by window_size so that every fragment lands in exactly one window
    # (the former sliding-window policy emitted overlapping fragments)
    return [' '.join(text_fragments[offset:offset + window_size])
            for offset in range(0, len(text_fragments), window_size)]
|
||||
|
||||
def splitter(documents, authors=None, split_policy=split_by_sentences, window_size=1):
|
||||
|
|
@ -100,14 +80,14 @@ def splitter(documents, authors=None, split_policy=split_by_sentences, window_si
|
|||
# ------------------------------------------------------------------------
|
||||
# feature extraction methods
|
||||
# ------------------------------------------------------------------------
|
||||
# TODO: implement other feature extraction methods
|
||||
def _features_function_words_freq(documents):
|
||||
def _features_function_words_freq(documents, lang):
|
||||
"""
|
||||
Extract features as the frequency (x1000) of the function words used in the documents
|
||||
:param documents: a list where each element is the text (string) of a document
|
||||
:return: a np.array of shape (D,F) where D is len(documents) and F is len(function_words)
|
||||
"""
|
||||
features = []
|
||||
function_words = get_function_words(lang)
|
||||
|
||||
for text in documents:
|
||||
unmod_tokens = nltk.word_tokenize(text)
|
||||
|
|
@ -160,9 +140,9 @@ def _features_tfidf(documents, tfidf_vectorizer=None, min_df = 1):
|
|||
return features, tfidf_vectorizer
|
||||
|
||||
|
||||
def _features_ngrams(documents, ns=[4, 5], ngrams_vectorizer=None, min_df = 5):
    """
    Extract tfidf-weighted features from the character n-grams of the documents.
    :param documents: a list where each element is the text (string) of a document
    :param ns: the n-gram lengths to extract (only read here, never modified)
    :param ngrams_vectorizer: a previously fitted vectorizer to reuse, or None; it is handed to
        _features_tfidf as its tfidf_vectorizer argument
    :param min_df: forwarded to _features_tfidf
    :return: whatever _features_tfidf returns, i.e. the pair (features, vectorizer)
    """
    doc_ngrams = ngrams_extractor(documents, ns)
    return _features_tfidf(doc_ngrams, tfidf_vectorizer=ngrams_vectorizer, min_df = min_df)
|
||||
|
||||
|
||||
def ngrams_extractor(documents, ns=[4, 5]):
|
||||
|
|
@ -171,7 +151,7 @@ def ngrams_extractor(documents, ns=[4, 5]):
|
|||
|
||||
list_ngrams = []
|
||||
for doc in documents:
|
||||
doc = re.sub(r'[^\w\s]','', doc.strip())
|
||||
# doc = re.sub(r'[^\w\s]','', doc.strip())
|
||||
doc_ngrams = []
|
||||
for ni in ns:
|
||||
doc_ngrams.extend([doc[i:i + ni].replace(' ','_') for i in range(len(doc) - ni + 1)])
|
||||
|
|
@ -181,23 +161,21 @@ def ngrams_extractor(documents, ns=[4, 5]):
|
|||
return list_ngrams
|
||||
|
||||
|
||||
def _feature_selection(X, y, tfidf_feat_selection_ratio):
    """
    Keep the top-scoring fraction of the features according to the chi-square statistic.
    :param X: document-by-feature matrix of the training documents
    :param y: target labels of the training documents
    :param tfidf_feat_selection_ratio: fraction of the columns of X to keep
    :return: a tuple (X, feature_selector) with the reduced matrix and the fitted SelectKBest,
        so that the very same selection can later be applied to unseen documents
    """
    nF = X.shape[1]
    num_feats = int(tfidf_feat_selection_ratio * nF)
    feature_selector = SelectKBest(chi2, k=num_feats)
    X = feature_selector.fit_transform(X, y)
    return X, feature_selector
|
||||
|
||||
def _tocsr(X):
|
||||
return X if issparse(X) else csr_matrix(X)
|
||||
|
||||
class DocumentLoader:
|
||||
|
||||
class FeatureExtractor:
|
||||
|
||||
def __init__(self,
|
||||
function_words_freq=True,
|
||||
function_words_freq=None,
|
||||
features_Mendenhall=True,
|
||||
tfidf=False,
|
||||
tfidf_feat_selection_ratio=1.,
|
||||
|
|
@ -240,87 +218,123 @@ class DocumentLoader:
|
|||
self.verbose = verbose
|
||||
|
||||
|
||||
def fit(self, positives, negatives):
    """
    Fit the feature extractors on the training documents and return their representation.

    The fitted vectorizers and feature selectors are stored in the instance
    (tfidf_vectorizer, feat_sel_tfidf, ngrams_vectorizer, feat_sel_ngrams) so that
    transform can apply the same mapping to unseen documents.
    :param positives: list of texts (strings) labelled 1
    :param negatives: list of texts (strings) labelled 0
    :return: a tuple (X, y) with the document-by-feature matrix and the label vector; when
        split_documents is set, the rows of the full documents are followed by those of
        their fragments
    """
    # a new list is built, so the lists received from the caller are never modified
    documents = positives + negatives
    authors = [1] * len(positives) + [0] * len(negatives)
    n_original_docs = len(documents)

    if self.split_documents:
        doc_fragments, authors_fragments = splitter(documents, authors,
                                                    split_policy=self.split_policy,
                                                    window_size=self.window_size)
        documents.extend(doc_fragments)
        authors.extend(authors_fragments)
        self._print('splitting documents: {} documents'.format(len(doc_fragments)))

    # represent the target vector
    y = np.array(authors)

    # initialize the document-by-feature vector
    X = np.empty((len(documents), 0))

    # dense feature extraction functions
    if self.function_words_freq:
        # function_words_freq holds the language whose function words are counted
        X = self._addfeatures(X, _features_function_words_freq(documents, self.function_words_freq))
        self._print('adding function words features: {} features'.format(X.shape[1]))

    if self.features_Mendenhall:
        X = self._addfeatures(X, _features_Mendenhall(documents))
        self._print('adding Mendenhall words features: {} features'.format(X.shape[1]))

    # sparse feature extraction functions
    if self.tfidf:
        X_features, vectorizer = _features_tfidf(documents)
        self.tfidf_vectorizer = vectorizer

        if self.tfidf_feat_selection_ratio < 1.:
            if self.verbose: print('feature selection')
            X_features, feat_sel = _feature_selection(X_features, y, self.tfidf_feat_selection_ratio)
            self.feat_sel_tfidf = feat_sel

        X = self._addfeatures(_tocsr(X), X_features)
        self._print('adding tfidf words features: {} features'.format(X.shape[1]))

    if self.ngrams:
        X_features, vectorizer = _features_ngrams(documents, self.ns, min_df=5 * self.window_size)
        self.ngrams_vectorizer = vectorizer

        if self.tfidf_feat_selection_ratio < 1.:
            if self.verbose: print('feature selection')
            X_features, feat_sel = _feature_selection(X_features, y, self.tfidf_feat_selection_ratio)
            self.feat_sel_ngrams = feat_sel

        X = self._addfeatures(_tocsr(X), X_features)
        self._print('adding ngrams words features: {} features'.format(X.shape[1]))

    # print summary
    if self.verbose:
        print(
            'load_documents: function_words_freq={} features_Mendenhall={} tfidf={}, split_documents={}, split_policy={}'
            .format(self.function_words_freq, self.features_Mendenhall, self.tfidf, self.split_documents,
                    self.split_policy.__name__))
        print('number of training (full) documents: {}'.format(n_original_docs))
        print('X shape (#documents,#features): {}'.format(X.shape))
        print('y prevalence: {:.2f}%'.format(y.mean() * 100))
        print()

    return X, y
|
||||
|
||||
|
||||
def transform(self, test):
    """
    Represent an unseen document with the feature extractors fitted by fit.
    :param test: the text (string) of the document
    :return: a document-by-feature matrix whose first row is the whole document; when
        split_documents is set, the remaining rows are the fragments of the document
    """
    test = [test]

    if self.split_documents:
        # split with the same window size used in fit, so that the test fragments have
        # the same granularity as the fragments the model was trained on
        test.extend(splitter(test, split_policy=self.split_policy, window_size=self.window_size))

    # initialize the document-by-feature vector
    TEST = np.empty((len(test), 0))

    # dense feature extraction functions
    if self.function_words_freq:
        TEST = self._addfeatures(TEST, _features_function_words_freq(test, self.function_words_freq))
        self._print('adding function words features: {} features'.format(TEST.shape[1]))

    if self.features_Mendenhall:
        TEST = self._addfeatures(TEST, _features_Mendenhall(test))
        self._print('adding Mendenhall words features: {} features'.format(TEST.shape[1]))

    # sparse feature extraction functions (reuse what was learned in fit)
    if self.tfidf:
        test_features, _ = _features_tfidf(test, self.tfidf_vectorizer)

        if self.tfidf_feat_selection_ratio < 1.:
            if self.verbose: print('feature selection')
            test_features = self.feat_sel_tfidf.transform(test_features)

        TEST = self._addfeatures(_tocsr(TEST), test_features)
        self._print('adding tfidf words features: {} features'.format(TEST.shape[1]))

    if self.ngrams:
        test_features, _ = _features_ngrams(test, self.ns, ngrams_vectorizer=self.ngrams_vectorizer, min_df=5 * self.window_size)

        if self.tfidf_feat_selection_ratio < 1.:
            if self.verbose: print('feature selection')
            test_features = self.feat_sel_ngrams.transform(test_features)

        TEST = self._addfeatures(_tocsr(TEST), test_features)
        self._print('adding ngrams words features: {} features'.format(TEST.shape[1]))

    # print summary
    if self.verbose:
        print(
            'load_documents: function_words_freq={} features_Mendenhall={} tfidf={}, split_documents={}, split_policy={}'
            .format(self.function_words_freq, self.features_Mendenhall, self.tfidf, self.split_documents,
                    self.split_policy.__name__))
        print('Epistola 1 shape:', TEST.shape)
        print()

    return TEST
|
||||
|
||||
|
||||
def _addfeatures(self, X, F):
|
||||
# plt.matshow(F[:25])
|
||||
|
|
@ -0,0 +1,51 @@
|
|||
import itertools
|
||||
import os
|
||||
from os.path import join, isdir
|
||||
|
||||
PATH_PAN2015 = '../pan2015'
|
||||
PAN2015_TRAIN = 'pan15-authorship-verification-training-dataset-2015-04-19'
|
||||
PAN2015_TEST = 'pan15-authorship-verification-test-dataset2-2015-04-19'
|
||||
|
||||
class Pan2015:
    """
    Plain container pairing the verification problems with their solutions.
    """

    def __init__(self, problem, solution):
        # both arguments are stored as received, without copying
        self.problem, self.solution = problem, solution
|
||||
|
||||
def fetch_PAN2015(corpus, lang, base_path = PATH_PAN2015):
    """
    Load the verification problems of the PAN 2015 dataset for one language.

    A sub-directory of the corpus is read if its name contains `lang`. Its 'truth.txt' holds
    one '<problem> <decision>' pair per line, and each of its sub-directories is a problem
    whose 'unknown.txt' is the disputed document and whose other files are the known documents.
    :param corpus: either 'train' or 'test'
    :param lang: string that must occur in the name of the language directory
    :param base_path: directory containing the unpacked PAN 2015 datasets
    :return: a Pan2015 object; `problem` maps each problem name to a dict with the keys
        'known' (list of texts) and 'unknown' (text, present only if 'unknown.txt' exists);
        `solution` maps each problem name to 1 (decision 'Y') or 0
    """
    assert corpus in ['train','test'],'unexpected corpus request'

    def _read_text(file_path):
        # context manager so the handle is closed as soon as the text is read
        # NOTE(review): the platform default encoding is used, as before — confirm
        with open(file_path, 'rt') as fin:
            return fin.read()

    corpus_path = join(base_path, PAN2015_TRAIN if corpus=='train' else PAN2015_TEST)

    print(corpus_path)
    request = {}
    truth = {}
    for dir_name in os.listdir(corpus_path):
        dir_path = join(corpus_path, dir_name)
        if not (isdir(dir_path) and lang in dir_name):
            continue

        # blank lines are skipped so that a trailing newline cannot break the unpacking
        with open(join(dir_path, 'truth.txt'), 'rt') as fin:
            answers = [line.split() for line in fin if line.strip()]
        # merged (not overwritten) so that truth covers every problem added to request
        truth.update({problem: 1 if decision == 'Y' else 0 for problem, decision in answers})

        for problem_name in os.listdir(dir_path):
            problem_dir = join(dir_path, problem_name)
            if not isdir(problem_dir):
                continue
            request[problem_name] = {}
            request[problem_name]['known'] = []
            for doc_name in os.listdir(problem_dir):
                doc_path = join(problem_dir, doc_name)
                if 'unknown.txt' == doc_name:
                    request[problem_name]['unknown'] = _read_text(doc_path)
                else:
                    request[problem_name]['known'].append(_read_text(doc_path))

    return Pan2015(request, truth)
|
||||
|
||||
def TaskGenerator(request_dict):
    """
    Generate one verification task per problem, in alphabetical order of the problem names.
    :param request_dict: object whose `problem` attribute maps each problem name to a dict with
        the keys 'known' and 'unknown', and whose `solution` attribute maps each problem name
        to its true answer
    :return: yields tuples (problem name, positives, negatives, test, solution), where the
        positives are the known documents of the problem and the negatives are the known
        documents of all the other problems
    """
    by_name = request_dict.problem
    names = sorted(by_name.keys())
    for target_pos, target in enumerate(names):
        positives = by_name[target]['known']
        negatives = []
        for other_pos, other in enumerate(names):
            if other_pos != target_pos:
                negatives.extend(by_name[other]['known'])
        test = by_name[target]['unknown']
        yield target, positives, negatives, test, request_dict.solution[target]
|
||||
|
||||
|
||||
|
||||
69
src/main.py
69
src/main.py
|
|
@ -1,69 +0,0 @@
|
|||
import disable_sklearn_warnings
|
||||
from sklearn.svm import *
|
||||
from sklearn.model_selection import cross_val_score, GridSearchCV
|
||||
from sklearn.metrics import f1_score, make_scorer
|
||||
from verification import *
|
||||
|
||||
# TODO: other split policies
|
||||
# TODO: understand normalization
|
||||
# TODO: wrap into an Estimator
|
||||
# TODO: check versions (numpy, scipy, sklearn)
|
||||
|
||||
|
||||
SVM = SVC
# SVM = LinearSVC

nfolds = 10
params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced',None]}
if SVM is SVC:
    params['kernel']=['linear','rbf']
    probability = True
else:
    probability = False

path = '../testi'

reader = DocumentLoader(function_words_freq=True, features_Mendenhall=True,
                        tfidf=True, tfidf_feat_selection_ratio=0.1,
                        ngrams=True, ns=[3,4,5],
                        split_documents=True, split_policy=split_by_sentences, normalize_features=True, window_size=1, verbose=True)

Xtr,ytr,ep1,ep2 = reader.load_documents(path)

# learn a SVM; probability estimates have to be requested at construction time, otherwise
# the later call to predict_proba fails (only SVC accepts the parameter)
svm = SVM(probability=True) if probability else SVM()

positive_examples = ytr.sum()
if positive_examples>nfolds:
    print('optimizing {}'.format(svm.__class__.__name__))
    svm = GridSearchCV(svm, param_grid=params, cv=nfolds, scoring=make_scorer(f1_score), n_jobs=-1)

svm.fit(Xtr, ytr)

# evaluation of results: cross-validation scores exist only if the grid search was run
if isinstance(svm, GridSearchCV):
    print('Best params: {}'.format(svm.best_params_))
    print('computing the cross-val score')
    # best_score_ is the mean F1 across folds of the best configuration; its standard
    # deviation is stored in cv_results_ (calling .std() on the scalar always gives 0)
    f1_mean = svm.best_score_
    f1_std = svm.cv_results_['std_test_score'][svm.best_index_]
    print('F1-measure={:.3f} (+-{:.3f})\n'.format(f1_mean, f1_std))
|
||||
|
||||
# final test
|
||||
def predictEpistola(ep, epistola_name):
    """
    Print the predictions of the module-level classifier `svm` for one epistle.
    :param ep: document-by-feature matrix whose first row is the whole document and whose
        remaining rows (if any) are its fragments
    :param epistola_name: name of the epistle, used only in the printed messages
    """
    pred = svm.predict(ep)
    full_doc_prediction = pred[0]
    print('{} is from Dante: {}'.format(epistola_name, 'Yes' if full_doc_prediction == 1 else 'No'))

    # fragments exist only if there are rows after the first one
    has_fragments = len(pred) > 1
    if has_fragments:
        fragment_predictions= pred[1:]
        print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))

    if SVM is SVC and probability:
        prob = svm.predict_proba(ep)[:,1]
        np.set_printoptions(precision=2, linewidth=200)
        if has_fragments:
            print('probabilistic view: full={:.3f}, fragments average {:.3f}, fragments={}'.format(prob[0], prob[1:].mean(), prob[1:]))
        else:
            # no fragments: averaging an empty slice would only yield nan and a warning
            print('probabilistic view: full={:.3f}'.format(prob[0]))
|
||||
|
||||
print('Predicting the Epistolas')
# same report for both parts of the epistle, in order
for _epistola, _epistola_name in ((ep1, 'Epistola 1'), (ep2, 'Epistola 2')):
    predictEpistola(_epistola, _epistola_name)
|
||||
|
|
@ -0,0 +1,77 @@
|
|||
from sklearn.metrics import f1_score
|
||||
from sklearn.metrics import make_scorer
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
|
||||
from util import disable_sklearn_warnings
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.svm import *
|
||||
from data.features import *
|
||||
|
||||
class RandomVerificator:
    """
    Baseline verifier that learns nothing and answers with a random score.
    """

    def __init__(self):
        pass

    def fit(self, positives, negatives):
        # nothing to learn: both arguments are ignored
        return None

    def predict(self, test):
        # the document is ignored; the score is a random float drawn from [0, 1)
        score = np.random.rand()
        return score
|
||||
|
||||
class AuthorshipVerificator:
    """
    Binary classifier deciding whether documents were written by the author of interest.

    The underlying estimator is tuned by grid search (F1 as the model selection criterion)
    when there are enough positive examples, and fitted with default parameters otherwise.
    """

    def __init__(self, nfolds=10,
                 params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced',None]},
                 estimator=SVC):
        """
        :param nfolds: number of cross-validation folds; also the minimum number of positive
            examples required to run the grid search
        :param params: the grid of hyper-parameters explored by the grid search
        :param estimator: the class of the classifier: SVC, LinearSVC or LogisticRegression
        :raises ValueError: if the estimator is none of the supported classes
        """
        self.nfolds = nfolds
        # work on a copy: the default dict is shared by all the calls, so adding 'kernel' to it
        # would leak that entry into instances created later with a different estimator
        self.params = dict(params)
        if estimator is SVC:
            self.params['kernel'] = ['linear', 'rbf']
            self.probability = True
            self.svm = estimator(probability=self.probability)
        elif estimator is LinearSVC:
            self.probability = False
            self.svm = estimator()
        elif estimator is LogisticRegression:
            self.probability = True
            self.svm = LogisticRegression()
        else:
            # fail here rather than with an AttributeError in fit
            raise ValueError('unsupported estimator {}: expected SVC, LinearSVC or LogisticRegression'
                             .format(estimator))

    def fit(self,X,y):
        """
        Fit the classifier, optimizing its hyper-parameters if possible.
        :param X: document-by-feature matrix
        :param y: binary labels (1 for the author of interest)
        :return: self
        """
        if not isinstance(y,np.ndarray): y=np.array(y)
        positive_examples = y.sum()
        if positive_examples >= self.nfolds:
            print('optimizing {}'.format(self.svm.__class__.__name__))
            self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=self.nfolds, scoring=make_scorer(f1_score), n_jobs=-1)
        else:
            # too few positive examples for cross-validation: use the default parameters
            self.estimator = self.svm

        self.estimator.fit(X, y)

        if isinstance(self.estimator, GridSearchCV):
            print('Best params: {}'.format(self.estimator.best_params_))
            print('computing the cross-val score')
            # best_score_ is the mean F1 across folds of the best configuration; its standard
            # deviation is stored in cv_results_ (calling .std() on the scalar always gives 0)
            f1_mean = self.estimator.best_score_
            f1_std = self.estimator.cv_results_['std_test_score'][self.estimator.best_index_]
            print('F1-measure={:.3f} (+-{:.3f})\n'.format(f1_mean, f1_std))

        return self

    def predict(self, test, epistola_name=''):
        """
        Predict the labels of a document and of its fragments.
        :param test: matrix whose first row is the whole document and whose remaining rows
            (if any) are its fragments
        :param epistola_name: name of the document, used only in the printed messages
        :return: the prediction for the whole document, or the tuple (prediction for the whole
            document, predictions for the fragments) if test has more than one row
        """
        pred = self.estimator.predict(test)
        full_doc_prediction = pred[0]
        print('{} is from the same author: {}'.format(epistola_name, 'Yes' if full_doc_prediction == 1 else 'No'))
        if len(pred) > 1:
            fragment_predictions = pred[1:]
            print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
            return full_doc_prediction, fragment_predictions
        return full_doc_prediction

    def predict_proba(self, test, epistola_name=''):
        """
        Predict the probability of the positive class for a document and its fragments.
        :param test: matrix whose first row is the whole document and whose remaining rows
            (if any) are its fragments
        :param epistola_name: name of the document, used only in the printed messages
        :return: the probability for the whole document, or the tuple (probability for the whole
            document, probabilities for the fragments) if test has more than one row
        """
        assert self.probability, 'svm is not calibrated'
        pred = self.estimator.predict_proba(test)
        # column 1 holds the probability of the positive class
        full_doc_prediction = pred[0,1]
        print('{} is from the same author: {}'.format(epistola_name, full_doc_prediction))
        if len(pred) > 1:
            fragment_predictions = pred[1:,1]
            print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
            return full_doc_prediction, fragment_predictions
        return full_doc_prediction
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,85 @@
|
|||
from joblib import Parallel
|
||||
from joblib import delayed
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from util import disable_sklearn_warnings
|
||||
from sklearn.svm import LinearSVC, SVC
|
||||
from data.features import FeatureExtractor
|
||||
from data.pan2015 import fetch_PAN2015, TaskGenerator
|
||||
from model import AuthorshipVerificator
|
||||
import numpy as np
|
||||
from sklearn.metrics import f1_score, roc_auc_score
|
||||
|
||||
def evaluation(y_pred, y_prob, y_true):
    """
    Print accuracy, F1, AUC and their PAN-style combination for a batch of decisions.
    :param y_pred: sequence with the binary decisions
    :param y_prob: sequence with the scores handed to roc_auc_score
    :param y_true: sequence with the true binary labels
    :return: the product of accuracy and AUC
    """
    predicted = np.array(y_pred)
    scores = np.array(y_prob)
    expected = np.array(y_true)

    acc = (predicted == expected).mean()
    f1 = f1_score(expected, predicted)
    auc = roc_auc_score(expected, scores)
    pan_eval = acc * auc

    report = (
        ('Accuracy = {:.3f}', acc),
        ('F1 = {:.3f}', f1),
        ('AUC = {:.3f}', auc),
        ('Acc*AUC = {:.3f}', pan_eval),
    )
    for template, value in report:
        print(template.format(value))
    # the raw sequences (not the arrays) are printed
    print('true:', y_true)
    print('pred:', y_pred)

    return pan_eval
|
||||
|
||||
|
||||
def doall(problem,pos,neg,test,truth):
    """
    Solve one verification problem: extract the features, fit the verifier, classify the test.
    :param problem: name of the problem (echoed in the log and in the result)
    :param pos: list of texts of the positive class
    :param neg: list of texts of the negative class
    :param test: text of the document to verify
    :param truth: true answer, returned untouched
    :return: the tuple (problem, probability, prediction, truth); the probability is the
        prediction itself if the verifier gives no probabilities
    """
    print('[Start]{}'.format(problem))

    # NOTE(review): `lang` is not a parameter; it is read from the module globals set in the
    # __main__ section — confirm it is defined wherever this function is run
    extractor_settings = dict(
        function_words_freq=lang,
        features_Mendenhall=True,
        tfidf=False,
        tfidf_feat_selection_ratio=0.1,
        ngrams=True,
        ns=[4, 5],
        split_documents=False,
        normalize_features=True,
        verbose=True,
    )
    feature_extractor = FeatureExtractor(**extractor_settings)
    verifier = AuthorshipVerificator(nfolds=3, estimator=LogisticRegression)

    X, y = feature_extractor.fit(pos, neg)
    test_features = feature_extractor.transform(test)

    verifier.fit(X, y)
    prediction = verifier.predict(test_features)
    probability = verifier.predict_proba(test_features) if verifier.probability else prediction

    print('[End]{}'.format(problem))
    return problem, probability, prediction, truth
|
||||
|
||||
# print('{}-->{:.3f} decision={}'.format(problem, probability, prediction))
|
||||
# print('pred={} truth={}'.format(prediction, truth))
|
||||
#
|
||||
# y_prob.append(probability)
|
||||
# y_pred.append(prediction)
|
||||
# y_true.append(truth)
|
||||
#
|
||||
# acc_auc = evaluation(y_pred, y_prob, y_true)
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
    split = 'test'
    lang = 'spanish'
    request = fetch_PAN2015(split, lang=lang)

    with open('results_ngrams.csv', 'wt') as fo:
        # solve all the problems in parallel, one job per problem
        jobs = (delayed(doall)(*task) for task in TaskGenerator(request))
        outcomes = Parallel(n_jobs=-1)(jobs)

        # one line per problem in the results file, while collecting the columns to evaluate
        y_pred, y_prob, y_true = [], [], []
        for problem, probability, prediction, truth in outcomes:
            fo.write('{} {:.3f}\n'.format(problem, probability))
            y_pred += [prediction]
            y_prob += [probability]
            y_true += [truth]

        acc_auc = evaluation(y_pred, y_prob, y_true)
        print('ACC * AUC = {:.3f}'.format(acc_auc))

    print('done')
|
||||
Loading…
Reference in New Issue