Leave-one-out (LOO) validation with document groups, and grid search with StratifiedKFold

This commit is contained in:
Alejandro Moreo Fernandez 2019-01-07 18:43:39 +01:00
parent b2ab5556e1
commit 9a0ad7dd3d
3 changed files with 60 additions and 20 deletions

View File

@ -12,7 +12,7 @@ from util.color_visualization import color
# TODO: inspect the impact of chi-squared correlations against positive-only (or positive and negative) correlations for feature selection # TODO: inspect the impact of chi-squared correlations against positive-only (or positive and negative) correlations for feature selection
# TODO: sentence length (Mendenhall-style) ? # TODO: sentence length (Mendenhall-style) ?
for epistola in [1,2]: for epistola in [2]:
print('Epistola {}'.format(epistola)) print('Epistola {}'.format(epistola))
print('='*80) print('='*80)
path = '../testi_{}'.format(epistola) path = '../testi_{}'.format(epistola)
@ -20,6 +20,7 @@ for epistola in [1,2]:
path+='_with_GuidoDaPisa' path+='_with_GuidoDaPisa'
positive, negative, ep_text = load_texts(path, unknown_target='EpistolaXIII_{}.txt'.format(epistola)) positive, negative, ep_text = load_texts(path, unknown_target='EpistolaXIII_{}.txt'.format(epistola))
n_full_docs = len(positive) + len(negative)
feature_extractor = FeatureExtractor(function_words_freq='latin', feature_extractor = FeatureExtractor(function_words_freq='latin',
conjugations_freq='latin', conjugations_freq='latin',
@ -27,15 +28,17 @@ for epistola in [1,2]:
tfidf_feat_selection_ratio=0.1, tfidf_feat_selection_ratio=0.1,
wordngrams=False, n_wordngrams=(1, 2), wordngrams=False, n_wordngrams=(1, 2),
charngrams=True, n_charngrams=(3, 4, 5), preserve_punctuation=False, charngrams=True, n_charngrams=(3, 4, 5), preserve_punctuation=False,
split_documents=False, split_policy=split_by_sentences, window_size=3, split_documents=True, split_policy=split_by_sentences, window_size=3,
normalize_features=True) normalize_features=True)
Xtr,ytr = feature_extractor.fit_transform(positive, negative) Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative)
print(ytr)
ep, ep_fragments = feature_extractor.transform(ep_text, return_fragments=True, window_size=3) ep, ep_fragments = feature_extractor.transform(ep_text, return_fragments=True, window_size=3)
print('Fitting the Verificator') print('Fitting the Verificator')
av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression) av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
av.fit(Xtr,ytr) av.fit(Xtr,ytr,groups)
print('Predicting the Epistola {}'.format(epistola)) print('Predicting the Epistola {}'.format(epistola))
title = 'Epistola {}'.format('I' if epistola==1 else 'II') title = 'Epistola {}'.format('I' if epistola==1 else 'II')
@ -43,8 +46,12 @@ for epistola in [1,2]:
fulldoc_prob, fragment_probs = av.predict_proba(ep, title) fulldoc_prob, fragment_probs = av.predict_proba(ep, title)
# color(path='../dante_color/epistola{}.html'.format(epistola), texts=ep_fragments, probabilities=fragment_probs, title=title) # color(path='../dante_color/epistola{}.html'.format(epistola), texts=ep_fragments, probabilities=fragment_probs, title=title)
param = 'All' score_ave, score_std = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=False)
# with open('features{}.csv'.format(epistola), 'at') as fo: print('LOO[full-and-fragments]={:.3f} +-{:.5f}'.format(score_ave, score_std))
# validation=av.estimator.best_score_.mean()
# nfeatures = Xtr.shape[1] score_ave, score_std = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=True)
# fo.write('{}\t{}\t{:.0f}\t{:.3f}\t{:.3f}\n'.format(param, value, nfeatures, validation, fulldoc_prob)) print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std))
score_ave, score_std = av.leave_one_out(Xtr, ytr, None)
print('LOO[w/o groups]={:.3f} +-{:.5f}'.format(score_ave, score_std))

View File

@ -49,6 +49,10 @@ latin_conjugations = ['o', 'eo', 'io', 'as', 'es', 'is', 'at', 'et', 'it', 'amus
'sim', 'sis', 'sit', 'simus', 'sitis', 'sint', 'essem', 'esses', 'esset', 'essemus', 'essetis', 'essent', 'sim', 'sis', 'sit', 'simus', 'sitis', 'sint', 'essem', 'esses', 'esset', 'essemus', 'essetis', 'essent',
'fui', 'fuisti', 'fuit', 'fuimus', 'fuistis', 'fuerunt', 'este', 'esto', 'estote', 'sunto'] 'fui', 'fuisti', 'fuit', 'fuimus', 'fuistis', 'fuerunt', 'este', 'esto', 'estote', 'sunto']
spanish_conjugations = ['o','as','a','amos','áis','an','es','e','emos','éis','en','imos','ís','guir','ger','gir',
'ar','er','ir','é','aste','ó','asteis','aron','í','iste','','isteis','ieron',
'aba', 'abas', 'ábamos', 'aban', 'ía', 'ías', 'íamos', 'íais', 'ían', 'ás','á',
'án','estoy','estás','está','estamos','estáis','están']
def get_conjugations(lang): def get_conjugations(lang):
if lang == 'latin': if lang == 'latin':
@ -95,17 +99,19 @@ def windows(text_fragments, window_size):
def splitter(documents, authors=None, split_policy=split_by_sentences, window_size=1): def splitter(documents, authors=None, split_policy=split_by_sentences, window_size=1):
fragments = [] fragments = []
authors_fragments = [] authors_fragments = []
groups = []
for i, text in enumerate(documents): for i, text in enumerate(documents):
text_fragments = split_policy(text) text_fragments = split_policy(text)
text_fragments = windows(text_fragments, window_size=window_size) text_fragments = windows(text_fragments, window_size=window_size)
fragments.extend(text_fragments) fragments.extend(text_fragments)
groups.extend([i]*len(text_fragments))
if authors is not None: if authors is not None:
authors_fragments.extend([authors[i]] * len(text_fragments)) authors_fragments.extend([authors[i]] * len(text_fragments))
if authors is not None: if authors is not None:
return fragments, authors_fragments return fragments, authors_fragments, groups
return fragments return fragments, groups
def tokenize(text): def tokenize(text):
@ -280,17 +286,20 @@ class FeatureExtractor:
documents = positives + negatives documents = positives + negatives
authors = [1]*len(positives) + [0]*len(negatives) authors = [1]*len(positives) + [0]*len(negatives)
n_original_docs = len(documents) n_original_docs = len(documents)
groups = list(range(n_original_docs))
if self.split_documents: if self.split_documents:
doc_fragments, authors_fragments = splitter(documents, authors, doc_fragments, authors_fragments, groups_fragments = splitter(documents, authors,
split_policy=self.split_policy, split_policy=self.split_policy,
window_size=self.window_size) window_size=self.window_size)
documents.extend(doc_fragments) documents.extend(doc_fragments)
authors.extend(authors_fragments) authors.extend(authors_fragments)
groups.extend(groups_fragments)
self._print('splitting documents: {} documents'.format(len(doc_fragments))) self._print('splitting documents: {} documents'.format(len(doc_fragments)))
# represent the target vector # represent the target vector
y = np.array(authors) y = np.array(authors)
groups = np.array(groups)
# initialize the document-by-feature vector # initialize the document-by-feature vector
X = np.empty((len(documents), 0)) X = np.empty((len(documents), 0))
@ -345,7 +354,7 @@ class FeatureExtractor:
print('y prevalence: {}/{} {:.2f}%'.format(y.sum(),len(y),y.mean() * 100)) print('y prevalence: {}/{} {:.2f}%'.format(y.sum(),len(y),y.mean() * 100))
print() print()
return X, y return X, y, groups
def transform(self, test, return_fragments=False, window_size=-1): def transform(self, test, return_fragments=False, window_size=-1):
@ -354,7 +363,8 @@ class FeatureExtractor:
window_size = self.window_size window_size = self.window_size
if self.split_documents: if self.split_documents:
test.extend(splitter(test, split_policy=self.split_policy, window_size=window_size)) tests, _ = splitter(test, split_policy=self.split_policy, window_size=window_size)
test.extend(tests)
# initialize the document-by-feature vector # initialize the document-by-feature vector
TEST = np.empty((len(test), 0)) TEST = np.empty((len(test), 0))

View File

@ -1,7 +1,8 @@
from util import disable_sklearn_warnings from util import disable_sklearn_warnings
from sklearn.metrics import f1_score from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, LeaveOneOut from sklearn.model_selection import GridSearchCV, LeaveOneOut, LeaveOneGroupOut, cross_val_score, GroupKFold, KFold, \
StratifiedKFold
from sklearn.linear_model import LogisticRegression from sklearn.linear_model import LogisticRegression
from sklearn.svm import * from sklearn.svm import *
from data.features import * from data.features import *
@ -29,7 +30,7 @@ def f1(true_labels, predicted_labels):
class AuthorshipVerificator: class AuthorshipVerificator:
def __init__(self, nfolds=10, def __init__(self, nfolds=10,
params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced',None]}, params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced']},
estimator=SVC): estimator=SVC):
self.nfolds = nfolds self.nfolds = nfolds
self.params = params self.params = params
@ -44,13 +45,16 @@ class AuthorshipVerificator:
self.probability = True self.probability = True
self.svm = LogisticRegression() self.svm = LogisticRegression()
def fit(self,X,y): def fit(self,X,y,groups=None):
if not isinstance(y,np.ndarray): y=np.array(y) if not isinstance(y,np.ndarray): y=np.array(y)
positive_examples = y.sum() positive_examples = y.sum()
if True or positive_examples >= self.nfolds: if positive_examples >= self.nfolds:
print('optimizing {}'.format(self.svm.__class__.__name__)) print('optimizing {}'.format(self.svm.__class__.__name__))
self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=LeaveOneOut(), scoring=make_scorer(f1), n_jobs=-1, verbose=10) # if groups is None or len(np.unique(groups[y==1])):
# self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=self.nfolds, scoring=make_scorer(f1), n_jobs=-1, verbose=10) folds = list(StratifiedKFold(n_splits=self.nfolds).split(X, y))
# folds = list(GroupKFold(n_splits=self.nfolds).split(X,y,groups))
self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
else: else:
self.estimator = self.svm self.estimator = self.svm
@ -62,9 +66,28 @@ class AuthorshipVerificator:
f1scores = self.estimator.best_score_ f1scores = self.estimator.best_score_
f1_mean, f1_std = f1scores.mean(), f1scores.std() f1_mean, f1_std = f1scores.mean(), f1scores.std()
print('F1-measure={:.3f} (+-{:.3f} cv={})\n'.format(f1_mean, f1_std, f1scores)) print('F1-measure={:.3f} (+-{:.3f} cv={})\n'.format(f1_mean, f1_std, f1scores))
self.estimator = self.estimator.best_estimator_
return self return self
def leave_one_out(self, X, y, groups=None, test_lowest_index_only=True):
    """Score ``self.estimator`` with leave-one-out cross-validation (F1).

    Args:
        X: feature matrix handed to ``cross_val_score``.
        y: target vector aligned with the rows of ``X``.
        groups: optional per-row group ids. When ``None`` a plain
            ``LeaveOneOut`` split is used; otherwise ``LeaveOneGroupOut``
            holds out one whole group per fold.
        test_lowest_index_only: with groups, shrink every test fold to its
            single smallest row index while leaving the training fold as is.

    Returns:
        A ``(mean, std)`` tuple computed over the per-fold scores.
    """
    # NOTE(review): the indentation of this method was flattened in the
    # diff view; the `test_lowest_index_only` branch is assumed to be
    # nested under the grouped case -- confirm against the repository.
    if groups is not None:
        print('Computing LOO with groups')
        splitter_cv = LeaveOneGroupOut()
        splits = list(splitter_cv.split(X, y, groups))
        if test_lowest_index_only:
            print('ignoring fragments')
            # Presumably the lowest index of a group is the unsplit document
            # (originals appear to be stored before their fragments) -- verify
            # against FeatureExtractor.fit_transform.
            splits = [
                (train_idx, np.min(test_idx, keepdims=True))
                for train_idx, test_idx in splits
            ]
    else:
        print('Computing LOO without groups')
        splits = list(LeaveOneOut().split(X, y))

    fold_scores = cross_val_score(
        self.estimator, X, y, cv=splits, scoring=make_scorer(f1), n_jobs=-1
    )
    print(fold_scores)
    return fold_scores.mean(), fold_scores.std()
def predict(self, test, epistola_name=''): def predict(self, test, epistola_name=''):
pred = self.estimator.predict(test) pred = self.estimator.predict(test)
full_doc_prediction = pred[0] full_doc_prediction = pred[0]