LOO validation with document-groups and gridsearch with StratifiedKFold

parent b2ab5556e1
commit 9a0ad7dd3d
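The commit threads per-document group labels through the whole pipeline: splitter() and FeatureExtractor.fit_transform() now return a group id for every row (the index of the source document, shared by the full text and all fragments cut from it), AuthorshipVerificator.fit() accepts the groups and tunes hyperparameters with StratifiedKFold instead of LeaveOneOut, and a new leave_one_out() method evaluates with LeaveOneGroupOut so that a document and its fragments never end up on opposite sides of a split.

A minimal sketch (toy data, not part of the commit) of the splitting behaviour the group ids enable:

    import numpy as np
    from sklearn.model_selection import LeaveOneGroupOut

    X = np.zeros((6, 1))                   # toy feature rows
    y = np.array([1, 0, 1, 1, 0, 0])       # toy labels
    groups = np.array([0, 1, 0, 0, 1, 1])  # rows 2-3 are fragments of doc 0, rows 4-5 of doc 1

    for train, test in LeaveOneGroupOut().split(X, y, groups):
        print(train, test)
    # [1 4 5] [0 2 3]  <- doc 0 and all of its fragments are held out together
    # [0 2 3] [1 4 5]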
@@ -12,7 +12,7 @@ from util.color_visualization import color
 # TODO: inspect the impact of chi-squared correlations against positive-only (or positive and negative) correlations for feature selection
 # TODO: sentence length (Mendenhall-style) ?

-for epistola in [1,2]:
+for epistola in [2]:
     print('Epistola {}'.format(epistola))
     print('='*80)
     path = '../testi_{}'.format(epistola)
@@ -20,6 +20,7 @@ for epistola in [1,2]:
         path+='_with_GuidoDaPisa'

     positive, negative, ep_text = load_texts(path, unknown_target='EpistolaXIII_{}.txt'.format(epistola))
+    n_full_docs = len(positive) + len(negative)

     feature_extractor = FeatureExtractor(function_words_freq='latin',
                                          conjugations_freq='latin',
@@ -27,15 +28,17 @@ for epistola in [1,2]:
                                          tfidf_feat_selection_ratio=0.1,
                                          wordngrams=False, n_wordngrams=(1, 2),
                                          charngrams=True, n_charngrams=(3, 4, 5), preserve_punctuation=False,
-                                         split_documents=False, split_policy=split_by_sentences, window_size=3,
+                                         split_documents=True, split_policy=split_by_sentences, window_size=3,
                                          normalize_features=True)

-    Xtr,ytr = feature_extractor.fit_transform(positive, negative)
+    Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative)
+    print(ytr)

    ep, ep_fragments = feature_extractor.transform(ep_text, return_fragments=True, window_size=3)

    print('Fitting the Verificator')
    av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
-    av.fit(Xtr,ytr)
+    av.fit(Xtr,ytr,groups)

    print('Predicting the Epistola {}'.format(epistola))
    title = 'Epistola {}'.format('I' if epistola==1 else 'II')
@@ -43,8 +46,12 @@ for epistola in [1,2]:
     fulldoc_prob, fragment_probs = av.predict_proba(ep, title)
     # color(path='../dante_color/epistola{}.html'.format(epistola), texts=ep_fragments, probabilities=fragment_probs, title=title)

-    param = 'All'
-    # with open('features{}.csv'.format(epistola), 'at') as fo:
-    # validation=av.estimator.best_score_.mean()
-    # nfeatures = Xtr.shape[1]
-    # fo.write('{}\t{}\t{:.0f}\t{:.3f}\t{:.3f}\n'.format(param, value, nfeatures, validation, fulldoc_prob))
+    score_ave, score_std = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=False)
+    print('LOO[full-and-fragments]={:.3f} +-{:.5f}'.format(score_ave, score_std))
+
+    score_ave, score_std = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=True)
+    print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std))
+
+    score_ave, score_std = av.leave_one_out(Xtr, ytr, None)
+    print('LOO[w/o groups]={:.3f} +-{:.5f}'.format(score_ave, score_std))
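The commented-out CSV logging gives way to three runs of the new leave_one_out(): with groups and test_lowest_index_only=False every held-out row (full document and fragments alike) is scored; with test_lowest_index_only=True only the full document of each held-out group is scored; with groups=None a plain per-row leave-one-out is run, whose score is likely optimistic because fragments of a test document can then appear in training. The hunks below belong to the feature-extraction module (the file name is not shown in this view; judging by the later import from data.features import *, presumably data/features.py).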
@@ -49,6 +49,10 @@ latin_conjugations = ['o', 'eo', 'io', 'as', 'es', 'is', 'at', 'et', 'it', 'amus
                      'sim', 'sis', 'sit', 'simus', 'sitis', 'sint', 'essem', 'esses', 'esset', 'essemus', 'essetis', 'essent',
                      'fui', 'fuisti', 'fuit', 'fuimus', 'fuistis', 'fuerunt', 'este', 'esto', 'estote', 'sunto']

+spanish_conjugations = ['o','as','a','amos','áis','an','es','e','emos','éis','en','imos','ís','guir','ger','gir',
+                        'ar','er','ir','é','aste','ó','asteis','aron','í','iste','ió','isteis','ieron',
+                        'aba', 'abas', 'ábamos', 'aban', 'ía', 'ías', 'íamos', 'íais', 'ían', 'ás','á',
+                        'án','estoy','estás','está','estamos','estáis','están']

 def get_conjugations(lang):
     if lang == 'latin':
@@ -95,17 +99,19 @@ def windows(text_fragments, window_size):
 def splitter(documents, authors=None, split_policy=split_by_sentences, window_size=1):
     fragments = []
     authors_fragments = []
+    groups = []
     for i, text in enumerate(documents):
         text_fragments = split_policy(text)
         text_fragments = windows(text_fragments, window_size=window_size)
         fragments.extend(text_fragments)
+        groups.extend([i]*len(text_fragments))
         if authors is not None:
             authors_fragments.extend([authors[i]] * len(text_fragments))

     if authors is not None:
-        return fragments, authors_fragments
+        return fragments, authors_fragments, groups

-    return fragments
+    return fragments, groups


 def tokenize(text):
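A hypothetical illustration (inputs invented for the example) of the new bookkeeping in splitter(): every fragment is tagged with the index of the document it was cut from. Assuming split_by_sentences yields one fragment per sentence and window_size=1 keeps them as-is:

    docs = ['First sentence. Second sentence. Third sentence.', 'A lone sentence.']
    fragments, groups = splitter(docs, split_policy=split_by_sentences, window_size=1)
    # fragments -> one entry per sentence window
    # groups    -> [0, 0, 0, 1]: all fragments of docs[0] share group id 0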
@@ -280,17 +286,20 @@ class FeatureExtractor:
         documents = positives + negatives
         authors = [1]*len(positives) + [0]*len(negatives)
         n_original_docs = len(documents)
+        groups = list(range(n_original_docs))

         if self.split_documents:
-            doc_fragments, authors_fragments = splitter(documents, authors,
+            doc_fragments, authors_fragments, groups_fragments = splitter(documents, authors,
                                                         split_policy=self.split_policy,
                                                         window_size=self.window_size)
             documents.extend(doc_fragments)
             authors.extend(authors_fragments)
+            groups.extend(groups_fragments)
             self._print('splitting documents: {} documents'.format(len(doc_fragments)))

         # represent the target vector
         y = np.array(authors)
+        groups = np.array(groups)

         # initialize the document-by-feature vector
         X = np.empty((len(documents), 0))
@@ -345,7 +354,7 @@ class FeatureExtractor:
         print('y prevalence: {}/{} {:.2f}%'.format(y.sum(),len(y),y.mean() * 100))
         print()

-        return X, y
+        return X, y, groups


     def transform(self, test, return_fragments=False, window_size=-1):
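Note on fit_transform(): the matrix is built from the full documents first (rows 0 to n-1, with groups = [0, 1, ..., n-1]) and only then extended with the fragments, each carrying the group id of its source document. Within any group g the smallest row index is therefore always the full document g; the leave_one_out() method added in src/model.py below relies on exactly this layout.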
@@ -354,7 +363,8 @@ class FeatureExtractor:
             window_size = self.window_size

         if self.split_documents:
-            test.extend(splitter(test, split_policy=self.split_policy, window_size=window_size))
+            tests, _ = splitter(test, split_policy=self.split_policy, window_size=window_size)
+            test.extend(tests)

         # initialize the document-by-feature vector
         TEST = np.empty((len(test), 0))
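Note on the transform() hunk above: since splitter() now also returns group ids, the old test.extend(splitter(...)) would have appended a tuple instead of the fragments; the new code unpacks the fragments and discards the groups, which are not needed at prediction time.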
35  src/model.py
@@ -1,7 +1,8 @@
 from util import disable_sklearn_warnings
 from sklearn.metrics import f1_score
 from sklearn.metrics import make_scorer
-from sklearn.model_selection import GridSearchCV, LeaveOneOut
+from sklearn.model_selection import GridSearchCV, LeaveOneOut, LeaveOneGroupOut, cross_val_score, GroupKFold, KFold, \
+    StratifiedKFold
 from sklearn.linear_model import LogisticRegression
 from sklearn.svm import *
 from data.features import *
@@ -29,7 +30,7 @@ def f1(true_labels, predicted_labels):
 class AuthorshipVerificator:

     def __init__(self, nfolds=10,
-                 params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced',None]},
+                 params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced']},
                  estimator=SVC):
         self.nfolds = nfolds
         self.params = params
@@ -44,13 +45,16 @@ class AuthorshipVerificator:
         self.probability = True
         self.svm = LogisticRegression()

-    def fit(self,X,y):
+    def fit(self,X,y,groups=None):
         if not isinstance(y,np.ndarray): y=np.array(y)
         positive_examples = y.sum()
-        if True or positive_examples >= self.nfolds:
+        if positive_examples >= self.nfolds:
             print('optimizing {}'.format(self.svm.__class__.__name__))
-            self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=LeaveOneOut(), scoring=make_scorer(f1), n_jobs=-1, verbose=10)
-            # self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=self.nfolds, scoring=make_scorer(f1), n_jobs=-1, verbose=10)
+            # if groups is None or len(np.unique(groups[y==1])):
+            folds = list(StratifiedKFold(n_splits=self.nfolds).split(X, y))
+            # folds = list(GroupKFold(n_splits=self.nfolds).split(X,y,groups))
+
+            self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
         else:
             self.estimator = self.svm
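Hyperparameter search now runs on precomputed StratifiedKFold splits instead of LeaveOneOut, so every fold preserves the positive/negative ratio of this small, imbalanced corpus; the commented-out GroupKFold line is the stricter alternative that would additionally keep fragments of one document on the same side of each fold (with StratifiedKFold they can still straddle one). A self-contained sketch (toy data, not part of the commit) of the pattern fit() uses:

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV, StratifiedKFold

    X = np.random.rand(20, 5)          # toy feature matrix
    y = np.array([1] * 10 + [0] * 10)  # toy labels, 10 positive / 10 negative

    # materialize the folds once, then hand them to GridSearchCV as an explicit list
    folds = list(StratifiedKFold(n_splits=10).split(X, y))
    search = GridSearchCV(LogisticRegression(), param_grid={'C': [0.1, 1, 10]}, cv=folds)
    search.fit(X, y)
    print(search.best_params_)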
@@ -62,9 +66,28 @@ class AuthorshipVerificator:
             f1scores = self.estimator.best_score_
             f1_mean, f1_std = f1scores.mean(), f1scores.std()
             print('F1-measure={:.3f} (+-{:.3f} cv={})\n'.format(f1_mean, f1_std, f1scores))
+            self.estimator = self.estimator.best_estimator_

         return self

+    def leave_one_out(self, X, y, groups=None, test_lowest_index_only=True):
+
+        if groups is None:
+            print('Computing LOO without groups')
+            folds = list(LeaveOneOut().split(X,y))
+        else:
+            print('Computing LOO with groups')
+            logo = LeaveOneGroupOut()
+            folds=list(logo.split(X,y,groups))
+            if test_lowest_index_only:
+                print('ignoring fragments')
+                folds = [(train, np.min(test, keepdims=True)) for train, test in folds]
+
+        scores = cross_val_score(self.estimator, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
+        print(scores)
+
+        return scores.mean(), scores.std()
+
     def predict(self, test, epistola_name=''):
         pred = self.estimator.predict(test)
         full_doc_prediction = pred[0]
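leave_one_out() holds out one whole group per fold; with test_lowest_index_only=True it then shrinks each test fold to its lowest row index, which by the row layout noted earlier is the full document, so the score reflects whole-document predictions while that document's fragments still stay out of training. A hypothetical illustration (indices invented):

    import numpy as np

    # say LeaveOneGroupOut held out group 0: the full document (row 0)
    # plus its two fragments (rows 6 and 7)
    folds = [(np.array([1, 2, 3, 4, 5]), np.array([0, 6, 7]))]

    # keep only the lowest test index: the full document is scored alone,
    # while its fragments remain excluded from the training side of the fold
    folds = [(train, np.min(test, keepdims=True)) for train, test in folds]
    print(folds[0][1])  # [0]

Each pruned fold then contains a single test row, so the per-fold value of make_scorer(f1) is 0 or 1; how a correctly rejected negative document is counted depends on how the custom f1() handles a fold with no positive labels.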