print misclassified

This commit is contained in:
Alejandro Moreo Fernandez 2019-06-10 14:10:57 +02:00
parent 56770446bd
commit 23e62162b5
7 changed files with 46 additions and 34 deletions

View File

@@ -59,29 +59,29 @@ def plot_attribution(path, authors, attributions, paragraph_offset=1, figsize=(5
# plt.show()
plt.savefig(path)
import sys
for epistola in [1]:
if epistola == 1:
authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
paragraph_offset = 1
figsize=(3,9)
label_offset=0.2
else:
authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna',
'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
'GrazioloBambaglioli', 'GuidoDaPisa',
'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet',
'PietroAlighieri', 'RaimundusLullus',
'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
paragraph_offset = 14
figsize = (6,20)
label_offset=0.3
attributions = np.load(f'attribution_ep{epistola}.npy')
plot_attribution(f'plot{epistola}.png', authors, attributions, paragraph_offset=paragraph_offset, figsize=figsize, label_offset=label_offset)
sys.exit(0)
# import sys
# for epistola in [1]:
# if epistola == 1:
# authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
# paragraph_offset = 1
# figsize=(3,9)
# label_offset=0.2
#
# else:
# authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna',
# 'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
# 'GrazioloBambaglioli', 'GuidoDaPisa',
# 'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
# 'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet',
# 'PietroAlighieri', 'RaimundusLullus',
# 'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
# paragraph_offset = 14
# figsize = (6,20)
# label_offset=0.3
#
# attributions = np.load(f'attribution_ep{epistola}.npy')
# plot_attribution(f'plot{epistola}.png', authors, attributions, paragraph_offset=paragraph_offset, figsize=figsize, label_offset=label_offset)
# sys.exit(0)
for epistola in [1]:
@@ -116,7 +116,7 @@ for epistola in [1]:
print('=' * 80)
target = [f'EpistolaXIII_{epistola}.txt'] + [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs]
positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=target)
positive, negative, _, _, ep_texts = load_texts(path, positive_author=author, unknown_target=target)
# if len(positive) < 2:
# discarded += 1
# continue
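Apart from commenting out the plotting driver above, the change in this file is the call-site unpacking: load_texts now returns five values instead of three, and callers that do not need the new file-name lists simply discard them. A minimal sketch of that pattern, assuming a hypothetical module name and corpus path (only the 5-tuple shape is taken from this commit):

# Sketch of the new call-site pattern; "corpus_loader" and the path are hypothetical.
from corpus_loader import load_texts

path = '../corpus'                      # hypothetical corpus directory
target = 'EpistolaXIII_1.txt'
positive, negative, _, _, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target)
print(f'{len(positive)} positive and {len(negative)} negative training texts loaded')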

View File

@@ -111,7 +111,7 @@ for epistola in [1]:
print('=' * 80)
target = [f'Epistola_ArigoVII.txt'] + [f'Epistola_ArigoVII_{paragraph}.txt' for paragraph in range(1,6)]
positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
positive, negative, _, _, ep_texts = load_texts(path, positive_author=author, unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
n_full_docs = len(positive) + len(negative)

View File

@@ -37,7 +37,8 @@ for epistola in [1]:
if epistola==2:
path+='_interaEpistola'
positive, negative, ep_text = load_texts(path, positive_author=author, unknown_target='EpistolaXIII_{}.txt'.format(epistola))
positive, negative, pos_files, neg_files, ep_text = load_texts(path, positive_author=author, unknown_target='EpistolaXIII_{}.txt'.format(epistola))
files = np.asarray(pos_files + neg_files)
if len(positive) < 2:
discarded+=1
continue
@@ -55,6 +56,7 @@ for epistola in [1]:
split_documents=True, split_policy=split_by_sentences, window_size=3,
normalize_features=True)
Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative)
print(ytr)
@@ -64,7 +66,7 @@ for epistola in [1]:
av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
av.fit(Xtr,ytr,groups)
score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=True, counters=True)
score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, files, groups, test_lowest_index_only=True, counters=True)
# print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std))
f1_scores.append(f1_from_counters(tp, fp, fn, tn))
counters.append((tp, fp, fn, tn))
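The files array assembled above is what lets leave_one_out name the documents it got wrong: the diff concatenates positive file names before negative ones, which presumably mirrors the order in which fit_transform(positive, negative) numbers the document groups, so index i of files describes the i-th full document. A toy, self-contained illustration of that ordering assumption (all names and labels here are invented, not repository data):

import numpy as np

# Toy check of the alignment assumption: files[i] must refer to the same document
# as the i-th entry of the positive-then-negative training sequence.
pos_files = ['Dante_epistola_a.txt', 'Dante_epistola_b.txt']   # hypothetical names
neg_files = ['GuidoFaba_dictamen.txt']                         # hypothetical name
files = np.asarray(pos_files + neg_files)

y = np.array([1, 1, 0])    # labels built in the same positive-then-negative order
assert len(files) == len(y)
for name, label in zip(files, y):
    print(label, name)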

View File

@@ -15,7 +15,7 @@ import os
# TODO: sentence length (Mendenhall-style) ?
for epistola in [1]:
for epistola in [2]:
print('Epistola {}'.format(epistola))
print('='*80)
@@ -26,7 +26,7 @@ for epistola in [1]:
paragraphs = range(14, 91)
target = [f'EpistolaXIII_{epistola}.txt'] + [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs]
positive, negative, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target)
positive, negative, _, _, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target)
pickle_file = f'../dante_color/epistola{epistola}.pkl'
if os.path.exists(pickle_file):
@@ -35,6 +35,7 @@ for epistola in [1]:
for prob,text in zip(probabilities,ep_texts):
text = text.replace('\n','')
print(f"{prob:.3f}:{text}")
print(f'media={np.asarray(probabilities[1:]).mean()}')
else:
print(f'generating pickle file')
n_full_docs = len(positive) + len(negative)
@@ -50,6 +51,7 @@ for epistola in [1]:
split_documents=True, split_policy=split_by_sentences, window_size=3,
normalize_features=True)
Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative)
print(ytr)

View File

@@ -15,7 +15,7 @@ for epistola in [1,2,3]: #3 means "both Ep1 and Ep2 corpora"
paragraphs = range(1, 6)
target = [f'Epistola_ArigoVII.txt'] + [f'Epistola_ArigoVII_{paragraph}.txt' for paragraph in paragraphs]
positive, negative, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
positive, negative, _, _, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
pickle_file = f'../dante_color/epistola{epistola}_xiv.pkl'
if os.path.exists(pickle_file):

View File

@@ -3,6 +3,8 @@ from os.path import join
import re
import collections
# ------------------------------------------------------------------------
# document loading routine
# ------------------------------------------------------------------------
@@ -31,6 +33,7 @@ def remove_citations(doc):
def load_texts(path, positive_author='Dante', unknown_target=None, train_skip_prefix='EpistolaXIII_'):
# load the training data (all documents but Epistolas 1 and 2)
positive, negative = [], []
files_positive, files_negative = [], []
authors = []
ndocs=0
for file in os.listdir(path):
@@ -42,8 +45,10 @@ def load_texts(path, positive_author='Dante', unknown_target=None, train_skip_pr
if author == positive_author:
positive.append(text)
files_positive.append(file)
else:
negative.append(text)
files_negative.append(file)
authors.append(author)
ndocs+=1
@@ -57,10 +62,10 @@ def load_texts(path, positive_author='Dante', unknown_target=None, train_skip_pr
unknown = remove_citations(unknown)
unknowns.append(unknown)
if len(unknowns) == 1: unknowns = unknowns[0]
return positive, negative, unknowns
return positive, negative, files_positive, files_negative, unknowns
else:
return positive, negative
return positive, negative, files_positive, files_negative
def ___list_texts(path):
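Because each text and its file name are appended inside the same loop iteration, the two new lists stay index-aligned with positive and negative, and the function now returns five values when an unknown target is requested and four otherwise. A short usage sketch of that contract, assuming a hypothetical import path and corpus directory:

# Hypothetical import; in this repository load_texts lives in the file shown above.
from corpus_loader import load_texts

# With an unknown target: five return values, file lists aligned with the text lists.
positive, negative, files_positive, files_negative, unknowns = load_texts(
    '../corpus', positive_author='Dante', unknown_target='EpistolaXIII_1.txt')
assert len(positive) == len(files_positive)
assert len(negative) == len(files_negative)

# Without an unknown target: four return values instead of the previous two.
positive, negative, files_positive, files_negative = load_texts(
    '../corpus', positive_author='Dante')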

View File

@@ -80,7 +80,7 @@ class AuthorshipVerificator:
return self
def leave_one_out(self, X, y, groups=None, test_lowest_index_only=True, counters=False):
def leave_one_out(self, X, y, files, groups=None, test_lowest_index_only=True, counters=False):
if groups is None:
print('Computing LOO without groups')
@@ -94,7 +94,10 @@ class AuthorshipVerificator:
folds = [(train, np.min(test, keepdims=True)) for train, test in folds]
scores = cross_val_score(self.estimator, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
missclassified = '\n'.join(files[scores==0].tolist())
print(scores)
print(missclassified)
if counters and test_lowest_index_only:
yfull_true = y[:len(folds)]
yfull_predict = np.zeros_like(yfull_true)
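The new reporting line relies on one property: with test_lowest_index_only=True each fold tests a single full document, so cross_val_score returns one score per document and scores == 0 indexes the same positions as files. A toy, self-contained illustration of that masking step (the arrays are invented, not repository output; in this toy a score of 0 simply stands for a misclassified document):

import numpy as np

# One score per held-out document; here a 0 marks a miss.
files = np.asarray(['doc_a.txt', 'doc_b.txt', 'doc_c.txt', 'doc_d.txt'])  # hypothetical
scores = np.array([1.0, 0.0, 1.0, 0.0])      # per-fold score with one test doc per fold

missclassified = '\n'.join(files[scores == 0].tolist())
print(missclassified)   # prints doc_b.txt and doc_d.txt, one per line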