print missclassified

This commit is contained in:
Alejandro Moreo Fernandez 2019-06-10 14:10:57 +02:00
parent 56770446bd
commit 23e62162b5
7 changed files with 46 additions and 34 deletions

View File

@@ -59,29 +59,29 @@ def plot_attribution(path, authors, attributions, paragraph_offset=1, figsize=(5
 # plt.show()
 plt.savefig(path)
-import sys
+# import sys
-for epistola in [1]:
+# for epistola in [1]:
-if epistola == 1:
+# if epistola == 1:
-authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
+# authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
-paragraph_offset = 1
+# paragraph_offset = 1
-figsize=(3,9)
+# figsize=(3,9)
-label_offset=0.2
+# label_offset=0.2
-
+#
-else:
+# else:
-authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna',
+# authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna',
-'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
+# 'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
-'GrazioloBambaglioli', 'GuidoDaPisa',
+# 'GrazioloBambaglioli', 'GuidoDaPisa',
-'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
+# 'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
-'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet',
+# 'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet',
-'PietroAlighieri', 'RaimundusLullus',
+# 'PietroAlighieri', 'RaimundusLullus',
-'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
+# 'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
-paragraph_offset = 14
+# paragraph_offset = 14
-figsize = (6,20)
+# figsize = (6,20)
-label_offset=0.3
+# label_offset=0.3
-
+#
-attributions = np.load(f'attribution_ep{epistola}.npy')
+# attributions = np.load(f'attribution_ep{epistola}.npy')
-plot_attribution(f'plot{epistola}.png', authors, attributions, paragraph_offset=paragraph_offset, figsize=figsize, label_offset=label_offset)
+# plot_attribution(f'plot{epistola}.png', authors, attributions, paragraph_offset=paragraph_offset, figsize=figsize, label_offset=label_offset)
-sys.exit(0)
+# sys.exit(0)
 for epistola in [1]:
@@ -116,7 +116,7 @@ for epistola in [1]:
 print('=' * 80)
 target = [f'EpistolaXIII_{epistola}.txt'] + [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs]
-positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=target)
+positive, negative, _, _, ep_texts = load_texts(path, positive_author=author, unknown_target=target)
 # if len(positive) < 2:
 # discarded += 1
 # continue

View File

@@ -111,7 +111,7 @@ for epistola in [1]:
 print('=' * 80)
 target = [f'Epistola_ArigoVII.txt'] + [f'Epistola_ArigoVII_{paragraph}.txt' for paragraph in range(1,6)]
-positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
+positive, negative, _, _, ep_texts = load_texts(path, positive_author=author, unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
 n_full_docs = len(positive) + len(negative)

View File

@@ -37,7 +37,8 @@ for epistola in [1]:
 if epistola==2:
 path+='_interaEpistola'
-positive, negative, ep_text = load_texts(path, positive_author=author, unknown_target='EpistolaXIII_{}.txt'.format(epistola))
+positive, negative, pos_files, neg_files, ep_text = load_texts(path, positive_author=author, unknown_target='EpistolaXIII_{}.txt'.format(epistola))
+files = np.asarray(pos_files + neg_files)
 if len(positive) < 2:
 discarded+=1
 continue
@@ -55,6 +56,7 @@ for epistola in [1]:
 split_documents=True, split_policy=split_by_sentences, window_size=3,
 normalize_features=True)
 Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative)
 print(ytr)
@@ -64,7 +66,7 @@ for epistola in [1]:
 av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
 av.fit(Xtr,ytr,groups)
-score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=True, counters=True)
+score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, files, groups, test_lowest_index_only=True, counters=True)
 # print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std))
 f1_scores.append(f1_from_counters(tp, fp, fn, tn))
 counters.append((tp, fp, fn, tn))

View File

@@ -15,7 +15,7 @@ import os
 # TODO: sentence length (Mendenhall-style) ?
-for epistola in [1]:
+for epistola in [2]:
 print('Epistola {}'.format(epistola))
 print('='*80)
@@ -26,7 +26,7 @@ for epistola in [1]:
 paragraphs = range(14, 91)
 target = [f'EpistolaXIII_{epistola}.txt'] + [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs]
-positive, negative, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target)
+positive, negative, _, _, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target)
 pickle_file = f'../dante_color/epistola{epistola}.pkl'
 if os.path.exists(pickle_file):
@@ -35,6 +35,7 @@ for epistola in [1]:
 for prob,text in zip(probabilities,ep_texts):
 text = text.replace('\n','')
 print(f"{prob:.3f}:{text}")
+print(f'media={np.asarray(probabilities[1:]).mean()}')
 else:
 print(f'generating pickle file')
 n_full_docs = len(positive) + len(negative)
@@ -50,6 +51,7 @@ for epistola in [1]:
 split_documents=True, split_policy=split_by_sentences, window_size=3,
 normalize_features=True)
 Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative)
 print(ytr)

View File

@@ -15,7 +15,7 @@ for epistola in [1,2,3]: #3 means "both Ep1 and Ep2 corpora"
 paragraphs = range(1, 6)
 target = [f'Epistola_ArigoVII.txt'] + [f'Epistola_ArigoVII_{paragraph}.txt' for paragraph in paragraphs]
-positive, negative, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
+positive, negative, _, _, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
 pickle_file = f'../dante_color/epistola{epistola}_xiv.pkl'
 if os.path.exists(pickle_file):

View File

@@ -3,6 +3,8 @@ from os.path import join
 import re
 import collections
 # ------------------------------------------------------------------------
 # document loading routine
 # ------------------------------------------------------------------------
@@ -30,7 +32,8 @@ def remove_citations(doc):
 def load_texts(path, positive_author='Dante', unknown_target=None, train_skip_prefix='EpistolaXIII_'):
 # load the training data (all documents but Epistolas 1 and 2)
-positive,negative = [],[]
+positive, negative = [], []
+files_positive, files_negative = [], []
 authors = []
 ndocs=0
 for file in os.listdir(path):
@@ -42,8 +45,10 @@ def load_texts(path, positive_author='Dante', unknown_target=None, train_skip_pr
 if author == positive_author:
 positive.append(text)
+files_positive.append(file)
 else:
 negative.append(text)
+files_negative.append(file)
 authors.append(author)
 ndocs+=1
@@ -57,10 +62,10 @@ def load_texts(path, positive_author='Dante', unknown_target=None, train_skip_pr
 unknown = remove_citations(unknown)
 unknowns.append(unknown)
 if len(unknowns) == 1: unknowns = unknowns[0]
-return positive, negative, unknowns
+return positive, negative, files_positive, files_negative, unknowns
 else:
-return positive, negative
+return positive, negative, files_positive, files_negative
def ___list_texts(path): def ___list_texts(path):

View File

@@ -80,7 +80,7 @@ class AuthorshipVerificator:
 return self
-def leave_one_out(self, X, y, groups=None, test_lowest_index_only=True, counters=False):
+def leave_one_out(self, X, y, files, groups=None, test_lowest_index_only=True, counters=False):
 if groups is None:
 print('Computing LOO without groups')
@@ -94,7 +94,10 @@ class AuthorshipVerificator:
 folds = [(train, np.min(test, keepdims=True)) for train, test in folds]
 scores = cross_val_score(self.estimator, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
+missclassified = '\n'.join(files[scores==0].tolist())
 print(scores)
+print(missclassified)
 if counters and test_lowest_index_only:
 yfull_true = y[:len(folds)]
 yfull_predict = np.zeros_like(yfull_true)