From 23e62162b561eae71aaf4a93d795f9a2a93f1df4 Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Mon, 10 Jun 2019 14:10:57 +0200 Subject: [PATCH] print missclassified --- src/author_attribution.py | 48 +++++++++++++++++----------------- src/author_attribution_XIV.py | 2 +- src/author_identification.py | 6 +++-- src/author_verification.py | 6 +++-- src/author_verification_XIV.py | 2 +- src/data/dante_loader.py | 11 +++++--- src/model.py | 5 +++- 7 files changed, 46 insertions(+), 34 deletions(-) diff --git a/src/author_attribution.py b/src/author_attribution.py index 75c5633..7d168d3 100755 --- a/src/author_attribution.py +++ b/src/author_attribution.py @@ -59,29 +59,29 @@ def plot_attribution(path, authors, attributions, paragraph_offset=1, figsize=(5 # plt.show() plt.savefig(path) -import sys -for epistola in [1]: - if epistola == 1: - authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna'] - paragraph_offset = 1 - figsize=(3,9) - label_offset=0.2 - - else: - authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna', - 'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio', - 'GrazioloBambaglioli', 'GuidoDaPisa', - 'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia', - 'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet', - 'PietroAlighieri', 'RaimundusLullus', - 'RyccardusDeSanctoGermano', 'ZonoDeMagnalis'] - paragraph_offset = 14 - figsize = (6,20) - label_offset=0.3 - - attributions = np.load(f'attribution_ep{epistola}.npy') - plot_attribution(f'plot{epistola}.png', authors, attributions, paragraph_offset=paragraph_offset, figsize=figsize, label_offset=label_offset) -sys.exit(0) +# import sys +# for epistola in [1]: +# if epistola == 1: +# authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna'] +# paragraph_offset = 1 +# figsize=(3,9) +# label_offset=0.2 +# +# else: +# authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna', +# 'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio', +# 'GrazioloBambaglioli', 'GuidoDaPisa', +# 'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia', +# 'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet', +# 'PietroAlighieri', 'RaimundusLullus', +# 'RyccardusDeSanctoGermano', 'ZonoDeMagnalis'] +# paragraph_offset = 14 +# figsize = (6,20) +# label_offset=0.3 +# +# attributions = np.load(f'attribution_ep{epistola}.npy') +# plot_attribution(f'plot{epistola}.png', authors, attributions, paragraph_offset=paragraph_offset, figsize=figsize, label_offset=label_offset) +# sys.exit(0) for epistola in [1]: @@ -116,7 +116,7 @@ for epistola in [1]: print('=' * 80) target = [f'EpistolaXIII_{epistola}.txt'] + [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs] - positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=target) + positive, negative, _, _, ep_texts = load_texts(path, positive_author=author, unknown_target=target) # if len(positive) < 2: # discarded += 1 # continue diff --git a/src/author_attribution_XIV.py b/src/author_attribution_XIV.py index 6d810a0..2d5b75b 100755 --- a/src/author_attribution_XIV.py +++ b/src/author_attribution_XIV.py @@ -111,7 +111,7 @@ for epistola in [1]: print('=' * 80) target = [f'Epistola_ArigoVII.txt'] + [f'Epistola_ArigoVII_{paragraph}.txt' for paragraph in range(1,6)] - positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=target, train_skip_prefix='Epistola_ArigoVII') + positive, negative, _, _, ep_texts = load_texts(path, positive_author=author, unknown_target=target, train_skip_prefix='Epistola_ArigoVII') n_full_docs = len(positive) + len(negative) diff --git a/src/author_identification.py b/src/author_identification.py index 2ec0833..1cbe4ca 100755 --- a/src/author_identification.py +++ b/src/author_identification.py @@ -37,7 +37,8 @@ for epistola in [1]: if epistola==2: path+='_interaEpistola' - positive, negative, ep_text = load_texts(path, positive_author=author, unknown_target='EpistolaXIII_{}.txt'.format(epistola)) + positive, negative, pos_files, neg_files, ep_text = load_texts(path, positive_author=author, unknown_target='EpistolaXIII_{}.txt'.format(epistola)) + files = np.asarray(pos_files + neg_files) if len(positive) < 2: discarded+=1 continue @@ -55,6 +56,7 @@ for epistola in [1]: split_documents=True, split_policy=split_by_sentences, window_size=3, normalize_features=True) + Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative) print(ytr) @@ -64,7 +66,7 @@ for epistola in [1]: av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression) av.fit(Xtr,ytr,groups) - score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=True, counters=True) + score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, files, groups, test_lowest_index_only=True, counters=True) # print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std)) f1_scores.append(f1_from_counters(tp, fp, fn, tn)) counters.append((tp, fp, fn, tn)) diff --git a/src/author_verification.py b/src/author_verification.py index 9a16258..8daa5fb 100755 --- a/src/author_verification.py +++ b/src/author_verification.py @@ -15,7 +15,7 @@ import os # TODO: sentence length (Mendenhall-style) ? -for epistola in [1]: +for epistola in [2]: print('Epistola {}'.format(epistola)) print('='*80) @@ -26,7 +26,7 @@ for epistola in [1]: paragraphs = range(14, 91) target = [f'EpistolaXIII_{epistola}.txt'] + [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs] - positive, negative, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target) + positive, negative, _, _, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target) pickle_file = f'../dante_color/epistola{epistola}.pkl' if os.path.exists(pickle_file): @@ -35,6 +35,7 @@ for epistola in [1]: for prob,text in zip(probabilities,ep_texts): text = text.replace('\n','') print(f"{prob:.3f}:{text}") + print(f'media={np.asarray(probabilities[1:]).mean()}') else: print(f'generating pickle file') n_full_docs = len(positive) + len(negative) @@ -50,6 +51,7 @@ for epistola in [1]: split_documents=True, split_policy=split_by_sentences, window_size=3, normalize_features=True) + Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative) print(ytr) diff --git a/src/author_verification_XIV.py b/src/author_verification_XIV.py index 9813f5d..a3754bc 100755 --- a/src/author_verification_XIV.py +++ b/src/author_verification_XIV.py @@ -15,7 +15,7 @@ for epistola in [1,2,3]: #3 means "both Ep1 and Ep2 corpora" paragraphs = range(1, 6) target = [f'Epistola_ArigoVII.txt'] + [f'Epistola_ArigoVII_{paragraph}.txt' for paragraph in paragraphs] - positive, negative, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target, train_skip_prefix='Epistola_ArigoVII') + positive, negative, _, _, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target, train_skip_prefix='Epistola_ArigoVII') pickle_file = f'../dante_color/epistola{epistola}_xiv.pkl' if os.path.exists(pickle_file): diff --git a/src/data/dante_loader.py b/src/data/dante_loader.py index ccc2a5e..535ec34 100755 --- a/src/data/dante_loader.py +++ b/src/data/dante_loader.py @@ -3,6 +3,8 @@ from os.path import join import re import collections + + # ------------------------------------------------------------------------ # document loading routine # ------------------------------------------------------------------------ @@ -30,7 +32,8 @@ def remove_citations(doc): def load_texts(path, positive_author='Dante', unknown_target=None, train_skip_prefix='EpistolaXIII_'): # load the training data (all documents but Epistolas 1 and 2) - positive,negative = [],[] + positive, negative = [], [] + files_positive, files_negative = [], [] authors = [] ndocs=0 for file in os.listdir(path): @@ -42,8 +45,10 @@ def load_texts(path, positive_author='Dante', unknown_target=None, train_skip_pr if author == positive_author: positive.append(text) + files_positive.append(file) else: negative.append(text) + files_negative.append(file) authors.append(author) ndocs+=1 @@ -57,10 +62,10 @@ def load_texts(path, positive_author='Dante', unknown_target=None, train_skip_pr unknown = remove_citations(unknown) unknowns.append(unknown) if len(unknowns) == 1: unknowns = unknowns[0] - return positive, negative, unknowns + return positive, negative, files_positive, files_negative, unknowns else: - return positive, negative + return positive, negative, files_positive, files_negative def ___list_texts(path): diff --git a/src/model.py b/src/model.py index fd22d96..c5d28f9 100755 --- a/src/model.py +++ b/src/model.py @@ -80,7 +80,7 @@ class AuthorshipVerificator: return self - def leave_one_out(self, X, y, groups=None, test_lowest_index_only=True, counters=False): + def leave_one_out(self, X, y, files, groups=None, test_lowest_index_only=True, counters=False): if groups is None: print('Computing LOO without groups') @@ -94,7 +94,10 @@ class AuthorshipVerificator: folds = [(train, np.min(test, keepdims=True)) for train, test in folds] scores = cross_val_score(self.estimator, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1) + missclassified = '\n'.join(files[scores==0].tolist()) print(scores) + print(missclassified) + if counters and test_lowest_index_only: yfull_true = y[:len(folds)] yfull_predict = np.zeros_like(yfull_true)