print missclassified

This commit is contained in:
Alejandro Moreo Fernandez 2019-06-10 14:10:57 +02:00
parent 56770446bd
commit 23e62162b5
7 changed files with 46 additions and 34 deletions

View File

@@ -59,29 +59,29 @@ def plot_attribution(path, authors, attributions, paragraph_offset=1, figsize=(5
 # plt.show()
 plt.savefig(path)
-import sys
+# import sys
-for epistola in [1]:
+# for epistola in [1]:
-if epistola == 1:
+# if epistola == 1:
-authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
+# authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
-paragraph_offset = 1
+# paragraph_offset = 1
-figsize=(3,9)
+# figsize=(3,9)
-label_offset=0.2
+# label_offset=0.2
-
+#
-else:
+# else:
-authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna',
+# authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna',
-'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
+# 'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
-'GrazioloBambaglioli', 'GuidoDaPisa',
+# 'GrazioloBambaglioli', 'GuidoDaPisa',
-'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
+# 'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
-'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet',
+# 'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet',
-'PietroAlighieri', 'RaimundusLullus',
+# 'PietroAlighieri', 'RaimundusLullus',
-'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
+# 'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
-paragraph_offset = 14
+# paragraph_offset = 14
-figsize = (6,20)
+# figsize = (6,20)
-label_offset=0.3
+# label_offset=0.3
-
+#
-attributions = np.load(f'attribution_ep{epistola}.npy')
+# attributions = np.load(f'attribution_ep{epistola}.npy')
-plot_attribution(f'plot{epistola}.png', authors, attributions, paragraph_offset=paragraph_offset, figsize=figsize, label_offset=label_offset)
+# plot_attribution(f'plot{epistola}.png', authors, attributions, paragraph_offset=paragraph_offset, figsize=figsize, label_offset=label_offset)
-sys.exit(0)
+# sys.exit(0)
 for epistola in [1]:
@@ -116,7 +116,7 @@ for epistola in [1]:
 print('=' * 80)
 target = [f'EpistolaXIII_{epistola}.txt'] + [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs]
-positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=target)
+positive, negative, _, _, ep_texts = load_texts(path, positive_author=author, unknown_target=target)
 # if len(positive) < 2:
 # discarded += 1
 # continue

View File

@@ -111,7 +111,7 @@ for epistola in [1]:
 print('=' * 80)
 target = [f'Epistola_ArigoVII.txt'] + [f'Epistola_ArigoVII_{paragraph}.txt' for paragraph in range(1,6)]
-positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
+positive, negative, _, _, ep_texts = load_texts(path, positive_author=author, unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
 n_full_docs = len(positive) + len(negative)

View File

@@ -37,7 +37,8 @@ for epistola in [1]:
 if epistola==2:
 path+='_interaEpistola'
-positive, negative, ep_text = load_texts(path, positive_author=author, unknown_target='EpistolaXIII_{}.txt'.format(epistola))
+positive, negative, pos_files, neg_files, ep_text = load_texts(path, positive_author=author, unknown_target='EpistolaXIII_{}.txt'.format(epistola))
+files = np.asarray(pos_files + neg_files)
 if len(positive) < 2:
 discarded+=1
 continue
@@ -55,6 +56,7 @@ for epistola in [1]:
 split_documents=True, split_policy=split_by_sentences, window_size=3,
 normalize_features=True)
 Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative)
 print(ytr)
@@ -64,7 +66,7 @@ for epistola in [1]:
 av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
 av.fit(Xtr,ytr,groups)
-score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=True, counters=True)
+score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, files, groups, test_lowest_index_only=True, counters=True)
 # print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std))
 f1_scores.append(f1_from_counters(tp, fp, fn, tn))
 counters.append((tp, fp, fn, tn))

View File

@@ -15,7 +15,7 @@ import os
 # TODO: sentence length (Mendenhall-style) ?
-for epistola in [1]:
+for epistola in [2]:
 print('Epistola {}'.format(epistola))
 print('='*80)
@@ -26,7 +26,7 @@ for epistola in [1]:
 paragraphs = range(14, 91)
 target = [f'EpistolaXIII_{epistola}.txt'] + [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs]
-positive, negative, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target)
+positive, negative, _, _, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target)
 pickle_file = f'../dante_color/epistola{epistola}.pkl'
 if os.path.exists(pickle_file):
@@ -35,6 +35,7 @@ for epistola in [1]:
 for prob,text in zip(probabilities,ep_texts):
 text = text.replace('\n','')
 print(f"{prob:.3f}:{text}")
+print(f'media={np.asarray(probabilities[1:]).mean()}')
 else:
 print(f'generating pickle file')
 n_full_docs = len(positive) + len(negative)
@@ -50,6 +51,7 @@ for epistola in [1]:
 split_documents=True, split_policy=split_by_sentences, window_size=3,
 normalize_features=True)
 Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative)
 print(ytr)

View File

@@ -15,7 +15,7 @@ for epistola in [1,2,3]: #3 means "both Ep1 and Ep2 corpora"
 paragraphs = range(1, 6)
 target = [f'Epistola_ArigoVII.txt'] + [f'Epistola_ArigoVII_{paragraph}.txt' for paragraph in paragraphs]
-positive, negative, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
+positive, negative, _, _, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
 pickle_file = f'../dante_color/epistola{epistola}_xiv.pkl'
 if os.path.exists(pickle_file):

View File

@@ -3,6 +3,8 @@ from os.path import join
 import re
 import collections
 # ------------------------------------------------------------------------
 # document loading routine
 # ------------------------------------------------------------------------
@@ -30,7 +32,8 @@ def remove_citations(doc):
 def load_texts(path, positive_author='Dante', unknown_target=None, train_skip_prefix='EpistolaXIII_'):
 # load the training data (all documents but Epistolas 1 and 2)
-positive,negative = [],[]
+positive, negative = [], []
+files_positive, files_negative = [], []
 authors = []
 ndocs=0
 for file in os.listdir(path):
@@ -42,8 +45,10 @@ def load_texts(path, positive_author='Dante', unknown_target=None, train_skip_pr
 if author == positive_author:
 positive.append(text)
+files_positive.append(file)
 else:
 negative.append(text)
+files_negative.append(file)
 authors.append(author)
 ndocs+=1
@@ -57,10 +62,10 @@ def load_texts(path, positive_author='Dante', unknown_target=None, train_skip_pr
 unknown = remove_citations(unknown)
 unknowns.append(unknown)
 if len(unknowns) == 1: unknowns = unknowns[0]
-return positive, negative, unknowns
+return positive, negative, files_positive, files_negative, unknowns
 else:
-return positive, negative
+return positive, negative, files_positive, files_negative
def ___list_texts(path): def ___list_texts(path):

View File

@@ -80,7 +80,7 @@ class AuthorshipVerificator:
 return self
-def leave_one_out(self, X, y, groups=None, test_lowest_index_only=True, counters=False):
+def leave_one_out(self, X, y, files, groups=None, test_lowest_index_only=True, counters=False):
 if groups is None:
 print('Computing LOO without groups')
@@ -94,7 +94,10 @@ class AuthorshipVerificator:
 folds = [(train, np.min(test, keepdims=True)) for train, test in folds]
 scores = cross_val_score(self.estimator, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
+missclassified = '\n'.join(files[scores==0].tolist())
 print(scores)
+print(missclassified)
 if counters and test_lowest_index_only:
 yfull_true = y[:len(folds)]
 yfull_predict = np.zeros_like(yfull_true)