Alejandro Moreo Fernandez 2019-06-04 09:05:25 +02:00
parent 13d9a5ed57
commit 67adaa441c
5 changed files with 393 additions and 25 deletions

src/author_attribution.py (95 changed lines; Normal file → Executable file)

@@ -1,3 +1,4 @@
+from mpl_toolkits.axes_grid1 import make_axes_locatable
 from sklearn.linear_model import LogisticRegression
 from data.dante_loader import load_texts
 from data.features import *
@@ -6,60 +7,104 @@ import numpy as np
 import matplotlib
 import matplotlib.pyplot as plt
-def plot_attribution(path, authors, attributions, paragraph_offset=1):
+def plot_attribution(path, authors, attributions, paragraph_offset=1, figsize=(5,5), label_offset=0.3):
-    paragraphs = ["Full"] + [f'{paragraph_offset+i}' for i in range(attributions.shape[1]-1)]
+    attributions = attributions.T
+    print(attributions.shape)
+    # attributions=attributions>0.5
+    paragraphs = ["Full"] + [f'{paragraph_offset+i}' for i in range(attributions.shape[0]-1)]
-    fig, ax = plt.subplots()
-    im = ax.imshow(attributions)
+    fig, ax = plt.subplots(figsize=figsize)
+    # im = ax.imshow(attributions, vmin=0, vmax=1, cmap='Greens')
+    im = ax.imshow(attributions, vmin=0, vmax=1, cmap='Greys')
     # Create colorbar
     # cbar = fig.colorbar(im, ax=ax, orientation="horizontal", fraction=0.1, pad=0.04)
     # ax.figure.colorbar(im, ax=ax, orientation="horizontal", fraction=0.1)
     # ax.figure.colorbar(im, ax=ax, orientation="horizontal", pad=0.05)
     # cbar.ax.set_ylabel(cbarlabel, rotation=-90, va="bottom")
     # We want to show all ticks...
-    ax.set_xticks(np.arange(len(paragraphs)))
-    ax.set_yticks(np.arange(len(authors)))
+    # ax.set_xticks(np.arange(len(authors)))
+    ax.set_xticks(np.arange(len(authors) + 0) + label_offset)
+    ax.set_yticks(np.arange(len(paragraphs)))
     # ... and label them with the respective list entries
-    ax.set_xticklabels(paragraphs)
-    ax.set_yticklabels(authors)
+    ax.set_xticklabels(authors)
+    ax.set_yticklabels(paragraphs)
     ax.tick_params(top=False, bottom=False,
                    labeltop=True, labelbottom=False)
     # Rotate the tick labels and set their alignment.
-    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
-             rotation_mode="anchor")
+    plt.setp(ax.get_xticklabels(), rotation=90, ha="left", rotation_mode="anchor")
     for edge, spine in ax.spines.items():
         spine.set_visible(False)
     ax.set_xticks(np.arange(len(authors)+1) - .5, minor=True)
     ax.set_yticks(np.arange(len(paragraphs)+1) - .5, minor=True)
     ax.grid(which="minor", color="k", linestyle='-', linewidth=1)
     ax.tick_params(which="minor", bottom=False, left=False)
     # Loop over data dimensions and create text annotations.
-    for i in range(len(authors)):
-        for j in range(len(paragraphs)):
-            text = ax.text(j, i, f'{attributions[i, j]:.2f}', ha="center", va="center", color="w")
+    # for i in range(len(authors)):
+    #     for j in range(len(paragraphs)):
+    #         text = ax.text(j, i, f'{attributions[i, j]:.2f}', ha="center", va="center", color="w")
-    ax.set_title("Attribution matrix")
+    # ax.set_title("Attribution matrix")
     fig.tight_layout()
     # plt.show()
     plt.savefig(path)
 import sys
-authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
-attributions = np.load('attribution_ep1.npy')
-plot_attribution('plot1.pdf', authors, attributions)
+for epistola in [1]:
+    if epistola == 1:
+        authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
+        paragraph_offset = 1
+        figsize=(3,9)
+        label_offset=0.2
+    else:
+        authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna',
+                   'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
+                   'GrazioloBambaglioli', 'GuidoDaPisa',
+                   'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
+                   'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet',
+                   'PietroAlighieri', 'RaimundusLullus',
+                   'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
+        paragraph_offset = 14
+        figsize = (6,20)
+        label_offset=0.3
+    attributions = np.load(f'attribution_ep{epistola}.npy')
+    plot_attribution(f'plot{epistola}.png', authors, attributions, paragraph_offset=paragraph_offset, figsize=figsize, label_offset=label_offset)
+sys.exit(0)
 author_attribution = []
 for epistola in [1]:
     author_attribution = []
     print(f'Epistola {epistola}')
     print('='*80)
     path = f'../testi_{epistola}'
     if epistola == 1:
         authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
-        paragraphs = range(1,3)
+        paragraphs = range(1,14)
     else:
-        authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna', 'ClaraAssisiensis',
+        authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna',
                    'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
                    'GrazioloBambaglioli', 'GuidoDaPisa',
                    'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
-                   'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet', 'PierDellaVigna',
+                   'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet',
                    'PietroAlighieri', 'RaimundusLullus',
                    'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
-        paragraphs = range(13, 90)
+        paragraphs = range(14, 91)
+        assert len(authors)==20, f'unexpected number of authors ({len(authors)})'
         path+='_tutti'
     discarded = 0
     f1_scores = []
@@ -72,9 +117,9 @@ for epistola in [1]:
         target = [f'EpistolaXIII_{epistola}.txt'] + [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs]
         positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=target)
-        if len(positive) < 2:
-            discarded += 1
-            continue
+        # if len(positive) < 2:
+        #     discarded += 1
+        #     continue
         n_full_docs = len(positive) + len(negative)
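Note on the orientation assumed by plot_attribution above: the .npy matrix is stacked one row per candidate author, with column 0 holding the probability for the full letter and the remaining columns one probability per paragraph; the function transposes it so that paragraphs run down the rows and authors across the columns. A minimal self-contained sketch of that layout, with random numbers standing in for attribution_ep1.npy (the file names, shapes, and 13-paragraph count below are illustrative assumptions, not part of the commit):

import numpy as np
import matplotlib.pyplot as plt

authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
n_paragraphs = 13  # hypothetical paragraph count; plus one column for the full letter

# dummy stand-in for np.load('attribution_ep1.npy'):
# one row per author, column 0 = full document, columns 1.. = per-paragraph probabilities
rng = np.random.default_rng(0)
attributions = rng.random((len(authors), n_paragraphs + 1))

grid = attributions.T  # rows = ["Full", 1, 2, ...], columns = authors
fig, ax = plt.subplots(figsize=(3, 9))
ax.imshow(grid, vmin=0, vmax=1, cmap='Greys')
ax.set_xticks(np.arange(len(authors)) + 0.2)  # the label_offset trick from above
ax.set_xticklabels(authors)
ax.set_yticks(np.arange(grid.shape[0]))
ax.set_yticklabels(["Full"] + [str(1 + i) for i in range(grid.shape[0] - 1)])
ax.tick_params(top=False, bottom=False, labeltop=True, labelbottom=False)
plt.setp(ax.get_xticklabels(), rotation=90, ha="left", rotation_mode="anchor")
fig.tight_layout()
fig.savefig('plot1_sketch.png')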

src/author_attribution_XIV.py (152 added lines; new Executable file)

@@ -0,0 +1,152 @@
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.linear_model import LogisticRegression
from data.dante_loader import load_texts
from data.features import *
from model import AuthorshipVerificator, f1_from_counters
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
def plot_attribution(path, authors, attributions, paragraph_offset=1, figsize=(5,5), label_offset=0.3):
    attributions = attributions.T
    print(attributions.shape)
    # attributions=attributions>0.5
    paragraphs = ["Full"] + [f'{paragraph_offset+i}' for i in range(attributions.shape[0]-1)]
    fig, ax = plt.subplots(figsize=figsize)
    # im = ax.imshow(attributions, vmin=0, vmax=1, cmap='Greens')
    im = ax.imshow(attributions, vmin=0, vmax=1, cmap='Greys')
    # Create colorbar
    # cbar = fig.colorbar(im, ax=ax, orientation="horizontal", fraction=0.1, pad=0.04)
    # ax.figure.colorbar(im, ax=ax, orientation="horizontal", fraction=0.1)
    # ax.figure.colorbar(im, ax=ax, orientation="horizontal", pad=0.05)
    # cbar.ax.set_ylabel(cbarlabel, rotation=-90, va="bottom")
    # We want to show all ticks...
    # ax.set_xticks(np.arange(len(authors)))
    ax.set_xticks(np.arange(len(authors) + 0) + label_offset)
    ax.set_yticks(np.arange(len(paragraphs)))
    # ... and label them with the respective list entries
    ax.set_xticklabels(authors)
    ax.set_yticklabels(paragraphs)
    ax.tick_params(top=False, bottom=False,
                   labeltop=True, labelbottom=False)
    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=90, ha="left", rotation_mode="anchor")
    for edge, spine in ax.spines.items():
        spine.set_visible(False)
    ax.set_xticks(np.arange(len(authors)+1) - .5, minor=True)
    ax.set_yticks(np.arange(len(paragraphs)+1) - .5, minor=True)
    ax.grid(which="minor", color="k", linestyle='-', linewidth=1)
    ax.tick_params(which="minor", bottom=False, left=False)
    # Loop over data dimensions and create text annotations.
    # for i in range(len(authors)):
    #     for j in range(len(paragraphs)):
    #         text = ax.text(j, i, f'{attributions[i, j]:.2f}', ha="center", va="center", color="w")
    # ax.set_title("Attribution matrix")
    fig.tight_layout()
    # plt.show()
    plt.savefig(path)
import sys
authors1 = ['ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
authors2 = ['BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna',
            'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
            'GrazioloBambaglioli', 'GuidoDaPisa',
            'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
            'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet',
            'PietroAlighieri', 'RaimundusLullus',
            'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
authors3 = sorted(np.unique(authors1 + authors2).tolist())
for epistola in [1]:
    paragraph_offset = 1
    label_offset = 0.2
    if epistola == 1:
        authors = ['Dante'] + authors1
        figsize = (4, 4)
    elif epistola == 2:
        authors = ['Dante'] + authors2
        figsize = (6, 4)
    else:
        authors = ['Dante'] + authors3
    attributions = np.load(f'attribution_ep{epistola}_xiv.npy')
    plot_attribution(f'plot{epistola}_xiv.png', authors, attributions, paragraph_offset=paragraph_offset, figsize=figsize, label_offset=label_offset)
sys.exit(0)
for epistola in [1]:
    author_attribution = []
    print(f'Epistola {epistola}')
    print('='*80)
    path = f'../testiXIV_{epistola}'
    if epistola == 1:
        authors = ['Dante'] + authors1
    elif epistola == 2:
        authors = ['Dante'] + authors2
        path += '_tutti'
    else:
        authors = ['Dante'] + authors3
    discarded = 0
    f1_scores = []
    counters = []
    for i, author in enumerate(authors):
        print('=' * 80)
        print('Authorship Identification for {} (complete {}/{})'.format(author, i, len(authors)))
        print('Corpus of Epistola {}'.format(epistola))
        print('=' * 80)
        target = [f'Epistola_ArigoVII.txt'] + [f'Epistola_ArigoVII_{paragraph}.txt' for paragraph in range(1,6)]
        positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
        n_full_docs = len(positive) + len(negative)
        feature_extractor = FeatureExtractor(function_words_freq='latin',
                                             conjugations_freq='latin',
                                             features_Mendenhall=True,
                                             features_sentenceLengths=True,
                                             tfidf_feat_selection_ratio=0.1,
                                             wordngrams=True, n_wordngrams=(1, 2),
                                             charngrams=True, n_charngrams=(3, 4, 5),
                                             preserve_punctuation=False,
                                             split_documents=True, split_policy=split_by_sentences, window_size=3,
                                             normalize_features=True)
        Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
        print('Fitting the Verificator')
        av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name=author)
        av.fit(Xtr, ytr, groups)
        attributions=[]
        for i,target_text in enumerate(ep_texts):
            ep = feature_extractor.transform(target_text, avoid_splitting=True)
            prob,_ = av.predict_proba(ep, epistola_name=target[i])
            attributions.append(prob)
        author_attribution.append(attributions)
    author_attribution = np.asarray(author_attribution)
    attribution_path = f'attribution_ep{epistola}_xiv.npy'
    print(f'saving attribution matrix of shape {author_attribution.shape} in {attribution_path}')
    np.save(attribution_path, author_attribution)
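The loop above builds the attribution matrix row by row: each candidate author gets a verificator fitted on their corpus, and the probabilities for the full Epistola_ArigoVII plus its five paragraphs form one row, stacked and saved with np.save. A sketch of that stacking pattern, with a plain scikit-learn LogisticRegression on random features standing in for this repo's FeatureExtractor and AuthorshipVerificator (all names, shapes, and data below are illustrative):

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio']
targets = ['full'] + [f'paragraph_{p}' for p in range(1, 6)]

rows = []
for author in authors:
    Xtr = rng.random((40, 10))        # stand-in for the per-author training features
    ytr = np.tile([0, 1], 20)         # 1 = positive author, 0 = negative examples
    clf = LogisticRegression().fit(Xtr, ytr)
    Xte = rng.random((len(targets), 10))        # full letter + each paragraph
    rows.append(clf.predict_proba(Xte)[:, 1])   # P(author) for each target

author_attribution = np.asarray(rows)  # shape: (n_authors, n_targets)
np.save('attribution_sketch.npy', author_attribution)
print(author_attribution.shape)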

src/author_verification_XIV.py (77 added lines; new Executable file)

@@ -0,0 +1,77 @@
from sklearn.linear_model import LogisticRegression
from data.dante_loader import load_texts
from data.features import *
from model import AuthorshipVerificator, f1_from_counters
from sklearn.svm import LinearSVC, SVC
from util.color_visualization import color
import pickle
import os
for epistola in [1,2,3]:  # 3 means "both Ep1 and Ep2 corpora"
    print('Epistola {}'.format(epistola))
    print('='*80)
    path = '../testiXIV_{}'.format(epistola)
    paragraphs = range(1, 6)
    if epistola==2:
        path+='_tutti'
    target = [f'Epistola_ArigoVII.txt'] + [f'Epistola_ArigoVII_{paragraph}.txt' for paragraph in paragraphs]
    positive, negative, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
    pickle_file = f'../dante_color/epistola{epistola}_xiv.pkl'
    if os.path.exists(pickle_file):
        print(f'loading pickle file {pickle_file}')
        probabilities = pickle.load(open(pickle_file, 'rb'))
    else:
        print(f'generating pickle file')
        n_full_docs = len(positive) + len(negative)
        feature_extractor = FeatureExtractor(function_words_freq='latin',
                                             conjugations_freq='latin',
                                             features_Mendenhall=True,
                                             features_sentenceLengths=True,
                                             tfidf_feat_selection_ratio=0.1,
                                             wordngrams=True, n_wordngrams=(1, 2),
                                             charngrams=True, n_charngrams=(3, 4, 5),
                                             preserve_punctuation=False,
                                             split_documents=True, split_policy=split_by_sentences, window_size=3,
                                             normalize_features=True)
        Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative)
        print(ytr)
        print('Fitting the Verificator')
        av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name='Dante')
        av.fit(Xtr,ytr,groups)
        probabilities = []
        for i, target_text in enumerate(ep_texts):
            ep = feature_extractor.transform(target_text, avoid_splitting=True)
            prob, _ = av.predict_proba(ep, epistola_name=target[i])
            probabilities.append(prob)
        pickle.dump(probabilities, open(pickle_file, 'wb'), pickle.HIGHEST_PROTOCOL)
    color(path=f'../dante_color/epistola{epistola}_xiv.html', texts=ep_texts,
          probabilities=probabilities, title=f'Epistola {epistola}',
          paragraph_offset=paragraphs[0])
    # print('Predicting the Epistola {}'.format(epistola))
    # title = 'Epistola {}'.format('I' if epistola==1 else 'II')
    # av.predict(ep, title)
    # fulldoc_prob, fragment_probs = av.predict_proba(ep, title)
    # color(path='../dante_color/epistola{}.html'.format(epistola), texts=ep_fragments, probabilities=fragment_probs, title=title)
    # score_ave, score_std = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=False)
    # print('LOO[full-and-fragments]={:.3f} +-{:.5f}'.format(score_ave, score_std))
    # score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=True, counters=True)
    # print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std))
    # f1_ = f1_from_counters(tp, fp, fn, tn)
    # print('F1 = {:.3f}'.format(f1_))
    # score_ave, score_std = av.leave_one_out(Xtr, ytr, None)
    # print('LOO[w/o groups]={:.3f} +-{:.5f}'.format(score_ave, score_std))
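The pickle block above is a compute-once cache: the probabilities are recomputed only when no pickle file exists, and reloaded otherwise. The same pattern as a small reusable helper (the cached() function and the /tmp path are hypothetical, not part of this repo); note that the context managers also close the file handles, which the inline open() calls above leave to the garbage collector:

import os
import pickle

def cached(pickle_file, compute):
    # Load a previously pickled result if present; otherwise compute and cache it.
    if os.path.exists(pickle_file):
        with open(pickle_file, 'rb') as f:
            return pickle.load(f)
    result = compute()
    with open(pickle_file, 'wb') as f:
        pickle.dump(result, f, pickle.HIGHEST_PROTOCOL)
    return result

# usage: expensive work goes in the callable; the second run hits the cache
probabilities = cached('/tmp/probs_sketch.pkl', lambda: [0.9, 0.4, 0.7])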

src/util/colormap.py (35 added lines; new Executable file)

@@ -0,0 +1,35 @@
import numpy as np
import matplotlib.pyplot as plt
# Have colormaps separated into categories:
# http://matplotlib.org/examples/color/colormaps_reference.html
cmaps = [('Diverging', ['RdYlGn','RdYlGn']),]
nrows = max(len(cmap_list) for cmap_category, cmap_list in cmaps)
gradient = np.linspace(0.25, 0.75, 256)
gradient = np.vstack((gradient, gradient))
def plot_color_gradients(cmap_category, cmap_list, nrows):
    fig, axes = plt.subplots(nrows=nrows)
    fig.subplots_adjust(top=0.95, bottom=0.01, left=0.2, right=0.99)
    axes[0].set_title(cmap_category + ' colormaps', fontsize=14)
    for ax, name in zip(axes, cmap_list):
        ax.imshow(gradient, aspect='auto', cmap=plt.get_cmap(name))
        pos = list(ax.get_position().bounds)
        x_text = pos[0] - 0.01
        y_text = pos[1] + pos[3]/2.
        fig.text(x_text, y_text, name, va='center', ha='right', fontsize=10)
    # Turn off *all* ticks & spines, not just the ones with colormaps.
    for ax in axes:
        ax.set_axis_off()
for cmap_category, cmap_list in cmaps:
    plot_color_gradients(cmap_category, cmap_list, nrows)
plt.show()
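plt.get_cmap(name) is the classic lookup used above; matplotlib 3.5 and later also exposes the same named colormaps through the matplotlib.colormaps registry. A one-colormap version of the same gradient preview under that assumption:

import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# Preview a single named colormap over the same truncated 0.25-0.75 gradient.
gradient = np.vstack([np.linspace(0.25, 0.75, 256)] * 2)
fig, ax = plt.subplots(figsize=(5, 0.5))
ax.imshow(gradient, aspect='auto', cmap=matplotlib.colormaps['RdYlGn'])
ax.set_axis_off()
plt.show()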

src/weight_inspection.py (59 added lines; new Executable file)

@@ -0,0 +1,59 @@
from sklearn.linear_model import LogisticRegression
from data.dante_loader import load_texts
from data.features import *
from model import AuthorshipVerificator, f1_from_counters
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
for epistola in [2]:
    author_attribution = []
    print(f'Epistola {epistola}')
    print('='*80)
    path = f'../testi_{epistola}'
    if epistola==2: path+='_tutti'
    author = 'Dante'
    print('=' * 80)
    print('Corpus of Epistola {}'.format(epistola))
    print('=' * 80)
    positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=f'EpistolaXIII_{epistola}.txt')
    n_full_docs = len(positive) + len(negative)
    feature_extractor = FeatureExtractor(function_words_freq='latin',
                                         conjugations_freq='latin',
                                         features_Mendenhall=True,
                                         features_sentenceLengths=True,
                                         tfidf_feat_selection_ratio=0.1,
                                         wordngrams=True, n_wordngrams=(1, 2),
                                         charngrams=True, n_charngrams=(3, 4, 5),
                                         preserve_punctuation=False,
                                         split_documents=True, split_policy=split_by_sentences, window_size=3,
                                         normalize_features=True)
    Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
    print('Fitting the Verificator')
    av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name=author)
    av.fit(Xtr, ytr, groups)
    feat_rank = np.argsort(av.estimator.coef_[0])
    coef_ordered = av.estimator.coef_[0][feat_rank]
    feat_name_ordered = feature_extractor.feature_names[feat_rank]
    print('Most Dantesque features::')
    for i in range(100):
        print(f'{i}: {feat_name_ordered[::-1][i]} {coef_ordered[::-1][i]:.3f}')
    print('\nMost Non-Dantesque features::')
    for i in range(100):
        print(f'{i}: {feat_name_ordered[i]} {coef_ordered[i]:.3f}')
print('done')
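The ranking above sorts the fitted LogisticRegression weights with np.argsort and reads off both ends of the ordering: the most positive coefficients push a fragment toward the positive author, the most negative away from them. A self-contained sketch of the same inspection on toy documents, with CountVectorizer standing in for this repo's FeatureExtractor (all data below is made up):

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

docs = ['pax vobiscum semper', 'gloria in excelsis', 'semper fidelis pax',
        'carpe diem hodie', 'diem perdidi hodie', 'carpe noctem diem']
y = np.array([1, 1, 1, 0, 0, 0])  # 1 = positive author

vec = CountVectorizer()
X = vec.fit_transform(docs)
clf = LogisticRegression().fit(X, y)

names = np.asarray(vec.get_feature_names_out())
rank = np.argsort(clf.coef_[0])  # ascending: most negative weight first
for name, w in zip(names[rank][::-1][:3], clf.coef_[0][rank][::-1][:3]):
    print(f'most positive: {name} {w:+.3f}')
for name, w in zip(names[rank][:3], clf.coef_[0][rank][:3]):
    print(f'most negative: {name} {w:+.3f}')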