more cleaning

This commit is contained in:
Alejandro Moreo Fernandez 2019-05-08 10:04:02 +02:00
parent 3fe91df7a9
commit e9e93ac3f0
8 changed files with 129 additions and 7 deletions

114
src/author_attribution.py Normal file
View File

@ -0,0 +1,114 @@
from sklearn.linear_model import LogisticRegression
from data.dante_loader import load_texts
from data.features import *
from model import AuthorshipVerificator, f1_from_counters
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
def plot_attribution(path, authors, attributions, paragraph_offset=1):
    """Render an authors-by-paragraphs attribution heatmap and save it to *path*.

    :param path: output file path handed to ``plt.savefig`` (extension selects format, e.g. ``.pdf``)
    :param authors: sequence of author names, one per row of *attributions*
    :param attributions: 2D array-like of shape (len(authors), 1 + n_paragraphs);
        column 0 is the full-document score, remaining columns are per-paragraph scores
    :param paragraph_offset: label of the first paragraph column (paragraphs are
        labelled paragraph_offset, paragraph_offset+1, ...)
    """
    # Column 0 holds the whole-document attribution; the rest are numbered paragraphs.
    paragraphs = ["Full"] + [f'{paragraph_offset + i}' for i in range(attributions.shape[1] - 1)]

    fig, ax = plt.subplots()
    ax.imshow(attributions)

    # Show one tick per column/row and label them with paragraph/author names.
    ax.set_xticks(np.arange(len(paragraphs)))
    ax.set_yticks(np.arange(len(authors)))
    ax.set_xticklabels(paragraphs)
    ax.set_yticklabels(authors)

    # Rotate the x labels so long paragraph labels don't overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # Annotate every cell with its attribution value.
    for i in range(len(authors)):
        for j in range(len(paragraphs)):
            ax.text(j, i, f'{attributions[i, j]:.2f}', ha="center", va="center", color="w")

    ax.set_title("Attribution matrix")
    fig.tight_layout()
    plt.savefig(path)
import sys

# ---------------------------------------------------------------------------
# NOTE(review): this debug path was left enabled in the commit — it only
# re-plots a previously saved attribution matrix and exits, so the pipeline
# below never runs. Behavior is preserved; flip PLOT_ONLY to False to
# recompute the attributions.
# ---------------------------------------------------------------------------
PLOT_ONLY = True
if PLOT_ONLY:
    authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
    attributions = np.load('attribution_ep1.npy')
    plot_attribution('plot1.pdf', authors, attributions)
    sys.exit(0)

# Rows of the final matrix: one per candidate author (paragraph scores per row).
author_attribution = []
for epistola in [1]:
    print(f'Epistola {epistola}')
    print('=' * 80)
    path = f'../testi_{epistola}'

    # Candidate authors and the paragraph numbering differ between the two epistolas.
    if epistola == 1:
        authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
        paragraphs = range(1, 3)
    else:
        authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna', 'ClaraAssisiensis',
                   'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
                   'GrazioloBambaglioli', 'GuidoDaPisa',
                   'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
                   'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet', 'PierDellaVigna',
                   'PietroAlighieri', 'RaimundusLullus',
                   'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
        paragraphs = range(13, 90)

    # Authors skipped for lack of training material (fewer than 2 positive docs).
    discarded = 0

    for i, author in enumerate(authors):
        print('=' * 80)
        print('Authorship Identification for {} (complete {}/{})'.format(author, i, len(authors)))
        print('Corpus of Epistola {}'.format(epistola))
        print('=' * 80)

        # Target texts: the full epistola first, then each individual paragraph.
        target = [f'EpistolaXIII_{epistola}.txt'] + \
                 [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs]
        positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=target)

        # Need at least two positive documents to train a verificator.
        if len(positive) < 2:
            discarded += 1
            continue

        feature_extractor = FeatureExtractor(function_words_freq='latin',
                                             conjugations_freq='latin',
                                             features_Mendenhall=True,
                                             features_sentenceLengths=True,
                                             tfidf_feat_selection_ratio=0.1,
                                             wordngrams=True, n_wordngrams=(1, 2),
                                             charngrams=True, n_charngrams=(3, 4, 5),
                                             preserve_punctuation=False,
                                             split_documents=True, split_policy=split_by_sentences,
                                             window_size=3,
                                             normalize_features=True)

        Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)

        print('Fitting the Verificator')
        av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name=author)
        av.fit(Xtr, ytr, groups)

        # Score each target text (full document + every paragraph) for this author.
        # NOTE: loop index renamed from `i` to avoid shadowing the author index above.
        attributions = []
        for t, target_text in enumerate(ep_texts):
            ep = feature_extractor.transform(target_text, avoid_splitting=True)
            prob, _ = av.predict_proba(ep, epistola_name=target[t])
            attributions.append(prob)

        author_attribution.append(attributions)

    # Persist the (authors x targets) probability matrix for later plotting.
    author_attribution = np.asarray(author_attribution)
    attribution_path = f'attribution_ep{epistola}.npy'
    print(f'saving attribution matrix of shape {author_attribution.shape} in {attribution_path}')
    np.save(attribution_path, author_attribution)

View File

@ -49,9 +49,15 @@ def load_texts(path, positive_author='Dante', unknown_target=None):
# load the test data (Epistolas 1 and 2)
if unknown_target:
unknown = open(join(path, unknown_target), encoding="utf8").read()
unknown = remove_citations(unknown)
return positive, negative, unknown
if isinstance(unknown_target, str):
unknown_target = [unknown_target]
unknowns = []
for unknown_text in unknown_target:
unknown = open(join(path, unknown_text), encoding="utf8").read()
unknown = remove_citations(unknown)
unknowns.append(unknown)
if len(unknowns) == 1: unknowns = unknowns[0]
return positive, negative, unknowns
else:
return positive, negative

View File

@ -387,12 +387,12 @@ class FeatureExtractor:
return X, y, groups
def transform(self, test, return_fragments=False, window_size=-1):
def transform(self, test, return_fragments=False, window_size=-1, avoid_splitting=False):
test = [test]
if window_size==-1:
window_size = self.window_size
if self.split_documents:
if self.split_documents and not avoid_splitting:
tests, _ = splitter(test, split_policy=self.split_policy, window_size=window_size)
test.extend(tests)

View File

@ -39,9 +39,11 @@ class AuthorshipVerificator:
def __init__(self, nfolds=10,
params = {'C': np.logspace(-4,+4,9), 'class_weight':['balanced',None]},
estimator=SVC):
estimator=SVC,
author_name=None):
self.nfolds = nfolds
self.params = params
self.author_name = author_name if author_name else 'this author'
if estimator is SVC:
self.params['kernel'] = ['linear', 'rbf']
self.probability = True
@ -117,7 +119,7 @@ class AuthorshipVerificator:
assert self.probability, 'svm is not calibrated'
pred = self.estimator.predict_proba(test)
full_doc_prediction = pred[0,1]
print('{} is from the same author: {}'.format(epistola_name, full_doc_prediction))
print(f'{epistola_name} is from {self.author_name} with Probability {full_doc_prediction:.3f}')
if len(pred) > 1:
fragment_predictions = pred[1:,1]
print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))

Binary file not shown.

Binary file not shown.