more cleaning
parent 3fe91df7a9
commit e9e93ac3f0
@@ -0,0 +1,114 @@
+from sklearn.linear_model import LogisticRegression
+from data.dante_loader import load_texts
+from data.features import *
+from model import AuthorshipVerificator, f1_from_counters
+import numpy as np
+import matplotlib
+import matplotlib.pyplot as plt
+
+
+def plot_attribution(path, authors, attributions, paragraph_offset=1):
+    # column 0 scores the full epistle; the remaining columns score individual paragraphs
+    paragraphs = ["Full"] + [f'{paragraph_offset+i}' for i in range(attributions.shape[1]-1)]
+
+    fig, ax = plt.subplots()
+    im = ax.imshow(attributions)
+
+    # We want to show all ticks...
+    ax.set_xticks(np.arange(len(paragraphs)))
+    ax.set_yticks(np.arange(len(authors)))
+    # ... and label them with the respective list entries
+    ax.set_xticklabels(paragraphs)
+    ax.set_yticklabels(authors)
+
+    # Rotate the tick labels and set their alignment.
+    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
+
+    # Loop over data dimensions and create text annotations.
+    for i in range(len(authors)):
+        for j in range(len(paragraphs)):
+            ax.text(j, i, f'{attributions[i, j]:.2f}', ha="center", va="center", color="w")
+
+    ax.set_title("Attribution matrix")
+    fig.tight_layout()
+    # plt.show()
+    plt.savefig(path)
+
+
+# plot a previously computed attribution matrix for Epistola 1, then exit
+# (the attribution-computing loop below is skipped by the early sys.exit)
+import sys
+authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
+attributions = np.load('attribution_ep1.npy')
+plot_attribution('plot1.pdf', authors, attributions)
+sys.exit(0)
+
+author_attribution = []
+for epistola in [1]:
+
+    print(f'Epistola {epistola}')
+    print('='*80)
+    path = f'../testi_{epistola}'
+
+    if epistola == 1:
+        authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
+        paragraphs = range(1, 3)
+    else:
+        authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna', 'ClaraAssisiensis',
+                   'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
+                   'GrazioloBambaglioli', 'GuidoDaPisa',
+                   'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
+                   'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet', 'PierDellaVigna',
+                   'PietroAlighieri', 'RaimundusLullus',
+                   'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
+        paragraphs = range(13, 90)
+
+    discarded = 0
+    f1_scores = []
+    counters = []
+    for i, author in enumerate(authors):
+        print('=' * 80)
+        print('Authorship Identification for {} (complete {}/{})'.format(author, i, len(authors)))
+        print('Corpus of Epistola {}'.format(epistola))
+        print('=' * 80)
+
+        target = [f'EpistolaXIII_{epistola}.txt'] + [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs]
+        positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=target)
+        if len(positive) < 2:  # too few positive documents to train a verifier for this author
+            discarded += 1
+            continue
+
+        n_full_docs = len(positive) + len(negative)
+
+        feature_extractor = FeatureExtractor(function_words_freq='latin',
+                                             conjugations_freq='latin',
+                                             features_Mendenhall=True,
+                                             features_sentenceLengths=True,
+                                             tfidf_feat_selection_ratio=0.1,
+                                             wordngrams=True, n_wordngrams=(1, 2),
+                                             charngrams=True, n_charngrams=(3, 4, 5),
+                                             preserve_punctuation=False,
+                                             split_documents=True, split_policy=split_by_sentences, window_size=3,
+                                             normalize_features=True)
+
+        Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
+
+        print('Fitting the Verificator')
+        av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name=author)
+        av.fit(Xtr, ytr, groups)
+
+        attributions = []
+        for j, target_text in enumerate(ep_texts):  # j, not i: the author index is still in use
+            ep = feature_extractor.transform(target_text, avoid_splitting=True)
+            prob, _ = av.predict_proba(ep, epistola_name=target[j])
+            attributions.append(prob)
+        author_attribution.append(attributions)
+
+    author_attribution = np.asarray(author_attribution)
+    attribution_path = f'attribution_ep{epistola}.npy'
+    print(f'saving attribution matrix of shape {author_attribution.shape} in {attribution_path}')
+    np.save(attribution_path, author_attribution)
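
For orientation, a minimal sketch of how the new plot_attribution helper can be exercised on synthetic data, assuming the function is in scope. The matrix shape follows the convention used above (one row per candidate author; column 0 scores the full epistle, the remaining columns score individual paragraphs); the output file name and author subset are illustrative:

    import numpy as np

    # hypothetical attribution matrix: 3 authors x (1 full-document + 4 paragraph) scores in [0, 1]
    rng = np.random.default_rng(0)
    demo_attributions = rng.uniform(size=(3, 5))
    demo_authors = ['Dante', 'GuidoFaba', 'PierDellaVigna']

    # paragraph_offset shifts the column labels, e.g. 13 for a text whose paragraphs start at 13
    plot_attribution('demo_plot.pdf', demo_authors, demo_attributions, paragraph_offset=1)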
@@ -49,9 +49,15 @@ def load_texts(path, positive_author='Dante', unknown_target=None):
     # load the test data (Epistolas 1 and 2)
     if unknown_target:
-        unknown = open(join(path, unknown_target), encoding="utf8").read()
-        unknown = remove_citations(unknown)
-        return positive, negative, unknown
+        if isinstance(unknown_target, str):
+            unknown_target = [unknown_target]
+        unknowns = []
+        for unknown_text in unknown_target:
+            unknown = open(join(path, unknown_text), encoding="utf8").read()
+            unknown = remove_citations(unknown)
+            unknowns.append(unknown)
+        if len(unknowns) == 1: unknowns = unknowns[0]
+        return positive, negative, unknowns
     else:
         return positive, negative
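
The rewritten load_texts above now accepts either a single filename or a list of filenames for unknown_target, returning one cleaned string or a list of them accordingly. A sketch of both call patterns, with illustrative file names and a path variable as in the script above:

    # single target: returns one cleaned string (backwards compatible)
    positive, negative, unknown = load_texts(path, positive_author='Dante',
                                             unknown_target='EpistolaXIII_1.txt')

    # list of targets: returns a list of cleaned strings, one per file
    targets = ['EpistolaXIII_1.txt'] + [f'EpistolaXIII_1_{p}.txt' for p in range(1, 3)]
    positive, negative, unknowns = load_texts(path, positive_author='Dante',
                                              unknown_target=targets)

Note that a one-element list also collapses back to a single string because of the len(unknowns) == 1 check, so callers that need a list unconditionally should guard for that case.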
@@ -387,12 +387,12 @@ class FeatureExtractor:
         return X, y, groups

-    def transform(self, test, return_fragments=False, window_size=-1):
+    def transform(self, test, return_fragments=False, window_size=-1, avoid_splitting=False):
         test = [test]
         if window_size==-1:
             window_size = self.window_size

-        if self.split_documents:
+        if self.split_documents and not avoid_splitting:
             tests, _ = splitter(test, split_policy=self.split_policy, window_size=window_size)
             test.extend(tests)
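
The new avoid_splitting flag makes transform vectorize the test text as a single document even when the extractor was built with split_documents=True; the attribution script relies on this to obtain exactly one probability per target file. A sketch, assuming a fitted feature_extractor and a target_text as in the script above:

    # default behaviour: the text is also split into sentence windows,
    # yielding one feature row for the full text plus one per fragment
    X_full_and_fragments = feature_extractor.transform(target_text)

    # avoid_splitting=True: a single feature row for the whole text
    X_single = feature_extractor.transform(target_text, avoid_splitting=True)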
@@ -39,9 +39,11 @@ class AuthorshipVerificator:
     def __init__(self, nfolds=10,
                  params = {'C': np.logspace(-4,+4,9), 'class_weight':['balanced',None]},
-                 estimator=SVC):
+                 estimator=SVC,
+                 author_name=None):
         self.nfolds = nfolds
         self.params = params
+        self.author_name = author_name if author_name else 'this author'
         if estimator is SVC:
             self.params['kernel'] = ['linear', 'rbf']
             self.probability = True
@@ -117,7 +119,7 @@ class AuthorshipVerificator:
         assert self.probability, 'svm is not calibrated'
         pred = self.estimator.predict_proba(test)
         full_doc_prediction = pred[0,1]
-        print('{} is from the same author: {}'.format(epistola_name, full_doc_prediction))
+        print(f'{epistola_name} is from {self.author_name} with Probability {full_doc_prediction:.3f}')
         if len(pred) > 1:
             fragment_predictions = pred[1:,1]
             print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
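
Together, the two hunks above let the verificator report probabilities under the candidate author's name, falling back to 'this author' when author_name is omitted. A sketch of the resulting call pattern, with training variables as in the attribution script (the fragment_probs name is illustrative; the script itself discards the second return value, and the printed probability is an example):

    av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name='Dante')
    av.fit(Xtr, ytr, groups)

    # prints e.g.: EpistolaXIII_1.txt is from Dante with Probability 0.873
    prob, fragment_probs = av.predict_proba(ep, epistola_name='EpistolaXIII_1.txt')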
BIN ~$perimenti.docx
Binary file not shown.
BIN ~WRL3794.tmp
Binary file not shown.