diff --git a/src/author_attribution.py b/src/author_attribution.py
new file mode 100644
index 0000000..6de0ba9
--- /dev/null
+++ b/src/author_attribution.py
@@ -0,0 +1,158 @@
+"""Authorship attribution for the Epistola XIII corpora.
+
+For each candidate author a verifier is trained and queried with the target
+texts (the full epistola followed by its paragraphs).  The probabilities are
+collected in an authors x targets matrix, saved as .npy and plotted.
+"""
+from sklearn.linear_model import LogisticRegression
+from data.dante_loader import load_texts
+from data.features import FeatureExtractor, split_by_sentences
+from model import AuthorshipVerificator, f1_from_counters
+import numpy as np
+import matplotlib
+import matplotlib.pyplot as plt
+import argparse
+
+
+def plot_attribution(path, authors, attributions, paragraph_offset=1):
+    """Save the attribution matrix as an annotated heat-map.
+
+    :param path: output file passed to ``savefig``
+    :param authors: row labels, one per row of ``attributions``
+    :param attributions: 2D array; column 0 is labelled "Full" and the other
+        columns are numbered starting at ``paragraph_offset``
+    :param paragraph_offset: label of the first paragraph column
+    :raises ValueError: if ``attributions`` is not a 2D array with one row
+        per author
+    """
+    attributions = np.asarray(attributions)
+    if attributions.ndim != 2 or attributions.shape[0] != len(authors):
+        raise ValueError(f'expected a 2D matrix with {len(authors)} rows, '
+                         f'got shape {attributions.shape}')
+
+    paragraphs = ['Full'] + [f'{paragraph_offset + i}' for i in range(attributions.shape[1] - 1)]
+
+    fig, ax = plt.subplots()
+    ax.imshow(attributions)
+
+    # Show every tick and label it with the corresponding list entry.
+    ax.set_xticks(np.arange(len(paragraphs)))
+    ax.set_yticks(np.arange(len(authors)))
+    ax.set_xticklabels(paragraphs)
+    ax.set_yticklabels(authors)
+
+    # Rotate the tick labels and set their alignment.
+    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
+
+    # Annotate every cell with its value.
+    for (i, j), value in np.ndenumerate(attributions):
+        ax.text(j, i, f'{value:.2f}', ha="center", va="center", color="w")
+
+    ax.set_title("Attribution matrix")
+    fig.tight_layout()
+    fig.savefig(path)
+    plt.close(fig)  # release the figure so repeated calls do not pile up
+
+
+def corpus_setup(epistola):
+    """Return ``(authors, paragraphs)`` for the corpus of ``epistola``.
+
+    ``paragraphs`` is the range of paragraph numbers used to name the
+    per-paragraph target files.
+    """
+    if epistola == 1:
+        authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
+        paragraphs = range(1, 3)
+    else:
+        authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna', 'ClaraAssisiensis',
+                   'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
+                   'GrazioloBambaglioli', 'GuidoDaPisa',
+                   'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
+                   'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet', 'PierDellaVigna',
+                   'PietroAlighieri', 'RaimundusLullus',
+                   'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
+        paragraphs = range(13, 90)
+    return authors, paragraphs
+
+
+def compute_attributions(epistola):
+    """Train one verifier per candidate author and score every target text.
+
+    Authors for which ``load_texts`` yields fewer than two positive texts are
+    skipped, so the returned author list can be shorter than the corpus list.
+
+    :return: ``(kept_authors, matrix)``; row ``a`` of ``matrix`` holds the
+        first value returned by ``predict_proba`` for each target (target 0
+        is the full epistola) under the verifier of ``kept_authors[a]``
+    """
+    print(f'Epistola {epistola}')
+    print('=' * 80)
+    path = f'../testi_{epistola}'
+    authors, paragraphs = corpus_setup(epistola)
+    target = [f'EpistolaXIII_{epistola}.txt']
+    target += [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs]
+
+    kept_authors = []
+    author_attribution = []
+    for i, author in enumerate(authors):
+        print('=' * 80)
+        print('Authorship Identification for {} (complete {}/{})'.format(author, i, len(authors)))
+        print('Corpus of Epistola {}'.format(epistola))
+        print('=' * 80)
+
+        positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=target)
+        if len(positive) < 2:
+            print(f'Skipping {author}: fewer than 2 positive texts')
+            continue
+
+        feature_extractor = FeatureExtractor(function_words_freq='latin',
+                                             conjugations_freq='latin',
+                                             features_Mendenhall=True,
+                                             features_sentenceLengths=True,
+                                             tfidf_feat_selection_ratio=0.1,
+                                             wordngrams=True, n_wordngrams=(1, 2),
+                                             charngrams=True, n_charngrams=(3, 4, 5),
+                                             preserve_punctuation=False,
+                                             split_documents=True, split_policy=split_by_sentences, window_size=3,
+                                             normalize_features=True)
+
+        Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
+
+        print('Fitting the Verificator')
+        av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name=author)
+        av.fit(Xtr, ytr, groups)
+
+        attributions = []
+        # Pair each target text with its file name; keeps the outer index intact.
+        for target_name, target_text in zip(target, ep_texts):
+            ep = feature_extractor.transform(target_text, avoid_splitting=True)
+            prob, _ = av.predict_proba(ep, epistola_name=target_name)
+            attributions.append(prob)
+        author_attribution.append(attributions)
+        kept_authors.append(author)
+
+    return kept_authors, np.asarray(author_attribution)
+
+
+def main():
+    """Plot the saved matrices, recomputing them first when asked to."""
+    parser = argparse.ArgumentParser(description='Attribution matrix for the Epistola XIII corpora')
+    parser.add_argument('--compute', action='store_true',
+                        help='train the verifiers and save the matrix before plotting '
+                             '(default: plot the matrix already saved on disk)')
+    args = parser.parse_args()
+
+    for epistola in [1]:
+        attribution_path = f'attribution_ep{epistola}.npy'
+        authors, paragraphs = corpus_setup(epistola)
+        if args.compute:
+            authors, attributions = compute_attributions(epistola)
+            print(f'saving attribution matrix of shape {attributions.shape} in {attribution_path}')
+            np.save(attribution_path, attributions)
+        else:
+            attributions = np.load(attribution_path)
+        plot_attribution(f'plot{epistola}.pdf', authors, attributions, paragraph_offset=paragraphs[0])
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/data/__pycache__/dante_loader.cpython-36.pyc b/src/data/__pycache__/dante_loader.cpython-36.pyc
deleted file mode 100644
index b67149f..0000000
Binary files a/src/data/__pycache__/dante_loader.cpython-36.pyc and /dev/null differ
diff --git a/src/data/__pycache__/features.cpython-36.pyc b/src/data/__pycache__/features.cpython-36.pyc
deleted file mode 100644
index 0adc9f3..0000000
Binary files a/src/data/__pycache__/features.cpython-36.pyc and /dev/null differ
diff --git a/src/data/dante_loader.py b/src/data/dante_loader.py
index 7ecc8f3..022d559 100644
--- a/src/data/dante_loader.py
+++ b/src/data/dante_loader.py
@@ -49,9 +49,15 @@ def load_texts(path, positive_author='Dante', unknown_target=None):
 
     # load the test data (Epistolas 1 and 2)
     if unknown_target:
-        unknown = open(join(path, unknown_target), encoding="utf8").read()
-        unknown = remove_citations(unknown)
-        return positive, negative, unknown
+        if isinstance(unknown_target, str):
+            unknown_target = [unknown_target]
+        unknowns = []
+        for unknown_text in unknown_target:
+            with open(join(path, unknown_text), encoding="utf8") as target_file:
+                unknown = remove_citations(target_file.read())
+            unknowns.append(unknown)
+        if len(unknowns) == 1: unknowns = unknowns[0]
+        return positive, negative, unknowns
     else:
         return positive, negative
 
diff --git a/src/data/features.py b/src/data/features.py
index d30009b..230e8ae 100644
--- a/src/data/features.py
+++ b/src/data/features.py
@@ -387,12 +387,12 @@ class FeatureExtractor:
 
         return X, y, groups
 
-    def transform(self, test, return_fragments=False, window_size=-1):
+    def transform(self, test, return_fragments=False, window_size=-1, avoid_splitting=False):
         test = [test]
         if window_size==-1:
             window_size = self.window_size
 
-        if self.split_documents:
+        if self.split_documents and not avoid_splitting:
             tests, _ = splitter(test, split_policy=self.split_policy, window_size=window_size)
             test.extend(tests)
 
diff --git a/src/model.py b/src/model.py
index 38ae9a1..e2d1267 100644
--- a/src/model.py
+++ b/src/model.py
@@ -39,9 +39,11 @@ class AuthorshipVerificator:
 
     def __init__(self, nfolds=10,
                  params = {'C': np.logspace(-4,+4,9), 'class_weight':['balanced',None]},
-                 estimator=SVC):
+                 estimator=SVC,
+                 author_name=None):
         self.nfolds = nfolds
         self.params = params
+        self.author_name = author_name if author_name else 'this author'
         if estimator is SVC:
             self.params['kernel'] = ['linear', 'rbf']
             self.probability = True
@@ -117,7 +119,7 @@ class AuthorshipVerificator:
         assert self.probability, 'svm is not calibrated'
         pred = self.estimator.predict_proba(test)
         full_doc_prediction = pred[0,1]
-        print('{} is from the same author: {}'.format(epistola_name, full_doc_prediction))
+        print(f'{epistola_name} is from {self.author_name} with Probability {full_doc_prediction:.3f}')
         if len(pred) > 1:
             fragment_predictions = pred[1:,1]
             print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
diff --git a/~$perimenti.docx b/~$perimenti.docx
deleted file mode 100644
index 7c3f6de..0000000
Binary files a/~$perimenti.docx and /dev/null differ
diff --git a/~WRL3794.tmp b/~WRL3794.tmp
deleted file mode 100644
index ad605a0..0000000
Binary files a/~WRL3794.tmp and /dev/null differ