more cleaning

This commit is contained in:
Alejandro Moreo Fernandez 2019-05-08 10:04:02 +02:00
parent 3fe91df7a9
commit e9e93ac3f0
8 changed files with 129 additions and 7 deletions

114
src/author_attribution.py Normal file
View File

@ -0,0 +1,114 @@
from sklearn.linear_model import LogisticRegression
from data.dante_loader import load_texts
from data.features import *
from model import AuthorshipVerificator, f1_from_counters
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
def plot_attribution(path, authors, attributions, paragraph_offset=1):
    """Render an authors-by-paragraphs attribution heatmap and save it to *path*.

    :param path: output file path handed to ``plt.savefig`` (extension selects format, e.g. ``.pdf``)
    :param authors: sequence of author names, one per row of *attributions*
    :param attributions: 2D array-like of shape (len(authors), 1 + n_paragraphs);
        column 0 is the full-document score, remaining columns are per-paragraph scores
    :param paragraph_offset: label of the first paragraph column (paragraphs are
        labelled paragraph_offset, paragraph_offset+1, ...)
    """
    # Column 0 holds the whole-document attribution; the rest are numbered paragraphs.
    paragraphs = ["Full"] + [f'{paragraph_offset + i}' for i in range(attributions.shape[1] - 1)]

    fig, ax = plt.subplots()
    ax.imshow(attributions)

    # Show one tick per column/row and label them with paragraph/author names.
    ax.set_xticks(np.arange(len(paragraphs)))
    ax.set_yticks(np.arange(len(authors)))
    ax.set_xticklabels(paragraphs)
    ax.set_yticklabels(authors)

    # Rotate the x labels so long paragraph labels don't overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # Annotate every cell with its attribution value.
    for i in range(len(authors)):
        for j in range(len(paragraphs)):
            ax.text(j, i, f'{attributions[i, j]:.2f}', ha="center", va="center", color="w")

    ax.set_title("Attribution matrix")
    fig.tight_layout()
    plt.savefig(path)
import sys

# ---------------------------------------------------------------------------
# NOTE(review): this debug path was left enabled in the commit — it only
# re-plots a previously saved attribution matrix and exits, so the pipeline
# below never runs. Behavior is preserved; flip PLOT_ONLY to False to
# recompute the attributions.
# ---------------------------------------------------------------------------
PLOT_ONLY = True
if PLOT_ONLY:
    authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
    attributions = np.load('attribution_ep1.npy')
    plot_attribution('plot1.pdf', authors, attributions)
    sys.exit(0)

# Rows of the final matrix: one per candidate author (paragraph scores per row).
author_attribution = []
for epistola in [1]:
    print(f'Epistola {epistola}')
    print('=' * 80)
    path = f'../testi_{epistola}'

    # Candidate authors and the paragraph numbering differ between the two epistolas.
    if epistola == 1:
        authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
        paragraphs = range(1, 3)
    else:
        authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna', 'ClaraAssisiensis',
                   'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
                   'GrazioloBambaglioli', 'GuidoDaPisa',
                   'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
                   'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet', 'PierDellaVigna',
                   'PietroAlighieri', 'RaimundusLullus',
                   'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
        paragraphs = range(13, 90)

    # Authors skipped for lack of training material (fewer than 2 positive docs).
    discarded = 0

    for i, author in enumerate(authors):
        print('=' * 80)
        print('Authorship Identification for {} (complete {}/{})'.format(author, i, len(authors)))
        print('Corpus of Epistola {}'.format(epistola))
        print('=' * 80)

        # Target texts: the full epistola first, then each individual paragraph.
        target = [f'EpistolaXIII_{epistola}.txt'] + \
                 [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs]
        positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=target)

        # Need at least two positive documents to train a verificator.
        if len(positive) < 2:
            discarded += 1
            continue

        feature_extractor = FeatureExtractor(function_words_freq='latin',
                                             conjugations_freq='latin',
                                             features_Mendenhall=True,
                                             features_sentenceLengths=True,
                                             tfidf_feat_selection_ratio=0.1,
                                             wordngrams=True, n_wordngrams=(1, 2),
                                             charngrams=True, n_charngrams=(3, 4, 5),
                                             preserve_punctuation=False,
                                             split_documents=True, split_policy=split_by_sentences,
                                             window_size=3,
                                             normalize_features=True)

        Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)

        print('Fitting the Verificator')
        av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name=author)
        av.fit(Xtr, ytr, groups)

        # Score each target text (full document + every paragraph) for this author.
        # NOTE: loop index renamed from `i` to avoid shadowing the author index above.
        attributions = []
        for t, target_text in enumerate(ep_texts):
            ep = feature_extractor.transform(target_text, avoid_splitting=True)
            prob, _ = av.predict_proba(ep, epistola_name=target[t])
            attributions.append(prob)

        author_attribution.append(attributions)

    # Persist the (authors x targets) probability matrix for later plotting.
    author_attribution = np.asarray(author_attribution)
    attribution_path = f'attribution_ep{epistola}.npy'
    print(f'saving attribution matrix of shape {author_attribution.shape} in {attribution_path}')
    np.save(attribution_path, author_attribution)

View File

@ -49,9 +49,15 @@ def load_texts(path, positive_author='Dante', unknown_target=None):
# load the test data (Epistolas 1 and 2)
if unknown_target:
unknown = open(join(path, unknown_target), encoding="utf8").read()
unknown = remove_citations(unknown)
return positive, negative, unknown
if isinstance(unknown_target, str):
unknown_target = [unknown_target]
unknowns = []
for unknown_text in unknown_target:
unknown = open(join(path, unknown_text), encoding="utf8").read()
unknown = remove_citations(unknown)
unknowns.append(unknown)
if len(unknowns) == 1: unknowns = unknowns[0]
return positive, negative, unknowns
else:
return positive, negative

View File

@ -387,12 +387,12 @@ class FeatureExtractor:
return X, y, groups
def transform(self, test, return_fragments=False, window_size=-1):
def transform(self, test, return_fragments=False, window_size=-1, avoid_splitting=False):
test = [test]
if window_size==-1:
window_size = self.window_size
if self.split_documents:
if self.split_documents and not avoid_splitting:
tests, _ = splitter(test, split_policy=self.split_policy, window_size=window_size)
test.extend(tests)

View File

@ -39,9 +39,11 @@ class AuthorshipVerificator:
def __init__(self, nfolds=10,
params = {'C': np.logspace(-4,+4,9), 'class_weight':['balanced',None]},
estimator=SVC):
estimator=SVC,
author_name=None):
self.nfolds = nfolds
self.params = params
self.author_name = author_name if author_name else 'this author'
if estimator is SVC:
self.params['kernel'] = ['linear', 'rbf']
self.probability = True
@ -117,7 +119,7 @@ class AuthorshipVerificator:
assert self.probability, 'svm is not calibrated'
pred = self.estimator.predict_proba(test)
full_doc_prediction = pred[0,1]
print('{} is from the same author: {}'.format(epistola_name, full_doc_prediction))
print(f'{epistola_name} is from {self.author_name} with Probability {full_doc_prediction:.3f}')
if len(pred) > 1:
fragment_predictions = pred[1:,1]
print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))

Binary file not shown.

Binary file not shown.