more cleaning
parent 3fe91df7a9
commit e9e93ac3f0
@@ -0,0 +1,114 @@
+from sklearn.linear_model import LogisticRegression
+from data.dante_loader import load_texts
+from data.features import *
+from model import AuthorshipVerificator, f1_from_counters
+import numpy as np
+import matplotlib
+import matplotlib.pyplot as plt
+
+
+def plot_attribution(path, authors, attributions, paragraph_offset=1):
+    # column 0 scores the full epistle; the remaining columns score individual paragraphs
+    paragraphs = ["Full"] + [f'{paragraph_offset+i}' for i in range(attributions.shape[1]-1)]
+
+    fig, ax = plt.subplots()
+    im = ax.imshow(attributions)
+
+    # We want to show all ticks...
+    ax.set_xticks(np.arange(len(paragraphs)))
+    ax.set_yticks(np.arange(len(authors)))
+    # ... and label them with the respective list entries
+    ax.set_xticklabels(paragraphs)
+    ax.set_yticklabels(authors)
+
+    # Rotate the tick labels and set their alignment.
+    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
+
+    # Loop over data dimensions and create text annotations.
+    for i in range(len(authors)):
+        for j in range(len(paragraphs)):
+            ax.text(j, i, f'{attributions[i, j]:.2f}', ha="center", va="center", color="w")
+
+    ax.set_title("Attribution matrix")
+    fig.tight_layout()
+    # plt.show()
+    plt.savefig(path)
+
+
+# plot a previously computed attribution matrix for Epistola 1, then exit
+# (the attribution-computing loop below is skipped by the early sys.exit)
+import sys
+authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
+attributions = np.load('attribution_ep1.npy')
+plot_attribution('plot1.pdf', authors, attributions)
+sys.exit(0)
+
+author_attribution = []
+for epistola in [1]:
+
+    print(f'Epistola {epistola}')
+    print('='*80)
+    path = f'../testi_{epistola}'
+
+    if epistola == 1:
+        authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
+        paragraphs = range(1, 3)
+    else:
+        authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna', 'ClaraAssisiensis',
+                   'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
+                   'GrazioloBambaglioli', 'GuidoDaPisa',
+                   'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
+                   'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet', 'PierDellaVigna',
+                   'PietroAlighieri', 'RaimundusLullus',
+                   'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
+        paragraphs = range(13, 90)
+
+    discarded = 0
+    f1_scores = []
+    counters = []
+    for i, author in enumerate(authors):
+        print('=' * 80)
+        print('Authorship Identification for {} (complete {}/{})'.format(author, i, len(authors)))
+        print('Corpus of Epistola {}'.format(epistola))
+        print('=' * 80)
+
+        target = [f'EpistolaXIII_{epistola}.txt'] + [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs]
+        positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=target)
+        if len(positive) < 2:  # too few positive documents to train a verifier for this author
+            discarded += 1
+            continue
+
+        n_full_docs = len(positive) + len(negative)
+
+        feature_extractor = FeatureExtractor(function_words_freq='latin',
+                                             conjugations_freq='latin',
+                                             features_Mendenhall=True,
+                                             features_sentenceLengths=True,
+                                             tfidf_feat_selection_ratio=0.1,
+                                             wordngrams=True, n_wordngrams=(1, 2),
+                                             charngrams=True, n_charngrams=(3, 4, 5),
+                                             preserve_punctuation=False,
+                                             split_documents=True, split_policy=split_by_sentences, window_size=3,
+                                             normalize_features=True)
+
+        Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
+
+        print('Fitting the Verificator')
+        av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name=author)
+        av.fit(Xtr, ytr, groups)
+
+        attributions = []
+        for j, target_text in enumerate(ep_texts):  # j, not i: the author index is still in use
+            ep = feature_extractor.transform(target_text, avoid_splitting=True)
+            prob, _ = av.predict_proba(ep, epistola_name=target[j])
+            attributions.append(prob)
+        author_attribution.append(attributions)
+
+    author_attribution = np.asarray(author_attribution)
+    attribution_path = f'attribution_ep{epistola}.npy'
+    print(f'saving attribution matrix of shape {author_attribution.shape} in {attribution_path}')
+    np.save(attribution_path, author_attribution)
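
For orientation, a minimal sketch of how the new plot_attribution helper can be exercised on synthetic data, assuming the function is in scope. The matrix shape follows the convention used above (one row per candidate author; column 0 scores the full epistle, the remaining columns score individual paragraphs); the output file name and author subset are illustrative:

    import numpy as np

    # hypothetical attribution matrix: 3 authors x (1 full-document + 4 paragraph) scores in [0, 1]
    rng = np.random.default_rng(0)
    demo_attributions = rng.uniform(size=(3, 5))
    demo_authors = ['Dante', 'GuidoFaba', 'PierDellaVigna']

    # paragraph_offset shifts the column labels, e.g. 13 for a text whose paragraphs start at 13
    plot_attribution('demo_plot.pdf', demo_authors, demo_attributions, paragraph_offset=1)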
@@ -49,9 +49,15 @@ def load_texts(path, positive_author='Dante', unknown_target=None):
     # load the test data (Epistolas 1 and 2)
     if unknown_target:
-        unknown = open(join(path, unknown_target), encoding="utf8").read()
-        unknown = remove_citations(unknown)
-        return positive, negative, unknown
+        if isinstance(unknown_target, str):
+            unknown_target = [unknown_target]
+        unknowns = []
+        for unknown_text in unknown_target:
+            unknown = open(join(path, unknown_text), encoding="utf8").read()
+            unknown = remove_citations(unknown)
+            unknowns.append(unknown)
+        if len(unknowns) == 1: unknowns = unknowns[0]
+        return positive, negative, unknowns
     else:
         return positive, negative
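
The rewritten load_texts above now accepts either a single filename or a list of filenames for unknown_target, returning one cleaned string or a list of them accordingly. A sketch of both call patterns, with illustrative file names and a path variable as in the script above:

    # single target: returns one cleaned string (backwards compatible)
    positive, negative, unknown = load_texts(path, positive_author='Dante',
                                             unknown_target='EpistolaXIII_1.txt')

    # list of targets: returns a list of cleaned strings, one per file
    targets = ['EpistolaXIII_1.txt'] + [f'EpistolaXIII_1_{p}.txt' for p in range(1, 3)]
    positive, negative, unknowns = load_texts(path, positive_author='Dante',
                                              unknown_target=targets)

Note that a one-element list also collapses back to a single string because of the len(unknowns) == 1 check, so callers that need a list unconditionally should guard for that case.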
@@ -387,12 +387,12 @@ class FeatureExtractor:
         return X, y, groups

-    def transform(self, test, return_fragments=False, window_size=-1):
+    def transform(self, test, return_fragments=False, window_size=-1, avoid_splitting=False):
         test = [test]
         if window_size==-1:
             window_size = self.window_size

-        if self.split_documents:
+        if self.split_documents and not avoid_splitting:
             tests, _ = splitter(test, split_policy=self.split_policy, window_size=window_size)
             test.extend(tests)
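
The new avoid_splitting flag makes transform vectorize the test text as a single document even when the extractor was built with split_documents=True; the attribution script relies on this to obtain exactly one probability per target file. A sketch, assuming a fitted feature_extractor and a target_text as in the script above:

    # default behaviour: the text is also split into sentence windows,
    # yielding one feature row for the full text plus one per fragment
    X_full_and_fragments = feature_extractor.transform(target_text)

    # avoid_splitting=True: a single feature row for the whole text
    X_single = feature_extractor.transform(target_text, avoid_splitting=True)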
@@ -39,9 +39,11 @@ class AuthorshipVerificator:
     def __init__(self, nfolds=10,
                  params = {'C': np.logspace(-4,+4,9), 'class_weight':['balanced',None]},
-                 estimator=SVC):
+                 estimator=SVC,
+                 author_name=None):
         self.nfolds = nfolds
         self.params = params
+        self.author_name = author_name if author_name else 'this author'
         if estimator is SVC:
             self.params['kernel'] = ['linear', 'rbf']
             self.probability = True
@@ -117,7 +119,7 @@ class AuthorshipVerificator:
         assert self.probability, 'svm is not calibrated'
         pred = self.estimator.predict_proba(test)
         full_doc_prediction = pred[0,1]
-        print('{} is from the same author: {}'.format(epistola_name, full_doc_prediction))
+        print(f'{epistola_name} is from {self.author_name} with Probability {full_doc_prediction:.3f}')
         if len(pred) > 1:
             fragment_predictions = pred[1:,1]
             print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
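
Together, the two hunks above let the verificator report probabilities under the candidate author's name, falling back to 'this author' when author_name is omitted. A sketch of the resulting call pattern, with training variables as in the attribution script (the fragment_probs name is illustrative; the script itself discards the second return value, and the printed probability is an example):

    av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name='Dante')
    av.fit(Xtr, ytr, groups)

    # prints e.g.: EpistolaXIII_1.txt is from Dante with Probability 0.873
    prob, fragment_probs = av.predict_proba(ep, epistola_name='EpistolaXIII_1.txt')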
BIN ~$perimenti.docx
Binary file not shown.
BIN ~WRL3794.tmp
Binary file not shown.