more cleaning

parent 3fe91df7a9
commit e9e93ac3f0

@@ -0,0 +1,114 @@
from sklearn.linear_model import LogisticRegression
from data.dante_loader import load_texts
from data.features import *
from model import AuthorshipVerificator, f1_from_counters
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import sys


def plot_attribution(path, authors, attributions, paragraph_offset=1):
    # column 0 holds the full epistola, the remaining columns its paragraphs
    paragraphs = ["Full"] + [f'{paragraph_offset+i}' for i in range(attributions.shape[1]-1)]

    fig, ax = plt.subplots()
    im = ax.imshow(attributions)

    # show all ticks...
    ax.set_xticks(np.arange(len(paragraphs)))
    ax.set_yticks(np.arange(len(authors)))
    # ...and label them with the respective list entries
    ax.set_xticklabels(paragraphs)
    ax.set_yticklabels(authors)

    # rotate the tick labels and set their alignment
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # loop over data dimensions and create text annotations
    for i in range(len(authors)):
        for j in range(len(paragraphs)):
            ax.text(j, i, f'{attributions[i, j]:.2f}', ha="center", va="center", color="w")

    ax.set_title("Attribution matrix")
    fig.tight_layout()
    # plt.show()
    plt.savefig(path)


# plot a previously saved attribution matrix and exit;
# the verification loop below is skipped
authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
attributions = np.load('attribution_ep1.npy')
plot_attribution('plot1.pdf', authors, attributions)
sys.exit(0)

for epistola in [1]:
    print(f'Epistola {epistola}')
    print('='*80)
    path = f'../testi_{epistola}'

    if epistola == 1:
        authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
        paragraphs = range(1, 3)
    else:
        authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna', 'ClaraAssisiensis',
                   'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
                   'GrazioloBambaglioli', 'GuidoDaPisa',
                   'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
                   'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet', 'PierDellaVigna',
                   'PietroAlighieri', 'RaimundusLullus',
                   'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
        paragraphs = range(13, 90)

    author_attribution = []  # reset per epistola: one row of probabilities per author
    discarded = 0
    f1_scores = []
    counters = []
    for i, author in enumerate(authors):
        print('=' * 80)
        print('Authorship Identification for {} (complete {}/{})'.format(author, i, len(authors)))
        print('Corpus of Epistola {}'.format(epistola))
        print('=' * 80)

        target = [f'EpistolaXIII_{epistola}.txt'] + \
                 [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs]
        positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=target)
        if len(positive) < 2:
            discarded += 1
            continue

        n_full_docs = len(positive) + len(negative)

        feature_extractor = FeatureExtractor(function_words_freq='latin',
                                             conjugations_freq='latin',
                                             features_Mendenhall=True,
                                             features_sentenceLengths=True,
                                             tfidf_feat_selection_ratio=0.1,
                                             wordngrams=True, n_wordngrams=(1, 2),
                                             charngrams=True, n_charngrams=(3, 4, 5),
                                             preserve_punctuation=False,
                                             split_documents=True, split_policy=split_by_sentences, window_size=3,
                                             normalize_features=True)

        Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)

        print('Fitting the Verificator')
        av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name=author)
        av.fit(Xtr, ytr, groups)

        # score the full epistola and each of its paragraphs against this author
        attributions = []
        for j, target_text in enumerate(ep_texts):  # j: do not shadow the author index i
            ep = feature_extractor.transform(target_text, avoid_splitting=True)
            prob, _ = av.predict_proba(ep, epistola_name=target[j])
            attributions.append(prob)
        author_attribution.append(attributions)

    author_attribution = np.asarray(author_attribution)
    attribution_path = f'attribution_ep{epistola}.npy'
    print(f'saving attribution matrix of shape {author_attribution.shape} in {attribution_path}')
    np.save(attribution_path, author_attribution)
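For reference, a minimal usage sketch of plot_attribution (synthetic data, not part of this commit): the matrix has one row per candidate author, column 0 for the full epistola and one column per paragraph, with same-author probabilities as values.

import numpy as np
demo_authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
demo = np.random.rand(len(demo_authors), 3)  # full text + 2 paragraphs, random stand-in values
plot_attribution('demo.pdf', demo_authors, demo)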
@@ -49,9 +49,15 @@ def load_texts(path, positive_author='Dante', unknown_target=None):

     # load the test data (Epistolas 1 and 2)
     if unknown_target:
-        unknown = open(join(path, unknown_target), encoding="utf8").read()
-        unknown = remove_citations(unknown)
-        return positive, negative, unknown
+        if isinstance(unknown_target, str):
+            unknown_target = [unknown_target]
+        unknowns = []
+        for unknown_text in unknown_target:
+            unknown = open(join(path, unknown_text), encoding="utf8").read()
+            unknown = remove_citations(unknown)
+            unknowns.append(unknown)
+        if len(unknowns) == 1: unknowns = unknowns[0]
+        return positive, negative, unknowns
     else:
         return positive, negative
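A quick sketch of the new calling convention (filenames as used by the attribution script above; the single-string form behaves exactly as before):

# one target file: the third return value is a single cleaned text
positive, negative, unknown = load_texts(path, positive_author='Dante', unknown_target='EpistolaXIII_1.txt')
# several target files: the third return value is a list of cleaned texts
positive, negative, unknowns = load_texts(path, positive_author='Dante', unknown_target=['EpistolaXIII_1.txt', 'EpistolaXIII_1_1.txt'])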
@@ -387,12 +387,12 @@ class FeatureExtractor:

         return X, y, groups

-    def transform(self, test, return_fragments=False, window_size=-1):
+    def transform(self, test, return_fragments=False, window_size=-1, avoid_splitting=False):
         test = [test]
         if window_size==-1:
             window_size = self.window_size

-        if self.split_documents:
+        if self.split_documents and not avoid_splitting:
             tests, _ = splitter(test, split_policy=self.split_policy, window_size=window_size)
             test.extend(tests)
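A sketch of how the attribution script above uses the new avoid_splitting flag: the target text is vectorized as a single document instead of also being expanded into sentence-window fragments.

ep = feature_extractor.transform(target_text, avoid_splitting=True)  # one feature row for the whole text
ep_split = feature_extractor.transform(target_text)                  # whole text plus fragment rows when split_documents is set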
@@ -39,9 +39,11 @@ class AuthorshipVerificator:

     def __init__(self, nfolds=10,
                  params = {'C': np.logspace(-4,+4,9), 'class_weight':['balanced',None]},
-                 estimator=SVC):
+                 estimator=SVC,
+                 author_name=None):
         self.nfolds = nfolds
         self.params = params
+        self.author_name = author_name if author_name else 'this author'
         if estimator is SVC:
             self.params['kernel'] = ['linear', 'rbf']
             self.probability = True
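A minimal construction sketch matching the call in the attribution script; author_name is only used in log messages and falls back to 'this author' when omitted.

av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name='Dante')
av_svm = AuthorshipVerificator()  # SVC default: a kernel grid is added to params and probability estimates are enabled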
@@ -117,7 +119,7 @@ class AuthorshipVerificator:

         assert self.probability, 'svm is not calibrated'
         pred = self.estimator.predict_proba(test)
         full_doc_prediction = pred[0,1]
-        print('{} is from the same author: {}'.format(epistola_name, full_doc_prediction))
+        print(f'{epistola_name} is from {self.author_name} with Probability {full_doc_prediction:.3f}')
         if len(pred) > 1:
             fragment_predictions = pred[1:,1]
             print('fragments average {:.3f}, array={}'.format(fragment_predictions.mean(), fragment_predictions))
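For reference, the layout of the probability array this method reports, as the hunk shows: row 0 is the full document, any further rows are its fragments, and column 1 is the positive-class (same-author) probability.

pred = self.estimator.predict_proba(test)  # shape: (1 + n_fragments, 2)
full_doc_prediction = pred[0, 1]           # P(same author) for the whole document
fragment_predictions = pred[1:, 1]         # per-fragment probabilities, empty if the text was not split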
BIN ~$perimenti.docx (binary file not shown)
BIN ~WRL3794.tmp (binary file not shown)