print misclassified
This commit is contained in:
parent
56770446bd
commit
23e62162b5
|
|
@ -59,29 +59,29 @@ def plot_attribution(path, authors, attributions, paragraph_offset=1, figsize=(5
|
|||
# plt.show()
|
||||
plt.savefig(path)
|
||||
|
||||
import sys
|
||||
for epistola in [1]:
|
||||
if epistola == 1:
|
||||
authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
|
||||
paragraph_offset = 1
|
||||
figsize=(3,9)
|
||||
label_offset=0.2
|
||||
|
||||
else:
|
||||
authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna',
|
||||
'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
|
||||
'GrazioloBambaglioli', 'GuidoDaPisa',
|
||||
'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
|
||||
'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet',
|
||||
'PietroAlighieri', 'RaimundusLullus',
|
||||
'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
|
||||
paragraph_offset = 14
|
||||
figsize = (6,20)
|
||||
label_offset=0.3
|
||||
|
||||
attributions = np.load(f'attribution_ep{epistola}.npy')
|
||||
plot_attribution(f'plot{epistola}.png', authors, attributions, paragraph_offset=paragraph_offset, figsize=figsize, label_offset=label_offset)
|
||||
sys.exit(0)
|
||||
# import sys
|
||||
# for epistola in [1]:
|
||||
# if epistola == 1:
|
||||
# authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
|
||||
# paragraph_offset = 1
|
||||
# figsize=(3,9)
|
||||
# label_offset=0.2
|
||||
#
|
||||
# else:
|
||||
# authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna',
|
||||
# 'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
|
||||
# 'GrazioloBambaglioli', 'GuidoDaPisa',
|
||||
# 'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
|
||||
# 'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet',
|
||||
# 'PietroAlighieri', 'RaimundusLullus',
|
||||
# 'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
|
||||
# paragraph_offset = 14
|
||||
# figsize = (6,20)
|
||||
# label_offset=0.3
|
||||
#
|
||||
# attributions = np.load(f'attribution_ep{epistola}.npy')
|
||||
# plot_attribution(f'plot{epistola}.png', authors, attributions, paragraph_offset=paragraph_offset, figsize=figsize, label_offset=label_offset)
|
||||
# sys.exit(0)
|
||||
|
||||
for epistola in [1]:
|
||||
|
||||
|
|
@ -116,7 +116,7 @@ for epistola in [1]:
|
|||
print('=' * 80)
|
||||
|
||||
target = [f'EpistolaXIII_{epistola}.txt'] + [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs]
|
||||
positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=target)
|
||||
positive, negative, _, _, ep_texts = load_texts(path, positive_author=author, unknown_target=target)
|
||||
# if len(positive) < 2:
|
||||
# discarded += 1
|
||||
# continue
|
||||
|
|
|
|||
|
|
@ -111,7 +111,7 @@ for epistola in [1]:
|
|||
print('=' * 80)
|
||||
|
||||
target = [f'Epistola_ArigoVII.txt'] + [f'Epistola_ArigoVII_{paragraph}.txt' for paragraph in range(1,6)]
|
||||
positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
|
||||
positive, negative, _, _, ep_texts = load_texts(path, positive_author=author, unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
|
||||
|
||||
n_full_docs = len(positive) + len(negative)
|
||||
|
||||
|
|
|
|||
|
|
@ -37,7 +37,8 @@ for epistola in [1]:
|
|||
if epistola==2:
|
||||
path+='_interaEpistola'
|
||||
|
||||
positive, negative, ep_text = load_texts(path, positive_author=author, unknown_target='EpistolaXIII_{}.txt'.format(epistola))
|
||||
positive, negative, pos_files, neg_files, ep_text = load_texts(path, positive_author=author, unknown_target='EpistolaXIII_{}.txt'.format(epistola))
|
||||
files = np.asarray(pos_files + neg_files)
|
||||
if len(positive) < 2:
|
||||
discarded+=1
|
||||
continue
|
||||
|
|
@ -55,6 +56,7 @@ for epistola in [1]:
|
|||
split_documents=True, split_policy=split_by_sentences, window_size=3,
|
||||
normalize_features=True)
|
||||
|
||||
|
||||
Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative)
|
||||
print(ytr)
|
||||
|
||||
|
|
@ -64,7 +66,7 @@ for epistola in [1]:
|
|||
av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
|
||||
av.fit(Xtr,ytr,groups)
|
||||
|
||||
score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=True, counters=True)
|
||||
score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, files, groups, test_lowest_index_only=True, counters=True)
|
||||
# print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std))
|
||||
f1_scores.append(f1_from_counters(tp, fp, fn, tn))
|
||||
counters.append((tp, fp, fn, tn))
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@ import os
|
|||
# TODO: sentence length (Mendenhall-style) ?
|
||||
|
||||
|
||||
for epistola in [1]:
|
||||
for epistola in [2]:
|
||||
|
||||
print('Epistola {}'.format(epistola))
|
||||
print('='*80)
|
||||
|
|
@ -26,7 +26,7 @@ for epistola in [1]:
|
|||
paragraphs = range(14, 91)
|
||||
|
||||
target = [f'EpistolaXIII_{epistola}.txt'] + [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs]
|
||||
positive, negative, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target)
|
||||
positive, negative, _, _, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target)
|
||||
|
||||
pickle_file = f'../dante_color/epistola{epistola}.pkl'
|
||||
if os.path.exists(pickle_file):
|
||||
|
|
@ -35,6 +35,7 @@ for epistola in [1]:
|
|||
for prob,text in zip(probabilities,ep_texts):
|
||||
text = text.replace('\n','')
|
||||
print(f"{prob:.3f}:{text}")
|
||||
print(f'media={np.asarray(probabilities[1:]).mean()}')
|
||||
else:
|
||||
print(f'generating pickle file')
|
||||
n_full_docs = len(positive) + len(negative)
|
||||
|
|
@ -50,6 +51,7 @@ for epistola in [1]:
|
|||
split_documents=True, split_policy=split_by_sentences, window_size=3,
|
||||
normalize_features=True)
|
||||
|
||||
|
||||
Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative)
|
||||
print(ytr)
|
||||
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@ for epistola in [1,2,3]: #3 means "both Ep1 and Ep2 corpora"
|
|||
paragraphs = range(1, 6)
|
||||
|
||||
target = [f'Epistola_ArigoVII.txt'] + [f'Epistola_ArigoVII_{paragraph}.txt' for paragraph in paragraphs]
|
||||
positive, negative, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
|
||||
positive, negative, _, _, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
|
||||
|
||||
pickle_file = f'../dante_color/epistola{epistola}_xiv.pkl'
|
||||
if os.path.exists(pickle_file):
|
||||
|
|
|
|||
|
|
@ -3,6 +3,8 @@ from os.path import join
|
|||
import re
|
||||
import collections
|
||||
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------
|
||||
# document loading routine
|
||||
# ------------------------------------------------------------------------
|
||||
|
|
@ -30,7 +32,8 @@ def remove_citations(doc):
|
|||
|
||||
def load_texts(path, positive_author='Dante', unknown_target=None, train_skip_prefix='EpistolaXIII_'):
|
||||
# load the training data (all documents but Epistolas 1 and 2)
|
||||
positive,negative = [],[]
|
||||
positive, negative = [], []
|
||||
files_positive, files_negative = [], []
|
||||
authors = []
|
||||
ndocs=0
|
||||
for file in os.listdir(path):
|
||||
|
|
@ -42,8 +45,10 @@ def load_texts(path, positive_author='Dante', unknown_target=None, train_skip_pr
|
|||
|
||||
if author == positive_author:
|
||||
positive.append(text)
|
||||
files_positive.append(file)
|
||||
else:
|
||||
negative.append(text)
|
||||
files_negative.append(file)
|
||||
authors.append(author)
|
||||
ndocs+=1
|
||||
|
||||
|
|
@ -57,10 +62,10 @@ def load_texts(path, positive_author='Dante', unknown_target=None, train_skip_pr
|
|||
unknown = remove_citations(unknown)
|
||||
unknowns.append(unknown)
|
||||
if len(unknowns) == 1: unknowns = unknowns[0]
|
||||
return positive, negative, unknowns
|
||||
return positive, negative, files_positive, files_negative, unknowns
|
||||
|
||||
else:
|
||||
return positive, negative
|
||||
return positive, negative, files_positive, files_negative
|
||||
|
||||
|
||||
def ___list_texts(path):
|
||||
|
|
|
|||
|
|
@ -80,7 +80,7 @@ class AuthorshipVerificator:
|
|||
|
||||
return self
|
||||
|
||||
def leave_one_out(self, X, y, groups=None, test_lowest_index_only=True, counters=False):
|
||||
def leave_one_out(self, X, y, files, groups=None, test_lowest_index_only=True, counters=False):
|
||||
|
||||
if groups is None:
|
||||
print('Computing LOO without groups')
|
||||
|
|
@ -94,7 +94,10 @@ class AuthorshipVerificator:
|
|||
folds = [(train, np.min(test, keepdims=True)) for train, test in folds]
|
||||
|
||||
scores = cross_val_score(self.estimator, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
|
||||
missclassified = '\n'.join(files[scores==0].tolist())
|
||||
print(scores)
|
||||
print(missclassified)
|
||||
|
||||
if counters and test_lowest_index_only:
|
||||
yfull_true = y[:len(folds)]
|
||||
yfull_predict = np.zeros_like(yfull_true)
|
||||
|
|
|
|||
Loading…
Reference in New Issue