print misclassified
This commit is contained in:
parent
56770446bd
commit
23e62162b5
|
|
@ -59,29 +59,29 @@ def plot_attribution(path, authors, attributions, paragraph_offset=1, figsize=(5
|
||||||
# plt.show()
|
# plt.show()
|
||||||
plt.savefig(path)
|
plt.savefig(path)
|
||||||
|
|
||||||
import sys
|
# import sys
|
||||||
for epistola in [1]:
|
# for epistola in [1]:
|
||||||
if epistola == 1:
|
# if epistola == 1:
|
||||||
authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
|
# authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
|
||||||
paragraph_offset = 1
|
# paragraph_offset = 1
|
||||||
figsize=(3,9)
|
# figsize=(3,9)
|
||||||
label_offset=0.2
|
# label_offset=0.2
|
||||||
|
#
|
||||||
else:
|
# else:
|
||||||
authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna',
|
# authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna',
|
||||||
'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
|
# 'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
|
||||||
'GrazioloBambaglioli', 'GuidoDaPisa',
|
# 'GrazioloBambaglioli', 'GuidoDaPisa',
|
||||||
'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
|
# 'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
|
||||||
'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet',
|
# 'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet',
|
||||||
'PietroAlighieri', 'RaimundusLullus',
|
# 'PietroAlighieri', 'RaimundusLullus',
|
||||||
'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
|
# 'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
|
||||||
paragraph_offset = 14
|
# paragraph_offset = 14
|
||||||
figsize = (6,20)
|
# figsize = (6,20)
|
||||||
label_offset=0.3
|
# label_offset=0.3
|
||||||
|
#
|
||||||
attributions = np.load(f'attribution_ep{epistola}.npy')
|
# attributions = np.load(f'attribution_ep{epistola}.npy')
|
||||||
plot_attribution(f'plot{epistola}.png', authors, attributions, paragraph_offset=paragraph_offset, figsize=figsize, label_offset=label_offset)
|
# plot_attribution(f'plot{epistola}.png', authors, attributions, paragraph_offset=paragraph_offset, figsize=figsize, label_offset=label_offset)
|
||||||
sys.exit(0)
|
# sys.exit(0)
|
||||||
|
|
||||||
for epistola in [1]:
|
for epistola in [1]:
|
||||||
|
|
||||||
|
|
@ -116,7 +116,7 @@ for epistola in [1]:
|
||||||
print('=' * 80)
|
print('=' * 80)
|
||||||
|
|
||||||
target = [f'EpistolaXIII_{epistola}.txt'] + [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs]
|
target = [f'EpistolaXIII_{epistola}.txt'] + [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs]
|
||||||
positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=target)
|
positive, negative, _, _, ep_texts = load_texts(path, positive_author=author, unknown_target=target)
|
||||||
# if len(positive) < 2:
|
# if len(positive) < 2:
|
||||||
# discarded += 1
|
# discarded += 1
|
||||||
# continue
|
# continue
|
||||||
|
|
|
||||||
|
|
@ -111,7 +111,7 @@ for epistola in [1]:
|
||||||
print('=' * 80)
|
print('=' * 80)
|
||||||
|
|
||||||
target = [f'Epistola_ArigoVII.txt'] + [f'Epistola_ArigoVII_{paragraph}.txt' for paragraph in range(1,6)]
|
target = [f'Epistola_ArigoVII.txt'] + [f'Epistola_ArigoVII_{paragraph}.txt' for paragraph in range(1,6)]
|
||||||
positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
|
positive, negative, _, _, ep_texts = load_texts(path, positive_author=author, unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
|
||||||
|
|
||||||
n_full_docs = len(positive) + len(negative)
|
n_full_docs = len(positive) + len(negative)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -37,7 +37,8 @@ for epistola in [1]:
|
||||||
if epistola==2:
|
if epistola==2:
|
||||||
path+='_interaEpistola'
|
path+='_interaEpistola'
|
||||||
|
|
||||||
positive, negative, ep_text = load_texts(path, positive_author=author, unknown_target='EpistolaXIII_{}.txt'.format(epistola))
|
positive, negative, pos_files, neg_files, ep_text = load_texts(path, positive_author=author, unknown_target='EpistolaXIII_{}.txt'.format(epistola))
|
||||||
|
files = np.asarray(pos_files + neg_files)
|
||||||
if len(positive) < 2:
|
if len(positive) < 2:
|
||||||
discarded+=1
|
discarded+=1
|
||||||
continue
|
continue
|
||||||
|
|
@ -55,6 +56,7 @@ for epistola in [1]:
|
||||||
split_documents=True, split_policy=split_by_sentences, window_size=3,
|
split_documents=True, split_policy=split_by_sentences, window_size=3,
|
||||||
normalize_features=True)
|
normalize_features=True)
|
||||||
|
|
||||||
|
|
||||||
Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative)
|
Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative)
|
||||||
print(ytr)
|
print(ytr)
|
||||||
|
|
||||||
|
|
@ -64,7 +66,7 @@ for epistola in [1]:
|
||||||
av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
|
av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
|
||||||
av.fit(Xtr,ytr,groups)
|
av.fit(Xtr,ytr,groups)
|
||||||
|
|
||||||
score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=True, counters=True)
|
score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, files, groups, test_lowest_index_only=True, counters=True)
|
||||||
# print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std))
|
# print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std))
|
||||||
f1_scores.append(f1_from_counters(tp, fp, fn, tn))
|
f1_scores.append(f1_from_counters(tp, fp, fn, tn))
|
||||||
counters.append((tp, fp, fn, tn))
|
counters.append((tp, fp, fn, tn))
|
||||||
|
|
|
||||||
|
|
@ -15,7 +15,7 @@ import os
|
||||||
# TODO: sentence length (Mendenhall-style) ?
|
# TODO: sentence length (Mendenhall-style) ?
|
||||||
|
|
||||||
|
|
||||||
for epistola in [1]:
|
for epistola in [2]:
|
||||||
|
|
||||||
print('Epistola {}'.format(epistola))
|
print('Epistola {}'.format(epistola))
|
||||||
print('='*80)
|
print('='*80)
|
||||||
|
|
@ -26,7 +26,7 @@ for epistola in [1]:
|
||||||
paragraphs = range(14, 91)
|
paragraphs = range(14, 91)
|
||||||
|
|
||||||
target = [f'EpistolaXIII_{epistola}.txt'] + [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs]
|
target = [f'EpistolaXIII_{epistola}.txt'] + [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs]
|
||||||
positive, negative, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target)
|
positive, negative, _, _, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target)
|
||||||
|
|
||||||
pickle_file = f'../dante_color/epistola{epistola}.pkl'
|
pickle_file = f'../dante_color/epistola{epistola}.pkl'
|
||||||
if os.path.exists(pickle_file):
|
if os.path.exists(pickle_file):
|
||||||
|
|
@ -35,6 +35,7 @@ for epistola in [1]:
|
||||||
for prob,text in zip(probabilities,ep_texts):
|
for prob,text in zip(probabilities,ep_texts):
|
||||||
text = text.replace('\n','')
|
text = text.replace('\n','')
|
||||||
print(f"{prob:.3f}:{text}")
|
print(f"{prob:.3f}:{text}")
|
||||||
|
print(f'media={np.asarray(probabilities[1:]).mean()}')
|
||||||
else:
|
else:
|
||||||
print(f'generating pickle file')
|
print(f'generating pickle file')
|
||||||
n_full_docs = len(positive) + len(negative)
|
n_full_docs = len(positive) + len(negative)
|
||||||
|
|
@ -50,6 +51,7 @@ for epistola in [1]:
|
||||||
split_documents=True, split_policy=split_by_sentences, window_size=3,
|
split_documents=True, split_policy=split_by_sentences, window_size=3,
|
||||||
normalize_features=True)
|
normalize_features=True)
|
||||||
|
|
||||||
|
|
||||||
Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative)
|
Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative)
|
||||||
print(ytr)
|
print(ytr)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -15,7 +15,7 @@ for epistola in [1,2,3]: #3 means "both Ep1 and Ep2 corpora"
|
||||||
paragraphs = range(1, 6)
|
paragraphs = range(1, 6)
|
||||||
|
|
||||||
target = [f'Epistola_ArigoVII.txt'] + [f'Epistola_ArigoVII_{paragraph}.txt' for paragraph in paragraphs]
|
target = [f'Epistola_ArigoVII.txt'] + [f'Epistola_ArigoVII_{paragraph}.txt' for paragraph in paragraphs]
|
||||||
positive, negative, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
|
positive, negative, _, _, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
|
||||||
|
|
||||||
pickle_file = f'../dante_color/epistola{epistola}_xiv.pkl'
|
pickle_file = f'../dante_color/epistola{epistola}_xiv.pkl'
|
||||||
if os.path.exists(pickle_file):
|
if os.path.exists(pickle_file):
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,8 @@ from os.path import join
|
||||||
import re
|
import re
|
||||||
import collections
|
import collections
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------------
|
# ------------------------------------------------------------------------
|
||||||
# document loading routine
|
# document loading routine
|
||||||
# ------------------------------------------------------------------------
|
# ------------------------------------------------------------------------
|
||||||
|
|
@ -30,7 +32,8 @@ def remove_citations(doc):
|
||||||
|
|
||||||
def load_texts(path, positive_author='Dante', unknown_target=None, train_skip_prefix='EpistolaXIII_'):
|
def load_texts(path, positive_author='Dante', unknown_target=None, train_skip_prefix='EpistolaXIII_'):
|
||||||
# load the training data (all documents but Epistolas 1 and 2)
|
# load the training data (all documents but Epistolas 1 and 2)
|
||||||
positive,negative = [],[]
|
positive, negative = [], []
|
||||||
|
files_positive, files_negative = [], []
|
||||||
authors = []
|
authors = []
|
||||||
ndocs=0
|
ndocs=0
|
||||||
for file in os.listdir(path):
|
for file in os.listdir(path):
|
||||||
|
|
@ -42,8 +45,10 @@ def load_texts(path, positive_author='Dante', unknown_target=None, train_skip_pr
|
||||||
|
|
||||||
if author == positive_author:
|
if author == positive_author:
|
||||||
positive.append(text)
|
positive.append(text)
|
||||||
|
files_positive.append(file)
|
||||||
else:
|
else:
|
||||||
negative.append(text)
|
negative.append(text)
|
||||||
|
files_negative.append(file)
|
||||||
authors.append(author)
|
authors.append(author)
|
||||||
ndocs+=1
|
ndocs+=1
|
||||||
|
|
||||||
|
|
@ -57,10 +62,10 @@ def load_texts(path, positive_author='Dante', unknown_target=None, train_skip_pr
|
||||||
unknown = remove_citations(unknown)
|
unknown = remove_citations(unknown)
|
||||||
unknowns.append(unknown)
|
unknowns.append(unknown)
|
||||||
if len(unknowns) == 1: unknowns = unknowns[0]
|
if len(unknowns) == 1: unknowns = unknowns[0]
|
||||||
return positive, negative, unknowns
|
return positive, negative, files_positive, files_negative, unknowns
|
||||||
|
|
||||||
else:
|
else:
|
||||||
return positive, negative
|
return positive, negative, files_positive, files_negative
|
||||||
|
|
||||||
|
|
||||||
def ___list_texts(path):
|
def ___list_texts(path):
|
||||||
|
|
|
||||||
|
|
@ -80,7 +80,7 @@ class AuthorshipVerificator:
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def leave_one_out(self, X, y, groups=None, test_lowest_index_only=True, counters=False):
|
def leave_one_out(self, X, y, files, groups=None, test_lowest_index_only=True, counters=False):
|
||||||
|
|
||||||
if groups is None:
|
if groups is None:
|
||||||
print('Computing LOO without groups')
|
print('Computing LOO without groups')
|
||||||
|
|
@ -94,7 +94,10 @@ class AuthorshipVerificator:
|
||||||
folds = [(train, np.min(test, keepdims=True)) for train, test in folds]
|
folds = [(train, np.min(test, keepdims=True)) for train, test in folds]
|
||||||
|
|
||||||
scores = cross_val_score(self.estimator, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
|
scores = cross_val_score(self.estimator, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
|
||||||
|
missclassified = '\n'.join(files[scores==0].tolist())
|
||||||
print(scores)
|
print(scores)
|
||||||
|
print(missclassified)
|
||||||
|
|
||||||
if counters and test_lowest_index_only:
|
if counters and test_lowest_index_only:
|
||||||
yfull_true = y[:len(folds)]
|
yfull_true = y[:len(folds)]
|
||||||
yfull_predict = np.zeros_like(yfull_true)
|
yfull_predict = np.zeros_like(yfull_true)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue