cleaning
This commit is contained in:
parent
13d9a5ed57
commit
67adaa441c
|
|
@ -1,3 +1,4 @@
|
||||||
|
from mpl_toolkits.axes_grid1 import make_axes_locatable
|
||||||
from sklearn.linear_model import LogisticRegression
|
from sklearn.linear_model import LogisticRegression
|
||||||
from data.dante_loader import load_texts
|
from data.dante_loader import load_texts
|
||||||
from data.features import *
|
from data.features import *
|
||||||
|
|
@ -6,60 +7,104 @@ import numpy as np
|
||||||
import matplotlib
|
import matplotlib
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
def plot_attribution(path, authors, attributions, paragraph_offset=1):
|
def plot_attribution(path, authors, attributions, paragraph_offset=1, figsize=(5,5), label_offset=0.3):
|
||||||
|
|
||||||
paragraphs = ["Full"] + [f'{paragraph_offset+i}' for i in range(attributions.shape[1]-1)]
|
attributions = attributions.T
|
||||||
|
print(attributions.shape)
|
||||||
|
# attributions=attributions>0.5
|
||||||
|
paragraphs = ["Full"] + [f'{paragraph_offset+i}' for i in range(attributions.shape[0]-1)]
|
||||||
|
|
||||||
fig, ax = plt.subplots()
|
fig, ax = plt.subplots(figsize=figsize)
|
||||||
im = ax.imshow(attributions)
|
|
||||||
|
# im = ax.imshow(attributions, vmin=0, vmax=1, cmap='Greens')
|
||||||
|
im = ax.imshow(attributions, vmin=0, vmax=1, cmap='Greys')
|
||||||
|
|
||||||
|
# Create colorbar
|
||||||
|
# cbar = fig.colorbar(im, ax=ax, orientation="horizontal", fraction=0.1, pad=0.04)
|
||||||
|
# ax.figure.colorbar(im, ax=ax, orientation="horizontal", fraction=0.1)
|
||||||
|
# ax.figure.colorbar(im, ax=ax, orientation="horizontal", pad=0.05)
|
||||||
|
|
||||||
|
# cbar.ax.set_ylabel(cbarlabel, rotation=-90, va="bottom")
|
||||||
|
|
||||||
# We want to show all ticks...
|
# We want to show all ticks...
|
||||||
ax.set_xticks(np.arange(len(paragraphs)))
|
# ax.set_xticks(np.arange(len(authors)))
|
||||||
ax.set_yticks(np.arange(len(authors)))
|
ax.set_xticks(np.arange(len(authors) + 0) + label_offset)
|
||||||
|
ax.set_yticks(np.arange(len(paragraphs)))
|
||||||
# ... and label them with the respective list entries
|
# ... and label them with the respective list entries
|
||||||
ax.set_xticklabels(paragraphs)
|
ax.set_xticklabels(authors)
|
||||||
ax.set_yticklabels(authors)
|
ax.set_yticklabels(paragraphs)
|
||||||
|
|
||||||
|
ax.tick_params(top=False, bottom=False,
|
||||||
|
labeltop=True, labelbottom=False)
|
||||||
|
|
||||||
# Rotate the tick labels and set their alignment.
|
# Rotate the tick labels and set their alignment.
|
||||||
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
|
plt.setp(ax.get_xticklabels(), rotation=90, ha="left", rotation_mode="anchor")
|
||||||
rotation_mode="anchor")
|
|
||||||
|
for edge, spine in ax.spines.items():
|
||||||
|
spine.set_visible(False)
|
||||||
|
|
||||||
|
ax.set_xticks(np.arange(len(authors)+1) - .5, minor=True)
|
||||||
|
ax.set_yticks(np.arange(len(paragraphs)+1) - .5, minor=True)
|
||||||
|
|
||||||
|
ax.grid(which="minor", color="k", linestyle='-', linewidth=1)
|
||||||
|
ax.tick_params(which="minor", bottom=False, left=False)
|
||||||
|
|
||||||
# Loop over data dimensions and create text annotations.
|
# Loop over data dimensions and create text annotations.
|
||||||
for i in range(len(authors)):
|
# for i in range(len(authors)):
|
||||||
for j in range(len(paragraphs)):
|
# for j in range(len(paragraphs)):
|
||||||
text = ax.text(j, i, f'{attributions[i, j]:.2f}', ha="center", va="center", color="w")
|
# text = ax.text(j, i, f'{attributions[i, j]:.2f}', ha="center", va="center", color="w")
|
||||||
|
|
||||||
ax.set_title("Attribution matrix")
|
# ax.set_title("Attribution matrix")
|
||||||
fig.tight_layout()
|
fig.tight_layout()
|
||||||
# plt.show()
|
# plt.show()
|
||||||
plt.savefig(path)
|
plt.savefig(path)
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
|
for epistola in [1]:
|
||||||
attributions = np.load('attribution_ep1.npy')
|
if epistola == 1:
|
||||||
plot_attribution('plot1.pdf', authors, attributions)
|
authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
|
||||||
|
paragraph_offset = 1
|
||||||
|
figsize=(3,9)
|
||||||
|
label_offset=0.2
|
||||||
|
|
||||||
|
else:
|
||||||
|
authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna',
|
||||||
|
'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
|
||||||
|
'GrazioloBambaglioli', 'GuidoDaPisa',
|
||||||
|
'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
|
||||||
|
'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet',
|
||||||
|
'PietroAlighieri', 'RaimundusLullus',
|
||||||
|
'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
|
||||||
|
paragraph_offset = 14
|
||||||
|
figsize = (6,20)
|
||||||
|
label_offset=0.3
|
||||||
|
|
||||||
|
attributions = np.load(f'attribution_ep{epistola}.npy')
|
||||||
|
plot_attribution(f'plot{epistola}.png', authors, attributions, paragraph_offset=paragraph_offset, figsize=figsize, label_offset=label_offset)
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
||||||
author_attribution = []
|
|
||||||
for epistola in [1]:
|
for epistola in [1]:
|
||||||
|
|
||||||
|
author_attribution = []
|
||||||
print(f'Epistola {epistola}')
|
print(f'Epistola {epistola}')
|
||||||
print('='*80)
|
print('='*80)
|
||||||
path = f'../testi_{epistola}'
|
path = f'../testi_{epistola}'
|
||||||
|
|
||||||
if epistola == 1:
|
if epistola == 1:
|
||||||
authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
|
authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
|
||||||
paragraphs = range(1,3)
|
paragraphs = range(1,14)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna', 'ClaraAssisiensis',
|
authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna',
|
||||||
'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
|
'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
|
||||||
'GrazioloBambaglioli', 'GuidoDaPisa',
|
'GrazioloBambaglioli', 'GuidoDaPisa',
|
||||||
'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
|
'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
|
||||||
'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet', 'PierDellaVigna',
|
'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet',
|
||||||
'PietroAlighieri', 'RaimundusLullus',
|
'PietroAlighieri', 'RaimundusLullus',
|
||||||
'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
|
'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
|
||||||
paragraphs = range(13, 90)
|
paragraphs = range(14, 91)
|
||||||
|
assert len(authors)==20, f'unexpected number of authors ({len(authors)})'
|
||||||
|
path+='_tutti'
|
||||||
|
|
||||||
discarded = 0
|
discarded = 0
|
||||||
f1_scores = []
|
f1_scores = []
|
||||||
|
|
@ -72,9 +117,9 @@ for epistola in [1]:
|
||||||
|
|
||||||
target = [f'EpistolaXIII_{epistola}.txt'] + [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs]
|
target = [f'EpistolaXIII_{epistola}.txt'] + [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs]
|
||||||
positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=target)
|
positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=target)
|
||||||
if len(positive) < 2:
|
# if len(positive) < 2:
|
||||||
discarded += 1
|
# discarded += 1
|
||||||
continue
|
# continue
|
||||||
|
|
||||||
n_full_docs = len(positive) + len(negative)
|
n_full_docs = len(positive) + len(negative)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,152 @@
|
||||||
|
from mpl_toolkits.axes_grid1 import make_axes_locatable
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from data.dante_loader import load_texts
|
||||||
|
from data.features import *
|
||||||
|
from model import AuthorshipVerificator, f1_from_counters
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
def plot_attribution(path, authors, attributions, paragraph_offset=1, figsize=(5,5), label_offset=0.3):
    """Draw an attribution matrix as a grey-scale grid and write it to ``path``.

    The matrix is transposed before drawing, so every row of the picture is a
    text unit (the first one labelled "Full", the rest numbered from
    ``paragraph_offset``) and every column is an author.

    Args:
        path: Destination passed to ``Figure.savefig``.
        authors: Column labels; expected to hold one entry per row of
            ``attributions`` as passed in (i.e. before the transpose).
        attributions: 2-D array; axis 0 is matched against ``authors`` and
            axis 1 against the text units.
        paragraph_offset: Number printed for the first row after "Full".
        figsize: Figure size forwarded to ``plt.subplots``.
        label_offset: Shift, in cell units, added to the author tick
            positions along the x axis.
    """
    # After the transpose: rows are text units, columns are authors.
    attributions = attributions.T
    print(attributions.shape)
    paragraphs = ["Full"] + [f'{paragraph_offset+i}' for i in range(attributions.shape[0]-1)]

    fig, ax = plt.subplots(figsize=figsize)

    # vmin/vmax pin the colour scale to 0..1 instead of the data range.
    ax.imshow(attributions, vmin=0, vmax=1, cmap='Greys')

    # Major ticks only carry the labels; the x ticks are shifted by
    # ``label_offset`` relative to the cell centres.
    ax.set_xticks(np.arange(len(authors)) + label_offset)
    ax.set_yticks(np.arange(len(paragraphs)))
    ax.set_xticklabels(authors)
    ax.set_yticklabels(paragraphs)

    # Author names go on top of the grid, written vertically.
    ax.tick_params(top=False, bottom=False,
                   labeltop=True, labelbottom=False)
    plt.setp(ax.get_xticklabels(), rotation=90, ha="left", rotation_mode="anchor")

    for spine in ax.spines.values():
        spine.set_visible(False)

    # Minor ticks sit on the cell borders (half-integer positions); the grid
    # drawn on them outlines every cell.
    ax.set_xticks(np.arange(len(authors)+1) - .5, minor=True)
    ax.set_yticks(np.arange(len(paragraphs)+1) - .5, minor=True)
    ax.grid(which="minor", color="k", linestyle='-', linewidth=1)
    ax.tick_params(which="minor", bottom=False, left=False)

    fig.tight_layout()
    try:
        fig.savefig(path)
    finally:
        # Release the figure even if saving fails; otherwise every call
        # leaves one more open figure registered with pyplot.
        plt.close(fig)
|
||||||
|
|
||||||
|
import sys

# Candidate authors of the first corpus ('Dante' is prepended where needed).
authors1 = ['ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
# Candidate authors of the second corpus.
authors2 = ['BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna',
            'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
            'GrazioloBambaglioli', 'GuidoDaPisa',
            'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
            'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet',
            'PietroAlighieri', 'RaimundusLullus',
            'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
# Sorted union of both lists, without duplicates.
authors3 = sorted(set(authors1 + authors2))

# Plot the attribution matrices that a previous run saved as .npy files.
for epistola in [1]:
    paragraph_offset = 1
    label_offset = 0.2
    if epistola == 1:
        authors = ['Dante'] + authors1
        figsize = (4, 4)
    elif epistola == 2:
        authors = ['Dante'] + authors2
        figsize = (6, 4)
    else:
        authors = ['Dante'] + authors3
        # This branch used to leave `figsize` unassigned, which raised a
        # NameError at the plot_attribution call below.
        # NOTE(review): the size is copied from the epistola-2 case because
        # the author list is of similar length -- adjust if needed.
        figsize = (6, 4)

    attributions = np.load(f'attribution_ep{epistola}_xiv.npy')
    plot_attribution(f'plot{epistola}_xiv.png', authors, attributions, paragraph_offset=paragraph_offset, figsize=figsize, label_offset=label_offset)

# Terminates the script here, so none of the statements that follow in this
# file are executed while this call is in place.
sys.exit(0)
|
||||||
|
|
||||||
|
# For every candidate author, train a verifier on the corpus and score the
# target letter (full text plus its paragraphs); the resulting
# authors x texts matrix is saved as a .npy file.
for epistola in [1]:

    author_attribution = []
    print(f'Epistola {epistola}')
    print('='*80)
    path = f'../testiXIV_{epistola}'

    if epistola == 1:
        authors = ['Dante'] + authors1
    elif epistola == 2:
        authors = ['Dante'] + authors2
        path += '_tutti'
    else:
        authors = ['Dante'] + authors3

    # File names of the texts to attribute: the whole letter first, then
    # paragraphs 1..5. The list does not depend on the author, so it is
    # built once per corpus.
    target = ['Epistola_ArigoVII.txt'] + [f'Epistola_ArigoVII_{paragraph}.txt' for paragraph in range(1,6)]

    for i, author in enumerate(authors):
        print('=' * 80)
        print('Authorship Identification for {} (complete {}/{})'.format(author, i, len(authors)))
        print('Corpus of Epistola {}'.format(epistola))
        print('=' * 80)

        positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=target, train_skip_prefix='Epistola_ArigoVII')

        feature_extractor = FeatureExtractor(function_words_freq='latin',
                                             conjugations_freq='latin',
                                             features_Mendenhall=True,
                                             features_sentenceLengths=True,
                                             tfidf_feat_selection_ratio=0.1,
                                             wordngrams=True, n_wordngrams=(1, 2),
                                             charngrams=True, n_charngrams=(3, 4, 5),
                                             preserve_punctuation=False,
                                             split_documents=True, split_policy=split_by_sentences, window_size=3,
                                             normalize_features=True)

        Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)

        print('Fitting the Verificator')
        av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name=author)
        av.fit(Xtr, ytr, groups)

        # One probability per target text for the current author. The index
        # has its own name so it no longer overwrites the author index `i`.
        attributions = []
        for text_idx, target_text in enumerate(ep_texts):
            ep = feature_extractor.transform(target_text, avoid_splitting=True)
            prob, _ = av.predict_proba(ep, epistola_name=target[text_idx])
            attributions.append(prob)
        author_attribution.append(attributions)

    author_attribution = np.asarray(author_attribution)
    attribution_path = f'attribution_ep{epistola}_xiv.npy'
    print(f'saving attribution matrix of shape {author_attribution.shape} in {attribution_path}')
    np.save(attribution_path, author_attribution)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -0,0 +1,77 @@
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from data.dante_loader import load_texts
|
||||||
|
from data.features import *
|
||||||
|
from model import AuthorshipVerificator, f1_from_counters
|
||||||
|
from sklearn.svm import LinearSVC, SVC
|
||||||
|
from util.color_visualization import color
|
||||||
|
import pickle
|
||||||
|
import os
|
||||||
|
|
||||||
|
# For each corpus, obtain the probability assigned to 'Dante' for the target
# letter and its paragraphs (cached in a pickle file), then hand them to
# `color` together with the texts and an .html output path.
for epistola in [1,2,3]: #3 means "both Ep1 and Ep2 corpora"

    print('Epistola {}'.format(epistola))
    print('='*80)
    path = '../testiXIV_{}'.format(epistola)
    paragraphs = range(1, 6)
    if epistola==2:
        path+='_tutti'

    # Whole letter first, then one file per paragraph.
    target = ['Epistola_ArigoVII.txt'] + [f'Epistola_ArigoVII_{paragraph}.txt' for paragraph in paragraphs]
    positive, negative, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target, train_skip_prefix='Epistola_ArigoVII')

    pickle_file = f'../dante_color/epistola{epistola}_xiv.pkl'
    if os.path.exists(pickle_file):
        print(f'loading pickle file {pickle_file}')
        # The cache is produced by the else-branch below; unpickling must
        # only ever be pointed at files written by this script.
        with open(pickle_file, 'rb') as cache:
            probabilities = pickle.load(cache)
    else:
        print('generating pickle file')

        feature_extractor = FeatureExtractor(function_words_freq='latin',
                                             conjugations_freq='latin',
                                             features_Mendenhall=True,
                                             features_sentenceLengths=True,
                                             tfidf_feat_selection_ratio=0.1,
                                             wordngrams=True, n_wordngrams=(1, 2),
                                             charngrams=True, n_charngrams=(3, 4, 5),
                                             preserve_punctuation=False,
                                             split_documents=True, split_policy=split_by_sentences, window_size=3,
                                             normalize_features=True)

        Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative)
        print(ytr)

        print('Fitting the Verificator')
        av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name='Dante')
        av.fit(Xtr,ytr,groups)

        probabilities = []
        for i, target_text in enumerate(ep_texts):
            ep = feature_extractor.transform(target_text, avoid_splitting=True)
            prob, _ = av.predict_proba(ep, epistola_name=target[i])
            probabilities.append(prob)

        # `with` closes (and flushes) the cache file as soon as it is written.
        with open(pickle_file, 'wb') as cache:
            pickle.dump(probabilities, cache, pickle.HIGHEST_PROTOCOL)

    color(path=f'../dante_color/epistola{epistola}_xiv.html', texts=ep_texts,
          probabilities=probabilities, title=f'Epistola {epistola}',
          paragraph_offset=paragraphs[0])
|
||||||
|
|
||||||
|
|
||||||
|
# print('Predicting the Epistola {}'.format(epistola))
|
||||||
|
# title = 'Epistola {}'.format('I' if epistola==1 else 'II')
|
||||||
|
# av.predict(ep, title)
|
||||||
|
# fulldoc_prob, fragment_probs = av.predict_proba(ep, title)
|
||||||
|
# color(path='../dante_color/epistola{}.html'.format(epistola), texts=ep_fragments, probabilities=fragment_probs, title=title)
|
||||||
|
|
||||||
|
# score_ave, score_std = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=False)
|
||||||
|
# print('LOO[full-and-fragments]={:.3f} +-{:.5f}'.format(score_ave, score_std))
|
||||||
|
|
||||||
|
# score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=True, counters=True)
|
||||||
|
# print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std))
|
||||||
|
# f1_ = f1_from_counters(tp, fp, fn, tn)
|
||||||
|
# print('F1 = {:.3f}'.format(f1_))
|
||||||
|
|
||||||
|
# score_ave, score_std = av.leave_one_out(Xtr, ytr, None)
|
||||||
|
# print('LOO[w/o groups]={:.3f} +-{:.5f}'.format(score_ave, score_std))
|
||||||
|
|
||||||
|
|
@ -0,0 +1,35 @@
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
|
||||||
|
# Colormaps to preview, grouped by category as in
# http://matplotlib.org/examples/color/colormaps_reference.html
cmaps = [
    ('Diverging', ['RdYlGn', 'RdYlGn']),
]

# Number of rows to draw: the length of the longest colormap list.
nrows = max(len(names) for _, names in cmaps)

# Two identical rows of 256 samples running from 0.25 to 0.75; shown as an
# image, this displays only the middle half of a colormap.
gradient = np.vstack([np.linspace(0.25, 0.75, 256)] * 2)
|
||||||
|
|
||||||
|
|
||||||
|
def plot_color_gradients(cmap_category, cmap_list, nrows):
    """Draw one horizontal colour strip per colormap name in ``cmap_list``.

    Args:
        cmap_category: Category name; used in the title of the first strip.
        cmap_list: Colormap names understood by ``plt.get_cmap``.
        nrows: Number of stacked axes to create. Axes beyond
            ``len(cmap_list)`` stay empty; names beyond ``nrows`` are not
            drawn (``zip`` stops at the shorter sequence).
    """
    # squeeze=False always returns a 2-D array of axes, so a single row
    # (nrows == 1) is indexable and iterable like any other case.
    fig, axes = plt.subplots(nrows=nrows, squeeze=False)
    axes = axes.ravel()
    fig.subplots_adjust(top=0.95, bottom=0.01, left=0.2, right=0.99)
    axes[0].set_title(cmap_category + ' colormaps', fontsize=14)

    for ax, name in zip(axes, cmap_list):
        ax.imshow(gradient, aspect='auto', cmap=plt.get_cmap(name))
        # Put the colormap name just left of the strip, vertically centred,
        # using the axes bounds (x0, y0, width, height).
        x0, y0, _, height = ax.get_position().bounds
        fig.text(x0 - 0.01, y0 + height/2., name, va='center', ha='right', fontsize=10)

    # Turn off *all* ticks & spines, not just the ones with colormaps.
    for ax in axes:
        ax.set_axis_off()
|
||||||
|
|
||||||
|
|
||||||
|
# Build one figure per (category, colormap names) entry ...
for entry in cmaps:
    plot_color_gradients(*entry, nrows)

# ... and display them all.
plt.show()
|
||||||
|
|
@ -0,0 +1,59 @@
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from data.dante_loader import load_texts
|
||||||
|
from data.features import *
|
||||||
|
from model import AuthorshipVerificator, f1_from_counters
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
# Train a verifier for 'Dante' on the corpus and list the features with the
# largest and the smallest coefficients of the fitted estimator.
for epistola in [2]:

    print(f'Epistola {epistola}')
    print('='*80)
    path = f'../testi_{epistola}'
    if epistola==2: path+='_tutti'

    author = 'Dante'
    print('=' * 80)
    print('Corpus of Epistola {}'.format(epistola))
    print('=' * 80)

    positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=f'EpistolaXIII_{epistola}.txt')

    feature_extractor = FeatureExtractor(function_words_freq='latin',
                                         conjugations_freq='latin',
                                         features_Mendenhall=True,
                                         features_sentenceLengths=True,
                                         tfidf_feat_selection_ratio=0.1,
                                         wordngrams=True, n_wordngrams=(1, 2),
                                         charngrams=True, n_charngrams=(3, 4, 5),
                                         preserve_punctuation=False,
                                         split_documents=True, split_policy=split_by_sentences, window_size=3,
                                         normalize_features=True)

    Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)

    print('Fitting the Verificator')
    av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name=author)
    av.fit(Xtr, ytr, groups)

    # Feature indices sorted by coefficient, ascending: the most negative
    # weights come first, the most positive last.
    feat_rank = np.argsort(av.estimator.coef_[0])
    coef_ordered = av.estimator.coef_[0][feat_rank]
    feat_name_ordered = feature_extractor.feature_names[feat_rank]

    # Show at most 100 entries per list; the cap avoids an IndexError when
    # the model has fewer features than that.
    top_k = min(100, len(feat_rank))

    print('Most Dantesque features::')
    # Reverse once (not on every iteration) to walk from the largest weight.
    names_desc = feat_name_ordered[::-1]
    coefs_desc = coef_ordered[::-1]
    for i in range(top_k):
        print(f'{i}: {names_desc[i]} {coefs_desc[i]:.3f}')

    print('\nMost Non-Dantesque features::')
    for i in range(top_k):
        print(f'{i}: {feat_name_ordered[i]} {coef_ordered[i]:.3f}')

print('done')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Loading…
Reference in New Issue