Alejandro Moreo Fernandez 2020-04-01 17:17:56 +02:00
parent b4796f4882
commit b1376026c4
15 changed files with 286 additions and 1112 deletions

View File

@@ -1,159 +0,0 @@
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.linear_model import LogisticRegression
from data.dante_loader import load_texts
from data.features import *
from model import AuthorshipVerificator, f1_from_counters
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
def plot_attribution(path, authors, attributions, paragraph_offset=1, figsize=(5,5), label_offset=0.3):
attributions = attributions.T
print(attributions.shape)
# attributions=attributions>0.5
paragraphs = ["Full"] + [f'{paragraph_offset+i}' for i in range(attributions.shape[0]-1)]
fig, ax = plt.subplots(figsize=figsize)
# im = ax.imshow(attributions, vmin=0, vmax=1, cmap='Greens')
im = ax.imshow(attributions, vmin=0, vmax=1, cmap='Greys')
# Create colorbar
# cbar = fig.colorbar(im, ax=ax, orientation="horizontal", fraction=0.1, pad=0.04)
# ax.figure.colorbar(im, ax=ax, orientation="horizontal", fraction=0.1)
# ax.figure.colorbar(im, ax=ax, orientation="horizontal", pad=0.05)
# cbar.ax.set_ylabel(cbarlabel, rotation=-90, va="bottom")
# We want to show all ticks...
# ax.set_xticks(np.arange(len(authors)))
ax.set_xticks(np.arange(len(authors) + 0) + label_offset)
ax.set_yticks(np.arange(len(paragraphs)))
# ... and label them with the respective list entries
ax.set_xticklabels(authors)
ax.set_yticklabels(paragraphs)
ax.tick_params(top=False, bottom=False,
labeltop=True, labelbottom=False)
# Rotate the tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=90, ha="left", rotation_mode="anchor")
for edge, spine in ax.spines.items():
spine.set_visible(False)
ax.set_xticks(np.arange(len(authors)+1) - .5, minor=True)
ax.set_yticks(np.arange(len(paragraphs)+1) - .5, minor=True)
ax.grid(which="minor", color="k", linestyle='-', linewidth=1)
ax.tick_params(which="minor", bottom=False, left=False)
# Loop over data dimensions and create text annotations.
# for i in range(len(authors)):
# for j in range(len(paragraphs)):
# text = ax.text(j, i, f'{attributions[i, j]:.2f}', ha="center", va="center", color="w")
# ax.set_title("Attribution matrix")
fig.tight_layout()
# plt.show()
plt.savefig(path)
# import sys
# for epistola in [1]:
# if epistola == 1:
# authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
# paragraph_offset = 1
# figsize=(3,9)
# label_offset=0.2
#
# else:
# authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna',
# 'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
# 'GrazioloBambaglioli', 'GuidoDaPisa',
# 'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
# 'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet',
# 'PietroAlighieri', 'RaimundusLullus',
# 'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
# paragraph_offset = 14
# figsize = (6,20)
# label_offset=0.3
#
# attributions = np.load(f'attribution_ep{epistola}.npy')
# plot_attribution(f'plot{epistola}.png', authors, attributions, paragraph_offset=paragraph_offset, figsize=figsize, label_offset=label_offset)
# sys.exit(0)
for epistola in [1]:
author_attribution = []
print(f'Epistola {epistola}')
print('='*80)
path = f'../testi_{epistola}'
if epistola == 1:
authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
paragraphs = range(1,14)
else:
authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna',
'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
'GrazioloBambaglioli', 'GuidoDaPisa',
'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet',
'PietroAlighieri', 'RaimundusLullus',
'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
paragraphs = range(14, 91)
assert len(authors)==20, f'unexpected number of authors ({len(authors)})'
discarded = 0
f1_scores = []
counters = []
for i, author in enumerate(authors):
print('=' * 80)
print('Authorship Identification for {} (complete {}/{})'.format(author, i, len(authors)))
print('Corpus of Epistola {}'.format(epistola))
print('=' * 80)
target = [f'EpistolaXIII_{epistola}.txt'] + [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs]
positive, negative, _, _, ep_texts = load_texts(path, positive_author=author, unknown_target=target)
# if len(positive) < 2:
# discarded += 1
# continue
n_full_docs = len(positive) + len(negative)
feature_extractor = FeatureExtractor(function_words_freq='latin',
conjugations_freq='latin',
features_Mendenhall=True,
features_sentenceLengths=True,
tfidf_feat_selection_ratio=0.1,
wordngrams=True, n_wordngrams=(1, 2),
charngrams=True, n_charngrams=(3, 4, 5),
preserve_punctuation=False,
split_documents=True, split_policy=split_by_sentences, window_size=3,
normalize_features=True)
Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
print('Fitting the Verificator')
av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name=author)
av.fit(Xtr, ytr, groups)
attributions=[]
for i,target_text in enumerate(ep_texts):
ep = feature_extractor.transform(target_text, avoid_splitting=True)
prob,_ = av.predict_proba(ep, epistola_name=target[i])
attributions.append(prob)
author_attribution.append(attributions)
author_attribution = np.asarray(author_attribution)
attribution_path = f'attribution_ep{epistola}.npy'
print(f'saving attribution matrix of shape {author_attribution.shape} in {attribution_path}')
np.save(attribution_path, author_attribution)
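For reference, a minimal round-trip of the matrix saved above (not from the commit; it assumes the plot_attribution definition and imports above are in scope, and mirrors the commented-out epistola == 1 branch):
authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
attributions = np.load('attribution_ep1.npy')            # the (n_authors, n_targets) matrix saved above
plot_attribution('plot1.png', authors, attributions,
                 paragraph_offset=1, figsize=(3, 9), label_offset=0.2)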

View File

@@ -1,151 +0,0 @@
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.linear_model import LogisticRegression
from data.dante_loader import load_texts
from data.features import *
from model import AuthorshipVerificator, f1_from_counters
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
def plot_attribution(path, authors, attributions, paragraph_offset=1, figsize=(5,5), label_offset=0.3):
attributions = attributions.T
print(attributions.shape)
# attributions=attributions>0.5
paragraphs = ["Full"] + [f'{paragraph_offset+i}' for i in range(attributions.shape[0]-1)]
fig, ax = plt.subplots(figsize=figsize)
# im = ax.imshow(attributions, vmin=0, vmax=1, cmap='Greens')
im = ax.imshow(attributions, vmin=0, vmax=1, cmap='Greys')
# Create colorbar
# cbar = fig.colorbar(im, ax=ax, orientation="horizontal", fraction=0.1, pad=0.04)
# ax.figure.colorbar(im, ax=ax, orientation="horizontal", fraction=0.1)
# ax.figure.colorbar(im, ax=ax, orientation="horizontal", pad=0.05)
# cbar.ax.set_ylabel(cbarlabel, rotation=-90, va="bottom")
# We want to show all ticks...
# ax.set_xticks(np.arange(len(authors)))
ax.set_xticks(np.arange(len(authors) + 0) + label_offset)
ax.set_yticks(np.arange(len(paragraphs)))
# ... and label them with the respective list entries
ax.set_xticklabels(authors)
ax.set_yticklabels(paragraphs)
ax.tick_params(top=False, bottom=False,
labeltop=True, labelbottom=False)
# Rotate the tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=90, ha="left", rotation_mode="anchor")
for edge, spine in ax.spines.items():
spine.set_visible(False)
ax.set_xticks(np.arange(len(authors)+1) - .5, minor=True)
ax.set_yticks(np.arange(len(paragraphs)+1) - .5, minor=True)
ax.grid(which="minor", color="k", linestyle='-', linewidth=1)
ax.tick_params(which="minor", bottom=False, left=False)
# Loop over data dimensions and create text annotations.
# for i in range(len(authors)):
# for j in range(len(paragraphs)):
# text = ax.text(j, i, f'{attributions[i, j]:.2f}', ha="center", va="center", color="w")
# ax.set_title("Attribution matrix")
fig.tight_layout()
# plt.show()
plt.savefig(path)
import sys
authors1 = ['ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
authors2 = ['BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna',
'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
'GrazioloBambaglioli', 'GuidoDaPisa',
'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet',
'PietroAlighieri', 'RaimundusLullus',
'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
authors3 = sorted(np.unique(authors1 + authors2).tolist())
for epistola in [1]:
paragraph_offset = 1
label_offset = 0.2
if epistola == 1:
authors = ['Dante'] + authors1
figsize = (4, 4)
elif epistola == 2:
authors = ['Dante'] + authors2
figsize = (6, 4)
else:
authors = ['Dante'] + authors3
attributions = np.load(f'attribution_ep{epistola}_xiv.npy')
plot_attribution(f'plot{epistola}_xiv.png', authors, attributions, paragraph_offset=paragraph_offset, figsize=figsize, label_offset=label_offset)
sys.exit(0)
for epistola in [1]:
author_attribution = []
print(f'Epistola {epistola}')
print('='*80)
path = f'../testiXIV_{epistola}'
if epistola == 1:
authors = ['Dante'] + authors1
elif epistola == 2:
authors = ['Dante'] + authors2
else:
authors = ['Dante'] + authors3
discarded = 0
f1_scores = []
counters = []
for i, author in enumerate(authors):
print('=' * 80)
print('Authorship Identification for {} (complete {}/{})'.format(author, i, len(authors)))
print('Corpus of Epistola {}'.format(epistola))
print('=' * 80)
target = [f'Epistola_ArigoVII.txt'] + [f'Epistola_ArigoVII_{paragraph}.txt' for paragraph in range(1,6)]
positive, negative, _, _, ep_texts = load_texts(path, positive_author=author, unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
n_full_docs = len(positive) + len(negative)
feature_extractor = FeatureExtractor(function_words_freq='latin',
conjugations_freq='latin',
features_Mendenhall=True,
features_sentenceLengths=True,
tfidf_feat_selection_ratio=0.1,
wordngrams=True, n_wordngrams=(1, 2),
charngrams=True, n_charngrams=(3, 4, 5),
preserve_punctuation=False,
split_documents=True, split_policy=split_by_sentences, window_size=3,
normalize_features=True)
Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
print('Fitting the Verificator')
av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name=author)
av.fit(Xtr, ytr, groups)
attributions=[]
for i,target_text in enumerate(ep_texts):
ep = feature_extractor.transform(target_text, avoid_splitting=True)
prob,_ = av.predict_proba(ep, epistola_name=target[i])
attributions.append(prob)
author_attribution.append(attributions)
author_attribution = np.asarray(author_attribution)
attribution_path = f'attribution_ep{epistola}_xiv.npy'
print(f'saving attribution matrix of shape {author_attribution.shape} in {attribution_path}')
np.save(attribution_path, author_attribution)

View File

@@ -1,42 +1,38 @@
from sklearn.linear_model import LogisticRegression
from data.dante_loader import load_texts
from data.dante_loader import load_latin_corpus, list_authors
from data.features import *
from model import AuthorshipVerificator, f1_from_counters
from model import AuthorshipVerificator
from util.evaluation import f1_from_counters
import argparse
AUTHORS_CORPUS_I = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
AUTHORS_CORPUS_II = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna', 'ClaraAssisiensis',
'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio', 'GrazioloBambaglioli', 'GuidoDaPisa',
'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia', 'IohannesDePlanoCarpini',
'IulianusDeSpira', 'NicolaTrevet', 'PierDellaVigna', 'PietroAlighieri', 'RaimundusLullus',
'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
for epistola in [1]:
if epistola==1:
authors = ['Dante','ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba','PierDellaVigna']
else:
authors = ['Dante', 'BeneFlorentinus','BenvenutoDaImola', 'BoncompagnoDaSigna', 'ClaraAssisiensis',
'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
'GrazioloBambaglioli', 'GuidoDaPisa',
'GuidoDeColumnis', 'GuidoFaba','IacobusDeVaragine','IohannesDeAppia',
'IohannesDePlanoCarpini','IulianusDeSpira', 'NicolaTrevet', 'PierDellaVigna',
'PietroAlighieri', 'RaimundusLullus',
'RyccardusDeSanctoGermano','ZonoDeMagnalis']
def main():
discarded = 0
f1_scores = []
counters = []
for i,author in enumerate(authors):
for i, author in enumerate(args.authors):
path = args.corpuspath
print('='*80)
print('Authorship Identification for {} (complete {}/{})'.format(author, i, len(authors)))
print('Corpus of Epistola {}'.format(epistola))
print(f'Authorship Identification for {author} (complete {i}/{len(args.authors)})')
print(f'Corpus {path}')
print('='*80)
path = '../testi_{}'.format(epistola)
if epistola==2:
path+='_interaEpistola'
positive, negative, pos_files, neg_files, ep_text = load_texts(
path, positive_author=author, unknown_target='EpistolaXIII_{}.txt'.format(epistola)
positive, negative, pos_files, neg_files, ep_text = load_latin_corpus(
path, positive_author=author, unknown_target=args.unknown
)
files = np.asarray(pos_files + neg_files)
if len(positive) < 2:
discarded+=1
discarded += 1
continue
n_full_docs = len(positive) + len(negative)
print(f'read {n_full_docs} documents from {path}')
feature_extractor = FeatureExtractor(
function_words_freq='latin',
@@ -53,33 +49,63 @@ for epistola in [1]:
normalize_features=True
)
Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative)
print(ytr)
ep, ep_fragments = feature_extractor.transform(ep_text, return_fragments=True, window_size=3)
print('Fitting the Verificator')
Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
av.fit(Xtr,ytr,groups)
if args.unknown:
print(f'Checking the hypothesis that {author} was the author of {args.unknown}')
ep, ep_fragments = feature_extractor.transform(ep_text, return_fragments=True, window_size=3)
score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(
Xtr, ytr, files, groups, test_lowest_index_only=True, counters=True
)
f1_scores.append(f1_from_counters(tp, fp, fn, tn))
counters.append((tp, fp, fn, tn))
print('F1 for {author} = {f1_scores[-1]:.3f}')
print('Fitting the Verificator')
av.fit(Xtr, ytr, groups)
av.predict_proba(ep, args.unknown)
if args.loo:
print('Validating the Verificator (Leave-One-Out)')
score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(
Xtr, ytr, files, groups, test_lowest_index_only=True, counters=True
)
f1_scores.append(f1_from_counters(tp, fp, fn, tn))
counters.append((tp, fp, fn, tn))
print(f'F1 for {author} = {f1_scores[-1]:.3f}')
if args.loo:
print(f'Computing macro- and micro-averages (discarded {discarded}/{len(args.authors)})')
f1_scores = np.array(f1_scores)
counters = np.array(counters)
macro_f1 = f1_scores.mean()
micro_f1 = f1_from_counters(*counters.sum(axis=0).tolist())
print(f'Macro-F1 = {macro_f1:.3f}')
print(f'Micro-F1 = {micro_f1:.3f}')
print()
print(f'Computing macro- and micro-averages (discarded {discarded}/{len(authors)})')
f1_scores = np.array(f1_scores)
counters = np.array(counters)
if __name__ == '__main__':
import os
macro_f1 = f1_scores.mean()
micro_f1 = f1_from_counters(*counters.sum(axis=0).tolist())
# Training settings
parser = argparse.ArgumentParser(description='Authorship verification for Epistola XIII')
parser.add_argument('corpuspath', type=str, metavar='PATH',
help=f'Path to the directory containing the corpus (documents must be named <author>_<textname>.txt)')
parser.add_argument('positive', type=str, nargs='?', default='Dante',
help='Positive author for the hypothesis (default "Dante"); set to "ALL" to check every author')
parser.add_argument('--loo', default=False, action='store_true',
help='submit each binary classifier to leave-one-out validation')
parser.add_argument('--unknown', type=str, default=None,
help='path to the file of unknown paternity (default None)')
print(f'Macro-F1 = {macro_f1:.3f}')
print(f'Micro-F1 = {micro_f1:.3f}')
print()
args = parser.parse_args()
if args.positive == 'ALL':
args.authors = list_authors(args.corpuspath, skip_prefix='Epistola')
else:
if (args.positive not in AUTHORS_CORPUS_I) and (args.positive not in AUTHORS_CORPUS_II):
print(f'warning: author {args.positive} is not in the known list of authors for CORPUS I or CORPUS II')
assert args.positive in list_authors(args.corpuspath, skip_prefix='Epistola'), 'unexpected author'
args.authors = [args.positive]
assert args.unknown or args.loo, 'error: neither an unknown document nor LOO validation has been requested. Nothing to do.'
assert args.unknown is None or os.path.exists(args.unknown), 'unknown file does not exist'
main()
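For reference, hypothetical invocations of the new CLI above (the script's filename is not shown in this view and the paths are placeholders; note the asserts require at least one of --loo or --unknown):
# python epistola_analysis.py ../testi_1 Dante --loo
# python epistola_analysis.py ../testi_1 ALL --loo --unknown ../testi_1/EpistolaXIII_1.txt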

View File

@@ -1,89 +0,0 @@
from sklearn.linear_model import LogisticRegression
from data.dante_loader import load_texts
from data.features import *
from model import AuthorshipVerificator, f1_from_counters
from sklearn.svm import LinearSVC, SVC
from util.color_visualization import color
import pickle
import os
# DONE: ngrams should contain punctuation marks according to Sapkota et al. [39] in the PAN 2015 overview
# (More recently, it was shown that character
# n-grams corresponding to word affixes and including punctuation marks are the most
# significant features in cross-topic authorship attribution [57].) #we have cancelled the
# TODO: inspect the impact of chi-squared correlations against positive-only (or positive and negative) correlations for feature selection
# TODO: sentence length (Mendenhall-style) ?
from src.data.features import FeatureExtractor
for epistola in [1,2]:
print('Epistola {}'.format(epistola))
print('='*80)
path = '../testi_{}'.format(epistola)
if epistola==1:
paragraphs = range(1, 14)
if epistola==2:
paragraphs = range(14, 91)
target = [f'EpistolaXIII_{epistola}.txt'] + [f'EpistolaXIII_{epistola}_new.txt'] + [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs]
positive, negative, _, _, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target)
pickle_file = f'../dante_color/epistola{epistola}.pkl'
if os.path.exists(pickle_file):
print(f'loading pickle file {pickle_file}')
probabilities = pickle.load(open(pickle_file, 'rb'))
for prob,text in zip(probabilities,ep_texts):
text = text.replace('\n','')
print(f"{prob:.3f}:{text}")
print(f'media={np.asarray(probabilities[1:]).mean()}')
else:
print(f'generating pickle file')
n_full_docs = len(positive) + len(negative)
feature_extractor = FeatureExtractor(function_words_freq='latin',
conjugations_freq='latin',
features_Mendenhall=True,
features_sentenceLengths=True,
tfidf_feat_selection_ratio=0.1,
wordngrams=True, n_wordngrams=(1, 2),
charngrams=True, n_charngrams=(3, 4, 5),
preserve_punctuation=False,
split_documents=True, split_policy=split_by_sentences, window_size=3,
normalize_features=True)
Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative)
print(ytr)
print('Fitting the Verificator')
av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name='Dante')
av.fit(Xtr,ytr,groups)
probabilities = []
for i, target_text in enumerate(ep_texts):
ep = feature_extractor.transform(target_text, avoid_splitting=True)
prob, _ = av.predict_proba(ep, epistola_name=target[i])
probabilities.append(prob)
pickle.dump(probabilities, open(pickle_file, 'wb'), pickle.HIGHEST_PROTOCOL)
color(path=f'../dante_color/epistola{epistola}.html', texts=ep_texts, probabilities=probabilities, title=f'Epistola {("I" if epistola==1 else "II")}', paragraph_offset=paragraphs[0])
# print('Predicting the Epistola {}'.format(epistola))
# title = 'Epistola {}'.format('I' if epistola==1 else 'II')
# av.predict(ep, title)
# fulldoc_prob, fragment_probs = av.predict_proba(ep, title)
# color(path='../dante_color/epistola{}.html'.format(epistola), texts=ep_fragments, probabilities=fragment_probs, title=title)
# score_ave, score_std = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=False)
# print('LOO[full-and-fragments]={:.3f} +-{:.5f}'.format(score_ave, score_std))
# score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=True, counters=True)
# print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std))
# f1_ = f1_from_counters(tp, fp, fn, tn)
# print('F1 = {:.3f}'.format(f1_))
# score_ave, score_std = av.leave_one_out(Xtr, ytr, None)
# print('LOO[w/o groups]={:.3f} +-{:.5f}'.format(score_ave, score_std))

View File

@@ -1,75 +0,0 @@
from sklearn.linear_model import LogisticRegression
from data.dante_loader import load_texts
from data.features import *
from model import AuthorshipVerificator, f1_from_counters
from sklearn.svm import LinearSVC, SVC
from util.color_visualization import color
import pickle
import os
for epistola in [1,2,3]: #3 means "both Ep1 and Ep2 corpora"
print('Epistola {}'.format(epistola))
print('='*80)
path = '../testiXIV_{}'.format(epistola)
paragraphs = range(1, 6)
target = [f'Epistola_ArigoVII.txt'] + [f'Epistola_ArigoVII_{paragraph}.txt' for paragraph in paragraphs]
positive, negative, _, _, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
pickle_file = f'../dante_color/epistola{epistola}_xiv.pkl'
if os.path.exists(pickle_file):
print(f'loading pickle file {pickle_file}')
probabilities = pickle.load(open(pickle_file, 'rb'))
else:
print(f'generating pickle file')
n_full_docs = len(positive) + len(negative)
feature_extractor = FeatureExtractor(function_words_freq='latin',
conjugations_freq='latin',
features_Mendenhall=True,
features_sentenceLengths=True,
tfidf_feat_selection_ratio=0.1,
wordngrams=True, n_wordngrams=(1, 2),
charngrams=True, n_charngrams=(3, 4, 5),
preserve_punctuation=False,
split_documents=True, split_policy=split_by_sentences, window_size=3,
normalize_features=True)
Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative)
print(ytr)
print('Fitting the Verificator')
av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name='Dante')
av.fit(Xtr,ytr,groups)
probabilities = []
for i, target_text in enumerate(ep_texts):
ep = feature_extractor.transform(target_text, avoid_splitting=True)
prob, _ = av.predict_proba(ep, epistola_name=target[i])
probabilities.append(prob)
pickle.dump(probabilities, open(pickle_file, 'wb'), pickle.HIGHEST_PROTOCOL)
color(path=f'../dante_color/epistola{epistola}_xiv.html', texts=ep_texts,
probabilities=probabilities, title=f'Epistola {epistola}',
paragraph_offset=paragraphs[0])
# print('Predicting the Epistola {}'.format(epistola))
# title = 'Epistola {}'.format('I' if epistola==1 else 'II')
# av.predict(ep, title)
# fulldoc_prob, fragment_probs = av.predict_proba(ep, title)
# color(path='../dante_color/epistola{}.html'.format(epistola), texts=ep_fragments, probabilities=fragment_probs, title=title)
# score_ave, score_std = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=False)
# print('LOO[full-and-fragments]={:.3f} +-{:.5f}'.format(score_ave, score_std))
# score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=True, counters=True)
# print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std))
# f1_ = f1_from_counters(tp, fp, fn, tn)
# print('F1 = {:.3f}'.format(f1_))
# score_ave, score_std = av.leave_one_out(Xtr, ytr, None)
# print('LOO[w/o groups]={:.3f} +-{:.5f}'.format(score_ave, score_std))

View File

@@ -1,14 +1,11 @@
import os
from os.path import join
import re
import collections
# ------------------------------------------------------------------------
# document loading routine
# ------------------------------------------------------------------------
def remove_pattern(doc, start_symbol, end_symbol, counter):
assert counter[start_symbol] == counter[end_symbol], 'wrong number of {}{} found'.format(start_symbol,end_symbol)
search = True
@@ -21,6 +18,7 @@ def remove_pattern(doc, start_symbol, end_symbol, counter):
search = False
return doc
# removes citations in the formats:
# *latino*   (Latin quotation)
# {volgare}  (vernacular quotation)
@@ -30,16 +28,30 @@ def remove_citations(doc):
doc = remove_pattern(doc, start_symbol='{', end_symbol='}', counter=counter)
return doc
def load_texts(path, positive_author='Dante', unknown_target=None, train_skip_prefix='EpistolaXIII_'):
def load_latin_corpus(path, positive_author='Dante', unknown_target=None, train_skip_prefix='Epistola'):
"""
Loads Corpus I or Corpus II for authorship verification (and validation) of Epistola XIII.
The corpus is assumed to contain files named according to <author>_<text_name>.txt.
:param path: the path containing the texts, each named as <author>_<text_name>.txt
:param positive_author: the author that defines the positive class for verification
:param unknown_target: if specified, the path to the unknown document whose paternity is to be checked (w.r.t.
the positive_author)
:param train_skip_prefix: specify a prefix for documents that should be skipped
:return: a tuple containing the positive documents, negative documents, paths to positive documents, paths to
negative documents, and the unknown document if that was specified (otherwise an empty list)
"""
# load the training data (all documents but Epistolas 1 and 2)
positive, negative = [], []
files_positive, files_negative = [], []
authors = []
authors = []
ndocs=0
for file in os.listdir(path):
if file.startswith(train_skip_prefix): continue
file_clean = file.replace('.txt','')
author, textname = file_clean.split('_')[0],file_clean.split('_')[1]
if f'{path}/{file}' == unknown_target: continue
file_name = file.replace('.txt','')
author, textname = file_name.split('_')
text = open(join(path,file), encoding= "utf8").read()
text = remove_citations(text)
@@ -50,40 +62,21 @@ def load_texts(path, positive_author='Dante', unknown_target=None, train_skip_pr
negative.append(text)
files_negative.append(file)
authors.append(author)
ndocs+=1
ndocs += 1
# load the test data (Epistolas 1 and 2)
# load the unknown document (if requested)
if unknown_target:
if isinstance(unknown_target, str):
unknown_target = [unknown_target]
unknowns = []
for unknown_text in unknown_target:
unknown = open(join(path, unknown_text), encoding="utf8").read()
unknown = remove_citations(unknown)
unknowns.append(unknown)
if len(unknowns) == 1: unknowns = unknowns[0]
return positive, negative, files_positive, files_negative, unknowns
unknown = open(unknown_target, encoding="utf8").read()
unknown = [remove_citations(unknown)]
else:
return positive, negative, files_positive, files_negative
def ___list_texts(path):
authors = {}
for file in os.listdir(path):
if file.startswith('EpistolaXIII_'): continue
file_clean = file.replace('.txt','')
author, textname = file_clean.split('_')[0],file_clean.split('_')[1]
if author not in authors:
authors[author] = []
authors[author].append(textname)
author_order = sorted(authors.keys())
for author in author_order:
print('{}:\t{}'.format(author,', '.join(authors[author])))
unknown = []
return positive, negative, files_positive, files_negative, unknown
def list_authors(path, skip_prefix, skip_authors=['Misc']):
authors = [file.split('_')[0] for file in os.listdir(path) if not file.startswith(skip_prefix)]
authors = [author for author in authors if author not in skip_authors]
return sorted(set(authors))
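For reference, a minimal sketch of driving the refactored loader above (not from the commit; the corpus directory is a placeholder and is assumed to contain files named <author>_<textname>.txt):
from data.dante_loader import load_latin_corpus, list_authors

corpus_dir = '../testi_1'  # placeholder path
print(list_authors(corpus_dir, skip_prefix='Epistola'))          # candidate authors found in the corpus
positive, negative, pos_files, neg_files, unknown = load_latin_corpus(
    corpus_dir, positive_author='Dante',
    unknown_target=f'{corpus_dir}/EpistolaXIII_1.txt')           # full path, as compared inside the loader
print(len(positive), len(negative), len(unknown))                # unknown is a one-element list here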

View File

@@ -39,64 +39,55 @@ latin_conjugations = ['o', 'eo', 'io', 'as', 'es', 'is', 'at', 'et', 'it', 'amus
'sis', 'sit', 'simus', 'sitis', 'sint', 'essem', 'esses', 'esset', 'essemus', 'essetis', 'essent',
'fui', 'fuisti', 'fuit', 'fuimus', 'fuistis', 'fuerunt', 'este', 'esto', 'estote', 'sunto']
spanish_conjugations = ['o','as','a','amos','áis','an','es','e','emos','éis','en','imos','ís','guir','ger','gir',
'ar', 'er', 'ir', 'é', 'aste', 'ó','asteis','aron','í','iste','','isteis','ieron',
'aba', 'abas', 'ábamos', 'aban', 'ía', 'ías', 'íamos', 'íais', 'ían', 'ás','á',
'án','estoy','estás','está','estamos','estáis','están']
def get_function_words(lang):
if lang=='latin':
if lang == 'latin':
return latin_function_words
elif lang in ['english','spanish']:
return stopwords.words(lang)
else:
raise ValueError('{} not in scope!'.format(lang))
def get_conjugations(lang):
if lang == 'latin':
return latin_conjugations
elif lang == 'spanish':
return spanish_conjugations
else:
raise ValueError('conjugations for languages other than Latin and Spanish are not yet supported')
raise ValueError('conjugations for languages other than Latin are not yet supported')
# ------------------------------------------------------------------------
# split policies
# ------------------------------------------------------------------------
# TODO: implement other split policies (e.g., overlapping ones, etc)
def split_by_endline(text):
return [t.strip() for t in text.split('\n') if t.strip()]
def split_by_sentences(text):
sentences = [t.strip() for t in nltk.tokenize.sent_tokenize(text) if t.strip()]
#sentences= [t.strip() for t in re.split(r"\.|\?|\!\;", text) if t.strip()]
for i,sentence in enumerate(sentences):
unmod_tokens = nltk.tokenize.word_tokenize(sentence)
mod_tokens = ([token for token in unmod_tokens if any(char.isalpha() for char in token)])
if len(mod_tokens)<8:
if i<len(sentences)-1:
if i < len(sentences)-1:
sentences[i+1] = sentences[i] + ' ' + sentences[i+1]
else:
sentences[i-1] = sentences[i-1] + ' ' + sentences[i]
sentences.pop(i)
return sentences
def windows(text_fragments, window_size):
new_fragments = []
nbatches = len(text_fragments) // window_size
if len(text_fragments) % window_size > 0:
nbatches+=1
# for i in range(len(text_fragments)-window_size+1):
nbatches += 1
for i in range(nbatches):
offset = i*window_size
new_fragments.append(' '.join(text_fragments[offset:offset+window_size]))
return new_fragments
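A quick trace of windows() as defined above: five fragments with window_size=2 yield three batches, the last one shorter.
frags = ['s1', 's2', 's3', 's4', 's5']
print(windows(frags, window_size=2))   # ['s1 s2', 's3 s4', 's5']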
def splitter(documents, authors=None, split_policy=split_by_sentences, window_size=1):
fragments = []
authors_fragments = []
@@ -117,7 +108,7 @@ def splitter(documents, authors=None, split_policy=split_by_sentences, window_si
def tokenize(text):
unmod_tokens = nltk.word_tokenize(text)
return ([token.lower() for token in unmod_tokens if any(char.isalpha() for char in token)])
return [token.lower() for token in unmod_tokens if any(char.isalpha() for char in token)]
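A small check of tokenize() above (assumes nltk and its 'punkt' tokenizer data are installed): tokens with no alphabetic character are dropped, the rest lowercased.
print(tokenize('Magnifico atque uictorioso domino, 1319!'))
# ['magnifico', 'atque', 'uictorioso', 'domino']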
# ------------------------------------------------------------------------
@@ -125,7 +116,7 @@ def tokenize(text):
# ------------------------------------------------------------------------
def _features_function_words_freq(documents, lang):
"""
Extract features as the frequency (x1000) of the function words used in the documents
Extract features as the frequency (L1x1000) of the function words used in the documents
:param documents: a list where each element is the text (string) of a document
:return: a np.array of shape (D,F) where D is len(documents) and F is len(function_words)
"""
@@ -145,6 +136,12 @@ def _features_function_words_freq(documents, lang):
def _features_conjugations_freq(documents, lang):
"""
Extract features as the frequency (L1x1000) of the conjugations used in the documents. The method is heuristic, and
actually searches for suffixes contained in the conjugation list.
:param documents: a list where each element is the text (string) of a document
:return: a np.array of shape (D,F) where D is len(documents) and F is len(conjugations)
"""
features = []
conjugations = get_conjugations(lang)
@@ -152,7 +149,9 @@ def _features_conjugations_freq(documents, lang):
mod_tokens = tokenize(text)
conjugation_tokens = []
for conjugation in conjugations:
conjugation_tokens.extend([conjugation for token in mod_tokens if token.endswith(conjugation) and len(token)>len(conjugation)])
conjugation_tokens.extend(
[conjugation for token in mod_tokens if token.endswith(conjugation) and len(token) > len(conjugation)]
)
freqs = nltk.FreqDist(conjugation_tokens)
nwords = len(mod_tokens)
conjugation_freq = [1000. * freqs[conjugation] / nwords for conjugation in conjugations]
@@ -165,7 +164,7 @@ def _features_conjugations_freq(documents, lang):
def _features_Mendenhall(documents, upto=23):
"""
Extract features as the frequency (x1000) of the words' lengths used in the documents,
Extract features as the frequency (L1x1000) of the words' lengths used in the documents,
following the idea behind Mendenhall's Characteristic Curve of Composition
:param documents: a list where each element is the text (string) of a document
:return: a np.array of shape (D,F) where D is len(documents) and F is len(range of lengths considered)
@@ -213,7 +212,7 @@ def _features_sentenceLengths(documents, downto=3, upto=70):
return np.array(features), f_names
def _features_tfidf(documents, tfidf_vectorizer=None, min_df = 1, ngrams=(1,1)):
def _features_tfidf(documents, tfidf_vectorizer=None, min_df=1, ngrams=(1, 1)):
"""
Extract features as tfidf matrix extracted from the documents
:param documents: a list where each element is the text (string) of a document
@@ -229,14 +228,22 @@ def _features_tfidf(documents, tfidf_vectorizer=None, min_df = 1, ngrams=(1,1)):
return features, tfidf_vectorizer
# We have implemented ngrams extration generically, following Sapkota et al. (ref [39] in the PAN 2015 overview), i.e.,
# containing punctuation marks. However, this does not apply to this study since punctuation marks are filtered-out in
# editions of Latin texts.
# More recently, it was shown that character n-grams corresponding to word affixes and including punctuation
# marks are the most significant features in cross-topic authorship attribution [57].
def _features_ngrams(documents, ns=[4, 5], ngrams_vectorizer=None, min_df = 10, preserve_punctuation=True):
def _features_ngrams(documents, ns=[4, 5], ngrams_vectorizer=None, min_df=10, preserve_punctuation=True):
"""
Extract char-ngrams
This implementation is generic, following Sapkota et al. (ref [39] in the PAN 2015 overview), i.e., containing
punctuation marks. However, this does not apply to Latin texts in which punctuation marks are filtered-out. More
recently, it was shown that character n-grams corresponding to word affixes and including punctuation marks are the
most significant features in cross-topic authorship attribution [57].
:param documents: a list where each element is the text (string) of a document
:param ns: the lengths (n) for which n-gram frequencies will be computed
:param ngrams_vectorizer: the tfidf_vectorizer to use if already fit; if None, a new one will be instantiated and fit
:param min_df: minimum number of occurrences needed for the ngram to be retained
:param preserve_punctuation: whether or not to preserve punctuation marks
:return: see _features_tfidf
"""
doc_ngrams = ngrams_extractor(documents, ns, preserve_punctuation)
return _features_tfidf(doc_ngrams, tfidf_vectorizer=ngrams_vectorizer, min_df = min_df)
return _features_tfidf(doc_ngrams, tfidf_vectorizer=ngrams_vectorizer, min_df=min_df)
def ngrams_extractor(documents, ns=[4, 5], preserve_punctuation=True):
@@ -257,18 +264,29 @@ def ngrams_extractor(documents, ns=[4, 5], preserve_punctuation=True):
def _feature_selection(X, y, tfidf_feat_selection_ratio):
"""
Filter-style feature selection using the chi-squared statistic as the term-selection function
:param X: a document by (sparse) features matrix
:param y: the supervised ndarray containing the class labels
:param tfidf_feat_selection_ratio: a proportion of features to be taken
:return: the reduced matrix and the feature selector fit
"""
nF = X.shape[1]
num_feats = int(tfidf_feat_selection_ratio * nF)
feature_selector = SelectKBest(chi2, k=num_feats)
X = feature_selector.fit_transform(X, y)
return X, feature_selector
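A self-contained sketch of the chi-squared selection above on toy counts (the data are made up; assumes the _feature_selection definition above is in scope):
import numpy as np
from scipy.sparse import csr_matrix

X_toy = csr_matrix(np.random.randint(0, 5, size=(10, 40)))   # 10 docs x 40 non-negative count features
y_toy = np.array([1, 0] * 5)                                  # toy binary labels
X_red, selector = _feature_selection(X_toy, y_toy, tfidf_feat_selection_ratio=0.1)
print(X_red.shape)                          # (10, 4): 10% of the 40 features retained
print(selector.get_support(indices=True))   # column indices of the surviving features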
def _tocsr(X):
""" Converts a dense matrix into a sparse one """
return X if issparse(X) else csr_matrix(X)
class FeatureExtractor:
"""
A feature extractor for authorship analysis applications implemented as a transformer
"""
def __init__(self,
function_words_freq=None,
conjugations_freq=None,
@@ -281,21 +299,27 @@ class FeatureExtractor:
n_charngrams=[4, 5],
preserve_punctuation=True,
split_documents=False,
split_policy = split_by_endline,
split_policy=split_by_endline,
normalize_features=True,
window_size = 5,
window_size=5,
verbose=True):
"""
Loads the documents contained in path applying a number of feature extraction policies. The directory is assumed to
contain files named according to <author>_<text_name>.txt plus two special files EpistolaXIII_1.txt and
EpistolaXIII_2.txt concerning the two documents whose authorship attribution is to be determined.
:param path: the path containing the texts, each named as <author>_<text_name>.txt
Applies stylistic feature extraction. Features include:
:param function_words_freq: add the frequency of function words as features
:param conjugations_freq: add the frequency of regular conjugations as features
:param features_Mendenhall: add the frequencies of the words' lengths as features
:param wordngrams: add the tfidf as features
:param split_documents: whether to split text into smaller documents or not (currenty, the policy is to split by '\n').
Currently, the fragments resulting from the split are added to the pool of documents (i.e., they do not replace the
full documents, which are anyway retained).
:param features_sentenceLengths: add the frequencies of the sentences' lengths as features
:param wordngrams: add the words tfidf as features
:param tfidf_feat_selection_ratio: if less than 1, indicates the ratio of most important features (according
to chi-squared test) to be selected
:param n_wordngrams: a tuple (min,max) indicating the range of lengths for word n-grams
:param charngrams: add the char n-grams tfidf as features
:param n_charngrams: a tuple (min,max) indicating the range of lengths for char n-grams
:param preserve_punctuation: whether or not to preserve punctuation marks (should be deactivated for medieval
Latin texts)
:param split_documents: whether to split text into smaller documents or not (currently, the policy is to split by '\n').
Currently, the fragments resulting from the split are added to the pool of documents (i.e., they do not replace
the full documents, which are anyway retained).
:param split_policy: a callable that implements the split to be applied (ignored if split_documents=False)
:param window_size: the size of the window in case of sliding windows policy
:param verbose: show information by stdout or not
@@ -321,18 +345,16 @@ class FeatureExtractor:
self.verbose = verbose
self.feature_names = None
def fit_transform(self, positives, negatives):
documents = positives + negatives
authors = [1]*len(positives) + [0]*len(negatives)
n_original_docs = len(documents)
groups = list(range(n_original_docs))
self.feature_names = []
if self.split_documents:
doc_fragments, authors_fragments, groups_fragments = splitter(documents, authors,
split_policy=self.split_policy,
window_size=self.window_size)
doc_fragments, authors_fragments, groups_fragments = splitter(
documents, authors, split_policy=self.split_policy, window_size=self.window_size
)
documents.extend(doc_fragments)
authors.extend(authors_fragments)
groups.extend(groups_fragments)
@@ -342,261 +364,135 @@ class FeatureExtractor:
y = np.array(authors)
groups = np.array(groups)
# initialize the document-by-feature vector
X = np.empty((len(documents), 0))
X = self._transform(documents, y, fit=True)
# dense feature extraction functions
if self.function_words_freq:
F, f_names = _features_function_words_freq(documents, self.function_words_freq)
X = self._addfeatures(X, F)
self.feature_names.extend(f_names)
self._print('adding function words features: {} features'.format(X.shape[1]))
assert X.shape[1] == len(self.feature_names), f'wrong number of feature names, expected {X.shape[1]} found {len(self.feature_names)}'
if self.conjugations_freq:
F, f_names = _features_conjugations_freq(documents, self.conjugations_freq)
X = self._addfeatures(X, F)
self.feature_names.extend(f_names)
self._print('adding conjugation features: {} features'.format(X.shape[1]))
assert X.shape[1] == len(self.feature_names), f'wrong number of feature names, expected {X.shape[1]} found {len(self.feature_names)}'
if self.features_Mendenhall:
F, f_names = _features_Mendenhall(documents)
X = self._addfeatures(X, F)
self.feature_names.extend(f_names)
self._print('adding Mendenhall words features: {} features'.format(X.shape[1]))
assert X.shape[1] == len(self.feature_names), f'wrong number of feature names, expected {X.shape[1]} found {len(self.feature_names)}'
if self.features_sentenceLengths:
F, f_names = _features_sentenceLengths(documents)
X = self._addfeatures(X, F)
self.feature_names.extend(f_names)
self._print('adding sentence lengths features: {} features'.format(X.shape[1]))
assert X.shape[1] == len(self.feature_names), f'wrong number of feature names, expected {X.shape[1]} found {len(self.feature_names)}'
# sparse feature extraction functions
if self.tfidf:
X_features, vectorizer = _features_tfidf(documents, ngrams=self.wordngrams)
self.tfidf_vectorizer = vectorizer
index2word = {i: w for w, i in vectorizer.vocabulary_.items()}
f_names = [f'tfidf::{index2word[i]}' for i in range(len(index2word))]
if self.tfidf_feat_selection_ratio < 1.:
if self.verbose: print('feature selection')
X_features, feat_sel = _feature_selection(X_features, y, self.tfidf_feat_selection_ratio)
self.feat_sel_tfidf = feat_sel
f_names = [f_names[i] for i in feat_sel.get_support(indices=True)]
X = self._addfeatures(_tocsr(X), X_features)
self.feature_names.extend(f_names)
self._print('adding tfidf words features: {} features'.format(X.shape[1]))
assert X.shape[1] == len(self.feature_names), f'wrong number of feature names, expected {X.shape[1]} found {len(self.feature_names)}'
if self.ngrams:
X_features, vectorizer = _features_ngrams(documents, self.ns,
preserve_punctuation=self.preserve_punctuation)
self.ngrams_vectorizer = vectorizer
index2word = {i: w for w, i in vectorizer.vocabulary_.items()}
f_names = [f'ngram::{index2word[i]}' for i in range(len(index2word))]
if self.tfidf_feat_selection_ratio < 1.:
if self.verbose: print('feature selection')
X_features, feat_sel = _feature_selection(X_features, y, self.tfidf_feat_selection_ratio)
self.feat_sel_ngrams = feat_sel
f_names = [f_names[i] for i in feat_sel.get_support(indices=True)]
X = self._addfeatures(_tocsr(X), X_features)
self.feature_names.extend(f_names)
self._print('adding ngrams character features: {} features'.format(X.shape[1]))
self.feature_names = np.asarray(self.feature_names)
assert X.shape[1] == len(self.feature_names), f'wrong number of feature names, expected {X.shape[1]} found {len(self.feature_names)}'
# print summary
if self.verbose:
print(
'load_documents: function_words_freq={} features_Mendenhall={} tfidf={}, split_documents={}, split_policy={}'
.format(self.function_words_freq, self.features_Mendenhall, self.tfidf, self.split_documents,
self.split_policy.__name__))
print('number of training (full) documents: {}'.format(n_original_docs))
print('X shape (#documents,#features): {}'.format(X.shape))
print('y prevalence: {}/{} {:.2f}%'.format(y.sum(),len(y),y.mean() * 100))
f'load_documents: function_words_freq={self.function_words_freq} '
f'features_Mendenhall={self.features_Mendenhall} tfidf={self.tfidf} '
f'split_documents={self.split_documents}, split_policy={self.split_policy.__name__}'
)
print(f'number of training (full) documents: {n_original_docs}')
print(f'y prevalence: {y.sum()}/{len(y)} {y.mean() * 100:.2f}%')
print()
return X, y, groups
def transform(self, test, return_fragments=False, window_size=-1, avoid_splitting=False):
test = [test]
if window_size==-1:
if isinstance(test, str):
test = [test]
if window_size == -1:
window_size = self.window_size
if self.split_documents and not avoid_splitting:
tests, _ = splitter(test, split_policy=self.split_policy, window_size=window_size)
test.extend(tests)
# initialize the document-by-feature vector
TEST = np.empty((len(test), 0))
# dense feature extraction functions
if self.function_words_freq:
F,_=_features_function_words_freq(test, self.function_words_freq)
TEST = self._addfeatures(TEST, F)
self._print('adding function words features: {} features'.format(TEST.shape[1]))
if self.conjugations_freq:
F,_=_features_conjugations_freq(test, self.conjugations_freq)
TEST = self._addfeatures(TEST, F)
self._print('adding conjugation features: {} features'.format(TEST.shape[1]))
if self.features_Mendenhall:
F,_ = _features_Mendenhall(test)
TEST = self._addfeatures(TEST, F)
self._print('adding Mendenhall words features: {} features'.format(TEST.shape[1]))
if self.features_sentenceLengths:
F, _ = _features_sentenceLengths(test)
TEST = self._addfeatures(TEST, F)
self._print('adding sentence lengths features: {} features'.format(TEST.shape[1]))
# sparse feature extraction functions
if self.tfidf:
ep1_features, _ = _features_tfidf(test, self.tfidf_vectorizer)
if self.tfidf_feat_selection_ratio < 1.:
if self.verbose: print('feature selection')
ep1_features = self.feat_sel_tfidf.transform(ep1_features)
TEST = self._addfeatures(_tocsr(TEST), ep1_features)
self._print('adding tfidf words features: {} features'.format(TEST.shape[1]))
if self.ngrams:
ep1_features, _ = _features_ngrams(test, self.ns, ngrams_vectorizer=self.ngrams_vectorizer,
preserve_punctuation=self.preserve_punctuation)
if self.tfidf_feat_selection_ratio < 1.:
if self.verbose: print('feature selection')
ep1_features = self.feat_sel_ngrams.transform(ep1_features)
TEST = self._addfeatures(_tocsr(TEST), ep1_features)
self._print('adding ngrams words features: {} features'.format(TEST.shape[1]))
# print summary
if self.verbose:
print(
'load_documents: function_words_freq={} features_Mendenhall={} tfidf={}, split_documents={}, split_policy={}'
.format(self.function_words_freq, self.features_Mendenhall, self.tfidf, self.split_documents,
self.split_policy.__name__))
print('test shape:', TEST.shape)
print()
old_verbose = self.verbose
self.verbose = False
TEST = self._transform(test, fit=False)
self.verbose = old_verbose
if return_fragments:
return TEST, test[1:]
else:
return TEST
def _addfeatures(self, X, F):
def _addfeatures(self, X, F, feat_names=None):
if self.normalize_features:
normalize(F, axis=1, copy=False)
self._register_feature_names(feat_names)
if issparse(F):
return hstack((X, F)) # sparse
else:
return np.hstack((X, F)) # dense
def _print(self, msg):
if self.verbose:
print(msg)
def _register_feature_names(self, feat_names):
""" keeps track of the feature names (for debugging and analysis) """
if feat_names is None:
return
if self.feature_names is None:
self.feature_names = []
self.feature_names.extend(feat_names)
def _transform(self, documents, y=None, fit=False):
# initialize the document-by-feature vector
X = np.empty((len(documents), 0))
# dense feature extraction functions
if self.function_words_freq:
F, f_names = _features_function_words_freq(documents, self.function_words_freq)
X = self._addfeatures(X, F, f_names if fit else None)
self._print(f'adding function words features: {X.shape[1]} features')
if __name__=='__main__':
from collections import Counter
if self.conjugations_freq:
F, f_names = _features_conjugations_freq(documents, self.conjugations_freq)
X = self._addfeatures(X, F, f_names if fit else None)
self._print(f'adding conjugation features: {X.shape[1]} features')
# text = 'Magnifico atque uictorioso domino, domino Cani Grandi de la Scala, sacratissimi cesarei principatus in urbe Uerona et ciuitate Uicentie uicario generali, deuotissimus suus Dantes Alagherii, Florentinus natione non moribus, uitam orat per tempora diuturna felicem, et gloriosi nominis perpetuum incrementum.'
text = 'Magnifico atque uictorioso domino, domino Cani Grandi de la Scala, sacratissimi cesarei principatus in urbe Uerona et ciuitate Uicentie uicario generali, deuotissimus suus Dantes Alagherii, Florentinus natione non moribus, uitam orat per tempora diuturna felicem, et gloriosi nominis perpetuum incrementum. Inclita uestre magnificentie laus, quam fama uigil uolitando disseminat, sic distrahit in diuersa diuersos, ut hos in spem sue prosperitatis attollat, hos exterminii deiciat in terrorem. Huius quidem preconium, facta modernorum exsuperans, tanquam ueri existentia latius, arbitrabar aliquando superfluum. Uerum, ne diuturna me nimis incertitudo suspenderet, uelut Austri regina Ierusalem petiit, uelut Pallas petiit Elicona, Ueronam petii fidis oculis discussurus audita, ibique magnalia uestra uidi, uidi beneficia simul et tetigi; et quemadmodum prius dictorum ex parte suspicabar excessum, sic posterius ipsa facta excessiua cognoui. Quo factum est ut ex auditu solo cum quadam animi subiectione beniuolus prius exstiterim; sed ex uisu postmodum deuotissimus et amicus. Nec reor amici nomen assumens, ut nonnulli forsitan obiectarent, reatum presumptionis incurrere, cum non minus dispares connectantur quam pares amicitie sacramento. Nam si delectabiles et utiles amicitias inspicere libeat, illis persepius inspicienti patebit, preheminentes inferioribus coniugari personas. Et si ad ueram ac per se amicitiam torqueatur intuitus, nonne illustrium summorumque principum plerunque uiros fortuna obscuros, honestate preclaros, amicos fuisse constabit? Quidni, cum etiam Dei et hominis amicitia nequaquam impediatur excessu? Quod si cuiquam, quod asseritur, nunc uideretur indignum, Spiritum Sanctum audiat, amicitie sue participes quosdam homines profitentem. Nam in Sapientia de sapientia legitur, quoniam *infinitus thesaurus est hominibus, quo qui usi sunt, participes facti sunt amicitie Dei*. Sed habet imperitia uulgi sine discretione iudicium; et quemadmodum solem pedalis magnitudinis arbitratur, sic et circa mores uana credulitate decipitur. Nos autem, quibus optimum quod est in nobis noscere datum est, gregum uestigia sectari non decet, quin ymo suis erroribus obuiare tenemur. Nam intellectu ac ratione degentes, diuina quadam libertate dotati, nullis consuetudinibus astringuntur; nec mirum, cum non ipsi legibus, sed ipsis leges potius dirigantur. Liquet igitur, quod superius dixi, me scilicet esse deuotissimum et amicum, nullatenus esse presumptum. Preferens ergo amicitiam uestram quasi thesaurum carissimum, prouidentia diligenti et accurata solicitudine illam seruare desidero. Itaque, cum in dogmatibus moralis negotii amicitiam adequari et saluari analogo doceatur, ad retribuendum pro collatis beneficiis plus quam semel analogiam sequi michi uotiuum est; et propter hoc munuscula mea sepe multum conspexi et ab inuicem segregaui, nec non segregata percensui, dignius gratiusque uobis inquirens. Neque ipsi preheminentie uestre congruum magis comperi magis quam Comedie sublimem canticam, que decoratur titulo Paradisi; et illam sub presenti epistola, tanquam sub epigrammate proprio dedicatam, uobis ascribo, uobis offero, uobis denique recommendo. Illud quoque preterire silentio simpliciter inardescens non sinit affectus, quod in hac donatione plus dono quam domino et honoris et fame conferri potest uideri.Quidni cum eius titulum iam presagiam de gloria uestri nominis ampliandum? 
Satis actenus uidebar expressisse quod de proposito fuit; sed zelus gratie uestre, quam sitio quasi uitam paruipendens, a primordio metam prefixam urget ulterius. Itaque, formula consumata epistole, ad introductionem oblati operis aliquid sub lectoris officio compendiose aggrediar.'
print(text)
if self.features_Mendenhall:
F, f_names = _features_Mendenhall(documents)
X = self._addfeatures(X, F, f_names if fit else None)
self._print(f'adding Mendenhall words features: {X.shape[1]} features')
# char n-grams
w=3
ngrams = [text[i:i+w].replace(' ', '_') for i in range(len(text)-w + 1)]
print('ngrams')
print(', '.join(ngrams))
print(Counter(ngrams).most_common())
if self.features_sentenceLengths:
F, f_names = _features_sentenceLengths(documents)
X = self._addfeatures(X, F, f_names if fit else None)
self._print(f'adding sentence lengths features: {X.shape[1]} features')
# word n-grams
w = 2
words = text.split()
wngrams = ['_'.join(words[i:i + w]).replace(',','') for i in range(len(words) - w + 1)]
print('\nwngrams')
print(', '.join(wngrams))
print(Counter(wngrams).most_common())
# sparse feature extraction functions
if self.tfidf:
if fit:
X_features, self.tfidf_vectorizer = _features_tfidf(documents, ngrams=self.wordngrams)
index2word = {i: w for w, i in self.tfidf_vectorizer.vocabulary_.items()}
f_names = [f'tfidf::{index2word[i]}' for i in range(len(index2word))]
else:
X_features, _ = _features_tfidf(documents, self.tfidf_vectorizer)
f_names = None
fn_words = [w if w not in latin_function_words else f"{w}(*)" for w in words]
print('\nfunction words')
print(' '.join(fn_words))
if self.tfidf_feat_selection_ratio < 1.:
if self.verbose: print('feature selection')
if fit:
X_features, self.feat_sel_tfidf = _feature_selection(X_features, y, self.tfidf_feat_selection_ratio)
f_names = [f_names[i] for i in self.feat_sel_tfidf.get_support(indices=True)]
else:
X_features = self.feat_sel_tfidf.transform(X_features)
X = self._addfeatures(_tocsr(X), X_features, f_names)
self._print(f'adding tfidf words features: {X.shape[1]} features')
verbal_words = []
for w in words:
lcs = sorted(latin_conjugations, key=lambda x: -len(x))
toadd = w
for lc in lcs:
if len(w) <= len(lc): continue
if w.endswith(lc):
toadd = w[:-len(lc)] + f'[{lc}]'
break
verbal_words.append(toadd)
print('\nverbal endings')
print(' '.join(verbal_words))
if self.ngrams:
if fit:
X_features, self.ngrams_vectorizer = _features_ngrams(
documents, self.ns, preserve_punctuation=self.preserve_punctuation
)
index2word = {i: w for w, i in self.ngrams_vectorizer.vocabulary_.items()}
f_names = [f'ngram::{index2word[i]}' for i in range(len(index2word))]
else:
X_features, _ = _features_ngrams(
documents, self.ns, ngrams_vectorizer=self.ngrams_vectorizer,
preserve_punctuation=self.preserve_punctuation
)
f_names = None
print('\nword lengths')
counter = Counter([len(w.replace(',','')) for w in words])
total = len(words)
x,y=[],[]
cum_req = 0
print(f'words length\tcount\tfrequency\tcumulative')
for i in range(1,24):
x.append(i)
c = counter[i]
freq = c / total
cum_req += freq
y.append(cum_req)
if c > 0:
print(f'{i}\t{c}\t{freq:.2f}\t{cum_req:.2f}')
if self.tfidf_feat_selection_ratio < 1.:
if self.verbose: print('feature selection')
if fit:
X_features, self.feat_sel_ngrams = _feature_selection(X_features, y, self.tfidf_feat_selection_ratio)
f_names = [f_names[i] for i in self.feat_sel_ngrams.get_support(indices=True)]
else:
X_features = self.feat_sel_ngrams.transform(X_features)
# import matplotlib.pyplot as plt
# import seaborn as sns
# plt.plot(x, y, 'o-')
# plt.xlabel('word length')
# plt.ylabel('cumulative frequency')
# plt.title('')
# plt.grid()
# plt.show()
X = self._addfeatures(_tocsr(X), X_features, f_names)
self._print(f'adding ngrams character features: {X.shape[1]} features')
if fit:
self.feature_names = np.asarray(self.feature_names)
print('\nsentence length')
sentences = split_by_sentences(text)
counter = Counter([len(s.split()) for s in sentences])
total = len(sentences)
cum_req = 0
print(f'words length\tcount\tfrequency\tcumulative')
dots=True
rows=0
for i in range(1,70):
x.append(i)
c = counter[i]
freq = c / total
cum_req += freq
if c > 0:
print(f'{i}\t{c}\t{freq:.3f}\t{cum_req:.2f}')
dots=True
rows+=1
else:
if dots:
print(f'...\t...\t...\t...')
dots=False
print(counter)
print('rows',rows)
self._print(f'X shape (#documents,#features): {X.shape}')
return X
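For reference, a minimal end-to-end sketch of the refactored fit_transform/transform API, mirroring how the scripts in this commit use it (toy strings stand in for real documents; with inputs this small the cross-validated grid search inside the verificator would not actually have enough data, so treat this purely as an API sketch):
from sklearn.linear_model import LogisticRegression
from data.features import FeatureExtractor, split_by_sentences
from model import AuthorshipVerificator

positive = ['placeholder text by the candidate author ...']
negative = ['placeholder text by another author ...', 'more placeholder text ...']

fe = FeatureExtractor(function_words_freq='latin', conjugations_freq='latin',
                      features_Mendenhall=True, features_sentenceLengths=True,
                      tfidf_feat_selection_ratio=0.1,
                      wordngrams=True, n_wordngrams=(1, 2),
                      charngrams=True, n_charngrams=(3, 4, 5),
                      preserve_punctuation=False,
                      split_documents=True, split_policy=split_by_sentences, window_size=3,
                      normalize_features=True)
Xtr, ytr, groups = fe.fit_transform(positive, negative)        # fragments are added to the document pool
av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression).fit(Xtr, ytr, groups)
prob, _ = av.predict_proba(fe.transform('unknown placeholder text ...', avoid_splitting=True), 'unknown')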

View File

@@ -1,51 +0,0 @@
import itertools
import os
from os.path import join, isdir
PATH_PAN2015 = '../pan2015'
PAN2015_TRAIN = 'pan15-authorship-verification-training-dataset-2015-04-19'
PAN2015_TEST = 'pan15-authorship-verification-test-dataset2-2015-04-19'
class Pan2015:
def __init__(self, problem, solution):
self.problem = problem
self.solution = solution
def fetch_PAN2015(corpus, lang, base_path = PATH_PAN2015):
assert corpus in ['train','test'],'unexpected corpus request'
corpus_path = join(base_path, PAN2015_TRAIN if corpus=='train' else PAN2015_TEST)
print(corpus_path)
request = {}
truth = {}
for dir in os.listdir(corpus_path):
dir_path = join(corpus_path,dir)
if isdir(dir_path) and lang in dir:
truth = [x.split() for x in open(join(dir_path,'truth.txt'), 'rt').readlines()]
truth = {problem:1 if decision == 'Y' else 0 for problem,decision in truth}
for problem_name in os.listdir(dir_path):
problem_dir = join(dir_path,problem_name)
if isdir(problem_dir):
request[problem_name] = {}
request[problem_name]['known'] = []
for doc_name in os.listdir(problem_dir):
doc_path = join(problem_dir,doc_name)
if 'unknown.txt' == doc_name:
request[problem_name]['unknown'] = open(doc_path,'rt').read()
else:
request[problem_name]['known'].append(open(doc_path, 'rt').read())
return Pan2015(request, truth)
def TaskGenerator(request_dict):
pan_problems = request_dict.problem
problems = sorted(pan_problems.keys())
for i,problem_i in enumerate(problems):
positives = pan_problems[problem_i]['known']
negatives = list(itertools.chain.from_iterable([pan_problems[problem_j]['known'] for j,problem_j in enumerate(problems) if i!=j]))
test = pan_problems[problem_i]['unknown']
yield problem_i,positives,negatives,test,request_dict.solution[problem_i]
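A sketch of how the loader above might be driven (assumes the PAN 2015 dataset is unpacked under PATH_PAN2015 and that 'english' matches one of the per-language directory names, which are not shown in this view):
pan = fetch_PAN2015('train', lang='english')
for problem, positives, negatives, unknown, truth in TaskGenerator(pan):
    print(problem, len(positives), len(negatives), truth)      # truth is 1 for 'Y', 0 for 'N'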

View File

@ -1,44 +1,15 @@
from util import disable_sklearn_warnings
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, LeaveOneOut, LeaveOneGroupOut, cross_val_score, GroupKFold, KFold, \
StratifiedKFold
from sklearn.model_selection import GridSearchCV, LeaveOneOut, LeaveOneGroupOut, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import *
from data.features import *
class RandomVerificator:
def __init__(self): pass
def fit(self,positives,negatives):
pass
def predict(self,test):
return np.random.rand()
def get_counters(true_labels, predicted_labels):
assert len(true_labels) == len(predicted_labels), "Format not consistent between true and predicted labels."
nd = len(true_labels)
tp = np.sum(predicted_labels[true_labels == 1])
fp = np.sum(predicted_labels[true_labels == 0])
fn = np.sum(true_labels[predicted_labels == 0])
tn = nd - (tp+fp+fn)
return tp,fp,fn,tn
def f1_from_counters(tp,fp,fn,tn):
num = 2.0 * tp
den = 2.0 * tp + fp + fn
if den > 0: return num / den
# we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative
return 1.0
def f1(true_labels, predicted_labels):
tp, fp, fn, tn = get_counters(true_labels,predicted_labels)
return f1_from_counters(tp, fp, fn, tn )
from util.evaluation import f1, get_counters
class AuthorshipVerificator:
def __init__(self, nfolds=10,
params = {'C': np.logspace(-4,+4,9), 'class_weight':['balanced',None]},
params={'C': np.logspace(-4, +4, 9), 'class_weight': ['balanced', None]},
estimator=SVC,
author_name=None):
self.nfolds = nfolds
@ -68,11 +39,8 @@ class AuthorshipVerificator:
self.estimator.fit(X, y)
if isinstance(self.estimator, GridSearchCV):
print('Best params: {}'.format(self.estimator.best_params_))
print('computing the cross-val score')
f1scores = self.estimator.best_score_
f1_mean, f1_std = f1scores.mean(), f1scores.std()
print('F1-measure={:.3f} (+-{:.3f} cv={})\n'.format(f1_mean, f1_std, f1scores))
f1_mean = self.estimator.best_score_.mean()
print(f'Best params: {self.estimator.best_params_} (cross-validation F1={f1_mean:.3f})')
self.estimator = self.estimator.best_estimator_
return self
@ -81,11 +49,11 @@ class AuthorshipVerificator:
if groups is None:
print('Computing LOO without groups')
folds = list(LeaveOneOut().split(X,y))
folds = list(LeaveOneOut().split(X, y))
else:
print('Computing LOO with groups')
logo = LeaveOneGroupOut()
folds=list(logo.split(X,y,groups))
folds = list(logo.split(X, y, groups))
if test_lowest_index_only:
print('ignoring fragments')
folds = [(train, np.min(test, keepdims=True)) for train, test in folds]
@ -116,7 +84,7 @@ class AuthorshipVerificator:
return full_doc_prediction, None
def predict_proba(self, test, epistola_name=''):
assert self.probability, 'svm is not calibrated'
assert hasattr(self.estimator, 'predict_proba'), 'the classifier is not calibrated'
pred = self.estimator.predict_proba(test)
full_doc_prediction = pred[0,1]
print(f'{epistola_name} is from {self.author_name} with Probability {full_doc_prediction:.3f}')
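A minimal sketch of the calibration assumption behind predict_proba (this is standard scikit-learn behaviour, not part of the diff): an SVC yields meaningful predict_proba only when built with probability=True, which fits Platt scaling during training, whereas LogisticRegression exposes probabilities natively.

# illustrative only
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

svm_calibrated = SVC(kernel='linear', probability=True)  # Platt-scaled probabilities after fit
logreg = LogisticRegression()                            # predict_proba available natively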

View File

@ -1,78 +0,0 @@
from joblib import Parallel
from joblib import delayed
from sklearn.linear_model import LogisticRegression
from util import disable_sklearn_warnings
from sklearn.svm import LinearSVC, SVC
from data.features import FeatureExtractor
from data.pan2015 import fetch_PAN2015, TaskGenerator
from model import AuthorshipVerificator
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score
def evaluation(y_pred, y_prob, y_true):
y_pred_array = np.array(y_pred)
y_prob_array = np.array(y_prob)
y_true_array = np.array(y_true)
acc = (y_pred_array == y_true_array).mean()
f1 = f1_score(y_true_array, y_pred_array)
auc = roc_auc_score(y_true_array, y_prob_array)
pan_eval = acc * auc
print('Accuracy = {:.3f}'.format(acc))
print('F1 = {:.3f}'.format(f1))
print('AUC = {:.3f}'.format(auc))
print('Acc*AUC = {:.3f}'.format(pan_eval))
print('true:', y_true)
print('pred:', y_pred)
return pan_eval
def doall(problem,pos,neg,test,truth):
print('[Start]{}'.format(problem))
feature_extractor = FeatureExtractor(function_words_freq=lang,
conjugations_freq=lang,
features_Mendenhall=True,
wordngrams=False, tfidf_feat_selection_ratio=0.1,
charngrams=True, n_charngrams=[3, 4, 5],
split_documents=False,
normalize_features=True,
verbose=True)
# method = AuthorshipVerificator(nfolds=3, estimator=LogisticRegression)
method = AuthorshipVerificator(nfolds=3, estimator=LinearSVC)
X, y = feature_extractor.fit_transform(pos, neg)
test = feature_extractor.transform(test)
method.fit(X, y)
prediction = method.predict(test)
if method.probability:
probability = method.predict_proba(test)
else:
probability = prediction
print('[End]{}'.format(problem))
return problem, probability, prediction, truth
if __name__ == '__main__':
split = 'train'
lang = 'spanish'
request = fetch_PAN2015(split, lang=lang)
with open('results_ngrams.csv', 'wt') as fo:
outcomes = Parallel(n_jobs=-1)(delayed(doall)(problem,pos,neg,test,truth) for problem,pos,neg,test,truth in TaskGenerator(request))
y_pred, y_prob, y_true = [], [], []
for problem, probability, prediction, truth in outcomes:
fo.write('{} {}\n'.format(problem, probability))
y_pred.append(prediction)
y_prob.append(probability)
y_true.append(truth)
acc_auc = evaluation(y_pred, y_prob, y_true)
print('ACC * AUC = {:.3f}'.format(acc_auc))
print('done')
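A quick, self-contained check of the composite score reported by evaluation() above, which multiplies the accuracy of the hard decisions by the AUC of the scores (all values below are made up):

# toy example, not part of the experiment
import numpy as np
from sklearn.metrics import roc_auc_score

y_true = np.array([1, 0, 1, 1, 0])
y_prob = np.array([0.9, 0.4, 0.8, 0.3, 0.2])
y_pred = (y_prob > 0.5).astype(int)
acc = (y_pred == y_true).mean()        # 0.8
auc = roc_auc_score(y_true, y_prob)    # ~0.833
print(acc * auc)                       # ~0.667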

View File

@ -1,33 +0,0 @@
from matplotlib.cm import get_cmap
def color_tag(index, text, probability, cmap):
probability *= 0.6
# probability = (probability-0.5)*0.75+0.5
r,g,b,_ = cmap(probability)
# reliable = abs(probability-0.5) > 0.25*0.75
# text = '<font color="white">{}</font>'.format(text) if reliable else text
return f'<b>&nbsp;P{index}:</b> <a style="background-color:rgb({r*255:.0f},{g*255:.0f},{b*255:.0f});">{text} </a>'
def color(path, texts, probabilities, title, paragraph_offset=1):
# cmap = get_cmap('RdYlGn')
# cmap = get_cmap('Greens')
cmap = get_cmap('Greys')
with open(path, 'wt') as fo:
fo.write("""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>{}</title>
</head>
<body>
<h1>{}</h1>
""".format(title,title))
for i,(line,probability) in enumerate(zip(texts,probabilities)):
fo.write(color_tag(paragraph_offset + i, line,probability,cmap))
fo.write("""
</body>
</html>
""")

View File

@ -1,35 +0,0 @@
import numpy as np
import matplotlib.pyplot as plt
# Have colormaps separated into categories:
# http://matplotlib.org/examples/color/colormaps_reference.html
cmaps = [('Diverging', ['RdYlGn','RdYlGn']),]
nrows = max(len(cmap_list) for cmap_category, cmap_list in cmaps)
gradient = np.linspace(0.25, 0.75, 256)
gradient = np.vstack((gradient, gradient))
def plot_color_gradients(cmap_category, cmap_list, nrows):
fig, axes = plt.subplots(nrows=nrows)
fig.subplots_adjust(top=0.95, bottom=0.01, left=0.2, right=0.99)
axes[0].set_title(cmap_category + ' colormaps', fontsize=14)
for ax, name in zip(axes, cmap_list):
ax.imshow(gradient, aspect='auto', cmap=plt.get_cmap(name))
pos = list(ax.get_position().bounds)
x_text = pos[0] - 0.01
y_text = pos[1] + pos[3]/2.
fig.text(x_text, y_text, name, va='center', ha='right', fontsize=10)
# Turn off *all* ticks & spines, not just the ones with colormaps.
for ax in axes:
ax.set_axis_off()
for cmap_category, cmap_list in cmaps:
plot_color_gradients(cmap_category, cmap_list, nrows)
plt.show()

View File

@ -1,3 +0,0 @@
def warn(*args, **kwargs): pass
import warnings
warnings.warn = warn

src/util/evaluation.py (new file, 24 lines added)
View File

@ -0,0 +1,24 @@
import numpy as np


def get_counters(true_labels, predicted_labels):
    assert len(true_labels) == len(predicted_labels), "Format not consistent between true and predicted labels."
    nd = len(true_labels)
    tp = np.sum(predicted_labels[true_labels == 1])
    fp = np.sum(predicted_labels[true_labels == 0])
    fn = np.sum(true_labels[predicted_labels == 0])
    tn = nd - (tp+fp+fn)
    return tp, fp, fn, tn


def f1_from_counters(tp, fp, fn, tn):
    num = 2.0 * tp
    den = 2.0 * tp + fp + fn
    if den > 0: return num / den
    # f1 is undefined when den==0; in that case we define it as 1, since the classifier has correctly classified all instances as negative
    return 1.0


def f1(true_labels, predicted_labels):
    tp, fp, fn, tn = get_counters(true_labels, predicted_labels)
    return f1_from_counters(tp, fp, fn, tn)
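A small worked example of the counters and the resulting F1 (illustrative only, not part of the new module):

# true:      [1, 1, 0, 0, 1]
# predicted: [1, 0, 0, 1, 1]
# tp = 2 (positions 0, 4), fp = 1 (position 3), fn = 1 (position 1), tn = 1 (position 2)
import numpy as np
tp, fp, fn, tn = get_counters(np.array([1, 1, 0, 0, 1]), np.array([1, 0, 0, 1, 1]))
print(tp, fp, fn, tn)                    # 2 1 1 1
print(f1_from_counters(tp, fp, fn, tn))  # 2*2 / (2*2 + 1 + 1) = 0.667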

View File

@ -1,59 +0,0 @@
from sklearn.linear_model import LogisticRegression
from data.dante_loader import load_texts
from data.features import *
from model import AuthorshipVerificator, f1_from_counters
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
for epistola in [2]:
author_attribution = []
print(f'Epistola {epistola}')
print('='*80)
path = f'../testi_{epistola}'
if epistola==2: path+='_tutti'
author = 'Dante'
print('=' * 80)
print('Corpus of Epistola {}'.format(epistola))
print('=' * 80)
positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=f'EpistolaXIII_{epistola}.txt')
n_full_docs = len(positive) + len(negative)
feature_extractor = FeatureExtractor(function_words_freq='latin',
conjugations_freq='latin',
features_Mendenhall=True,
features_sentenceLengths=True,
tfidf_feat_selection_ratio=0.1,
wordngrams=True, n_wordngrams=(1, 2),
charngrams=True, n_charngrams=(3, 4, 5),
preserve_punctuation=False,
split_documents=True, split_policy=split_by_sentences, window_size=3,
normalize_features=True)
Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
print('Fitting the Verificator')
av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name=author)
av.fit(Xtr, ytr, groups)
feat_rank = np.argsort(av.estimator.coef_[0])
coef_ordered = av.estimator.coef_[0][feat_rank]
feat_name_ordered = feature_extractor.feature_names[feat_rank]
print('Most Dantesque features:')
for i in range(100):
print(f'{i}: {feat_name_ordered[::-1][i]} {coef_ordered[::-1][i]:.3f}')
print('\nMost non-Dantesque features:')
for i in range(100):
print(f'{i}: {feat_name_ordered[i]} {coef_ordered[i]:.3f}')
print('done')
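How to read the ranking printed above (assuming, as elsewhere in these scripts, that class 1 is the positive author, i.e. Dante): a positive logistic-regression coefficient raises the log-odds of the positive class as the corresponding normalized feature grows, a negative one lowers it.

# feat_rank sorts coefficients in ascending order, so:
#   feat_name_ordered[::-1][:k]  -> the k features pushing hardest towards Dante
#   feat_name_ordered[:k]        -> the k features pushing hardest away from Dante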