cleaning
This commit is contained in:
parent
b4796f4882
commit
b1376026c4
|
|
@ -1,159 +0,0 @@
|
|||
from mpl_toolkits.axes_grid1 import make_axes_locatable
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from data.dante_loader import load_texts
|
||||
from data.features import *
|
||||
from model import AuthorshipVerificator, f1_from_counters
|
||||
import numpy as np
|
||||
import matplotlib
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
def plot_attribution(path, authors, attributions, paragraph_offset=1, figsize=(5,5), label_offset=0.3):
|
||||
|
||||
attributions = attributions.T
|
||||
print(attributions.shape)
|
||||
# attributions=attributions>0.5
|
||||
paragraphs = ["Full"] + [f'{paragraph_offset+i}' for i in range(attributions.shape[0]-1)]
|
||||
|
||||
fig, ax = plt.subplots(figsize=figsize)
|
||||
|
||||
# im = ax.imshow(attributions, vmin=0, vmax=1, cmap='Greens')
|
||||
im = ax.imshow(attributions, vmin=0, vmax=1, cmap='Greys')
|
||||
|
||||
# Create colorbar
|
||||
# cbar = fig.colorbar(im, ax=ax, orientation="horizontal", fraction=0.1, pad=0.04)
|
||||
# ax.figure.colorbar(im, ax=ax, orientation="horizontal", fraction=0.1)
|
||||
# ax.figure.colorbar(im, ax=ax, orientation="horizontal", pad=0.05)
|
||||
|
||||
# cbar.ax.set_ylabel(cbarlabel, rotation=-90, va="bottom")
|
||||
|
||||
# We want to show all ticks...
|
||||
# ax.set_xticks(np.arange(len(authors)))
|
||||
ax.set_xticks(np.arange(len(authors) + 0) + label_offset)
|
||||
ax.set_yticks(np.arange(len(paragraphs)))
|
||||
# ... and label them with the respective list entries
|
||||
ax.set_xticklabels(authors)
|
||||
ax.set_yticklabels(paragraphs)
|
||||
|
||||
ax.tick_params(top=False, bottom=False,
|
||||
labeltop=True, labelbottom=False)
|
||||
|
||||
# Rotate the tick labels and set their alignment.
|
||||
plt.setp(ax.get_xticklabels(), rotation=90, ha="left", rotation_mode="anchor")
|
||||
|
||||
for edge, spine in ax.spines.items():
|
||||
spine.set_visible(False)
|
||||
|
||||
ax.set_xticks(np.arange(len(authors)+1) - .5, minor=True)
|
||||
ax.set_yticks(np.arange(len(paragraphs)+1) - .5, minor=True)
|
||||
|
||||
ax.grid(which="minor", color="k", linestyle='-', linewidth=1)
|
||||
ax.tick_params(which="minor", bottom=False, left=False)
|
||||
|
||||
# Loop over data dimensions and create text annotations.
|
||||
# for i in range(len(authors)):
|
||||
# for j in range(len(paragraphs)):
|
||||
# text = ax.text(j, i, f'{attributions[i, j]:.2f}', ha="center", va="center", color="w")
|
||||
|
||||
# ax.set_title("Attribution matrix")
|
||||
fig.tight_layout()
|
||||
# plt.show()
|
||||
plt.savefig(path)
|
||||
|
||||
# import sys
|
||||
# for epistola in [1]:
|
||||
# if epistola == 1:
|
||||
# authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
|
||||
# paragraph_offset = 1
|
||||
# figsize=(3,9)
|
||||
# label_offset=0.2
|
||||
#
|
||||
# else:
|
||||
# authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna',
|
||||
# 'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
|
||||
# 'GrazioloBambaglioli', 'GuidoDaPisa',
|
||||
# 'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
|
||||
# 'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet',
|
||||
# 'PietroAlighieri', 'RaimundusLullus',
|
||||
# 'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
|
||||
# paragraph_offset = 14
|
||||
# figsize = (6,20)
|
||||
# label_offset=0.3
|
||||
#
|
||||
# attributions = np.load(f'attribution_ep{epistola}.npy')
|
||||
# plot_attribution(f'plot{epistola}.png', authors, attributions, paragraph_offset=paragraph_offset, figsize=figsize, label_offset=label_offset)
|
||||
# sys.exit(0)
|
||||
|
||||
for epistola in [1]:
|
||||
|
||||
author_attribution = []
|
||||
print(f'Epistola {epistola}')
|
||||
print('='*80)
|
||||
path = f'../testi_{epistola}'
|
||||
|
||||
if epistola == 1:
|
||||
authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
|
||||
paragraphs = range(1,14)
|
||||
|
||||
else:
|
||||
authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna',
|
||||
'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
|
||||
'GrazioloBambaglioli', 'GuidoDaPisa',
|
||||
'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
|
||||
'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet',
|
||||
'PietroAlighieri', 'RaimundusLullus',
|
||||
'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
|
||||
paragraphs = range(14, 91)
|
||||
assert len(authors)==20, f'unexpected number of authors ({len(authors)})'
|
||||
|
||||
|
||||
discarded = 0
|
||||
f1_scores = []
|
||||
counters = []
|
||||
for i, author in enumerate(authors):
|
||||
print('=' * 80)
|
||||
print('Authorship Identification for {} (complete {}/{})'.format(author, i, len(authors)))
|
||||
print('Corpus of Epistola {}'.format(epistola))
|
||||
print('=' * 80)
|
||||
|
||||
target = [f'EpistolaXIII_{epistola}.txt'] + [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs]
|
||||
positive, negative, _, _, ep_texts = load_texts(path, positive_author=author, unknown_target=target)
|
||||
# if len(positive) < 2:
|
||||
# discarded += 1
|
||||
# continue
|
||||
|
||||
n_full_docs = len(positive) + len(negative)
|
||||
|
||||
feature_extractor = FeatureExtractor(function_words_freq='latin',
|
||||
conjugations_freq='latin',
|
||||
features_Mendenhall=True,
|
||||
features_sentenceLengths=True,
|
||||
tfidf_feat_selection_ratio=0.1,
|
||||
wordngrams=True, n_wordngrams=(1, 2),
|
||||
charngrams=True, n_charngrams=(3, 4, 5),
|
||||
preserve_punctuation=False,
|
||||
split_documents=True, split_policy=split_by_sentences, window_size=3,
|
||||
normalize_features=True)
|
||||
|
||||
Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
|
||||
|
||||
print('Fitting the Verificator')
|
||||
av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name=author)
|
||||
av.fit(Xtr, ytr, groups)
|
||||
|
||||
attributions=[]
|
||||
for i,target_text in enumerate(ep_texts):
|
||||
ep = feature_extractor.transform(target_text, avoid_splitting=True)
|
||||
prob,_ = av.predict_proba(ep, epistola_name=target[i])
|
||||
attributions.append(prob)
|
||||
author_attribution.append(attributions)
|
||||
|
||||
author_attribution = np.asarray(author_attribution)
|
||||
attribution_path = f'attribution_ep{epistola}.npy'
|
||||
print(f'saving attribution matrix of shape {author_attribution.shape} in {attribution_path}')
|
||||
np.save(attribution_path, author_attribution)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,151 +0,0 @@
|
|||
from mpl_toolkits.axes_grid1 import make_axes_locatable
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from data.dante_loader import load_texts
|
||||
from data.features import *
|
||||
from model import AuthorshipVerificator, f1_from_counters
|
||||
import numpy as np
|
||||
import matplotlib
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
def plot_attribution(path, authors, attributions, paragraph_offset=1, figsize=(5,5), label_offset=0.3):
|
||||
|
||||
attributions = attributions.T
|
||||
print(attributions.shape)
|
||||
# attributions=attributions>0.5
|
||||
paragraphs = ["Full"] + [f'{paragraph_offset+i}' for i in range(attributions.shape[0]-1)]
|
||||
|
||||
fig, ax = plt.subplots(figsize=figsize)
|
||||
|
||||
# im = ax.imshow(attributions, vmin=0, vmax=1, cmap='Greens')
|
||||
im = ax.imshow(attributions, vmin=0, vmax=1, cmap='Greys')
|
||||
|
||||
# Create colorbar
|
||||
# cbar = fig.colorbar(im, ax=ax, orientation="horizontal", fraction=0.1, pad=0.04)
|
||||
# ax.figure.colorbar(im, ax=ax, orientation="horizontal", fraction=0.1)
|
||||
# ax.figure.colorbar(im, ax=ax, orientation="horizontal", pad=0.05)
|
||||
|
||||
# cbar.ax.set_ylabel(cbarlabel, rotation=-90, va="bottom")
|
||||
|
||||
# We want to show all ticks...
|
||||
# ax.set_xticks(np.arange(len(authors)))
|
||||
ax.set_xticks(np.arange(len(authors) + 0) + label_offset)
|
||||
ax.set_yticks(np.arange(len(paragraphs)))
|
||||
# ... and label them with the respective list entries
|
||||
ax.set_xticklabels(authors)
|
||||
ax.set_yticklabels(paragraphs)
|
||||
|
||||
ax.tick_params(top=False, bottom=False,
|
||||
labeltop=True, labelbottom=False)
|
||||
|
||||
# Rotate the tick labels and set their alignment.
|
||||
plt.setp(ax.get_xticklabels(), rotation=90, ha="left", rotation_mode="anchor")
|
||||
|
||||
for edge, spine in ax.spines.items():
|
||||
spine.set_visible(False)
|
||||
|
||||
ax.set_xticks(np.arange(len(authors)+1) - .5, minor=True)
|
||||
ax.set_yticks(np.arange(len(paragraphs)+1) - .5, minor=True)
|
||||
|
||||
ax.grid(which="minor", color="k", linestyle='-', linewidth=1)
|
||||
ax.tick_params(which="minor", bottom=False, left=False)
|
||||
|
||||
# Loop over data dimensions and create text annotations.
|
||||
# for i in range(len(authors)):
|
||||
# for j in range(len(paragraphs)):
|
||||
# text = ax.text(j, i, f'{attributions[i, j]:.2f}', ha="center", va="center", color="w")
|
||||
|
||||
# ax.set_title("Attribution matrix")
|
||||
fig.tight_layout()
|
||||
# plt.show()
|
||||
plt.savefig(path)
|
||||
|
||||
import sys
|
||||
authors1 = ['ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
|
||||
authors2 = ['BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna',
|
||||
'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
|
||||
'GrazioloBambaglioli', 'GuidoDaPisa',
|
||||
'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
|
||||
'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet',
|
||||
'PietroAlighieri', 'RaimundusLullus',
|
||||
'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
|
||||
authors3 = sorted(np.unique(authors1 + authors2).tolist())
|
||||
|
||||
for epistola in [1]:
|
||||
paragraph_offset = 1
|
||||
label_offset = 0.2
|
||||
if epistola == 1:
|
||||
authors = ['Dante'] + authors1
|
||||
figsize = (4, 4)
|
||||
elif epistola == 2:
|
||||
authors = ['Dante'] + authors2
|
||||
figsize = (6, 4)
|
||||
else:
|
||||
authors = ['Dante'] + authors3
|
||||
|
||||
attributions = np.load(f'attribution_ep{epistola}_xiv.npy')
|
||||
plot_attribution(f'plot{epistola}_xiv.png', authors, attributions, paragraph_offset=paragraph_offset, figsize=figsize, label_offset=label_offset)
|
||||
sys.exit(0)
|
||||
|
||||
for epistola in [1]:
|
||||
|
||||
author_attribution = []
|
||||
print(f'Epistola {epistola}')
|
||||
print('='*80)
|
||||
path = f'../testiXIV_{epistola}'
|
||||
|
||||
|
||||
if epistola == 1:
|
||||
authors = ['Dante'] + authors1
|
||||
elif epistola == 2:
|
||||
authors = ['Dante'] + authors2
|
||||
else:
|
||||
authors = ['Dante'] + authors3
|
||||
|
||||
discarded = 0
|
||||
f1_scores = []
|
||||
counters = []
|
||||
for i, author in enumerate(authors):
|
||||
print('=' * 80)
|
||||
print('Authorship Identification for {} (complete {}/{})'.format(author, i, len(authors)))
|
||||
print('Corpus of Epistola {}'.format(epistola))
|
||||
print('=' * 80)
|
||||
|
||||
target = [f'Epistola_ArigoVII.txt'] + [f'Epistola_ArigoVII_{paragraph}.txt' for paragraph in range(1,6)]
|
||||
positive, negative, _, _, ep_texts = load_texts(path, positive_author=author, unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
|
||||
|
||||
n_full_docs = len(positive) + len(negative)
|
||||
|
||||
feature_extractor = FeatureExtractor(function_words_freq='latin',
|
||||
conjugations_freq='latin',
|
||||
features_Mendenhall=True,
|
||||
features_sentenceLengths=True,
|
||||
tfidf_feat_selection_ratio=0.1,
|
||||
wordngrams=True, n_wordngrams=(1, 2),
|
||||
charngrams=True, n_charngrams=(3, 4, 5),
|
||||
preserve_punctuation=False,
|
||||
split_documents=True, split_policy=split_by_sentences, window_size=3,
|
||||
normalize_features=True)
|
||||
|
||||
Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
|
||||
|
||||
print('Fitting the Verificator')
|
||||
av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name=author)
|
||||
av.fit(Xtr, ytr, groups)
|
||||
|
||||
attributions=[]
|
||||
for i,target_text in enumerate(ep_texts):
|
||||
ep = feature_extractor.transform(target_text, avoid_splitting=True)
|
||||
prob,_ = av.predict_proba(ep, epistola_name=target[i])
|
||||
attributions.append(prob)
|
||||
author_attribution.append(attributions)
|
||||
|
||||
author_attribution = np.asarray(author_attribution)
|
||||
attribution_path = f'attribution_ep{epistola}_xiv.npy'
|
||||
print(f'saving attribution matrix of shape {author_attribution.shape} in {attribution_path}')
|
||||
np.save(attribution_path, author_attribution)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,42 +1,38 @@
|
|||
from sklearn.linear_model import LogisticRegression
|
||||
from data.dante_loader import load_texts
|
||||
from data.dante_loader import load_latin_corpus, list_authors
|
||||
from data.features import *
|
||||
from model import AuthorshipVerificator, f1_from_counters
|
||||
from model import AuthorshipVerificator
|
||||
from util.evaluation import f1_from_counters
|
||||
import argparse
|
||||
|
||||
AUTHORS_CORPUS_I = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
|
||||
AUTHORS_CORPUS_II = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna', 'ClaraAssisiensis',
|
||||
'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio', 'GrazioloBambaglioli', 'GuidoDaPisa',
|
||||
'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia', 'IohannesDePlanoCarpini',
|
||||
'IulianusDeSpira', 'NicolaTrevet', 'PierDellaVigna', 'PietroAlighieri', 'RaimundusLullus',
|
||||
'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
|
||||
|
||||
for epistola in [1]:
|
||||
if epistola==1:
|
||||
authors = ['Dante','ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba','PierDellaVigna']
|
||||
else:
|
||||
authors = ['Dante', 'BeneFlorentinus','BenvenutoDaImola', 'BoncompagnoDaSigna', 'ClaraAssisiensis',
|
||||
'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
|
||||
'GrazioloBambaglioli', 'GuidoDaPisa',
|
||||
'GuidoDeColumnis', 'GuidoFaba','IacobusDeVaragine','IohannesDeAppia',
|
||||
'IohannesDePlanoCarpini','IulianusDeSpira', 'NicolaTrevet', 'PierDellaVigna',
|
||||
'PietroAlighieri', 'RaimundusLullus',
|
||||
'RyccardusDeSanctoGermano','ZonoDeMagnalis']
|
||||
|
||||
def main():
|
||||
discarded = 0
|
||||
f1_scores = []
|
||||
counters = []
|
||||
for i,author in enumerate(authors):
|
||||
for i, author in enumerate(args.authors):
|
||||
path = args.corpuspath
|
||||
print('='*80)
|
||||
print('Authorship Identification for {} (complete {}/{})'.format(author, i, len(authors)))
|
||||
print('Corpus of Epistola {}'.format(epistola))
|
||||
print(f'Authorship Identification for {author} (complete {i}/{len(args.authors)})')
|
||||
print(f'Corpus {path}')
|
||||
print('='*80)
|
||||
path = '../testi_{}'.format(epistola)
|
||||
if epistola==2:
|
||||
path+='_interaEpistola'
|
||||
|
||||
positive, negative, pos_files, neg_files, ep_text = load_texts(
|
||||
path, positive_author=author, unknown_target='EpistolaXIII_{}.txt'.format(epistola)
|
||||
positive, negative, pos_files, neg_files, ep_text = load_latin_corpus(
|
||||
path, positive_author=author, unknown_target=args.unknown
|
||||
)
|
||||
files = np.asarray(pos_files + neg_files)
|
||||
if len(positive) < 2:
|
||||
discarded+=1
|
||||
discarded += 1
|
||||
continue
|
||||
|
||||
n_full_docs = len(positive) + len(negative)
|
||||
print(f'read {n_full_docs} documents from {path}')
|
||||
|
||||
feature_extractor = FeatureExtractor(
|
||||
function_words_freq='latin',
|
||||
|
|
@ -53,33 +49,63 @@ for epistola in [1]:
|
|||
normalize_features=True
|
||||
)
|
||||
|
||||
|
||||
Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative)
|
||||
print(ytr)
|
||||
|
||||
ep, ep_fragments = feature_extractor.transform(ep_text, return_fragments=True, window_size=3)
|
||||
|
||||
print('Fitting the Verificator')
|
||||
Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
|
||||
av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
|
||||
av.fit(Xtr,ytr,groups)
|
||||
if args.unknown:
|
||||
print(f'Checking for the hypothesis that {author} was the author of {args.unknown}')
|
||||
ep, ep_fragments = feature_extractor.transform(ep_text, return_fragments=True, window_size=3)
|
||||
|
||||
score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(
|
||||
Xtr, ytr, files, groups, test_lowest_index_only=True, counters=True
|
||||
)
|
||||
f1_scores.append(f1_from_counters(tp, fp, fn, tn))
|
||||
counters.append((tp, fp, fn, tn))
|
||||
print('F1 for {author} = {f1_scores[-1]:.3f}')
|
||||
print('Fitting the Verificator')
|
||||
av.fit(Xtr, ytr, groups)
|
||||
av.predict_proba(ep, args.unknown)
|
||||
|
||||
if args.loo:
|
||||
print('Validating the Verificator (Leave-One-Out)')
|
||||
score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(
|
||||
Xtr, ytr, files, groups, test_lowest_index_only=True, counters=True
|
||||
)
|
||||
f1_scores.append(f1_from_counters(tp, fp, fn, tn))
|
||||
counters.append((tp, fp, fn, tn))
|
||||
print(f'F1 for {author} = {f1_scores[-1]:.3f}')
|
||||
|
||||
if args.loo:
|
||||
print(f'Computing macro- and micro-averages (discarded {discarded}/{len(args.authors)})')
|
||||
f1_scores = np.array(f1_scores)
|
||||
counters = np.array(counters)
|
||||
|
||||
macro_f1 = f1_scores.mean()
|
||||
micro_f1 = f1_from_counters(*counters.sum(axis=0).tolist())
|
||||
|
||||
print(f'Macro-F1 = {macro_f1:.3f}')
|
||||
print(f'Micro-F1 = {micro_f1:.3f}')
|
||||
print()
|
||||
|
||||
|
||||
print(f'Computing macro- and micro-averages (discarded {discarded}/{len(authors)})')
|
||||
f1_scores = np.array(f1_scores)
|
||||
counters = np.array(counters)
|
||||
if __name__ == '__main__':
|
||||
import os
|
||||
|
||||
macro_f1 = f1_scores.mean()
|
||||
micro_f1 = f1_from_counters(*counters.sum(axis=0).tolist())
|
||||
# Training settings
|
||||
parser = argparse.ArgumentParser(description='Authorship verification for Epistola XIII')
|
||||
parser.add_argument('corpuspath', type=str, metavar='PATH',
|
||||
help=f'Path to the directory containing the corpus (documents must be named <author>_<texname>.txt')
|
||||
parser.add_argument('positive', type=str, default="Dante",
|
||||
help= f'Positive author for the hypothesis (default "Dante"); set to "ALL" to check every author')
|
||||
parser.add_argument('--loo', default=False, action='store_true',
|
||||
help='submit each binary classifier to leave-one-out validation')
|
||||
parser.add_argument('--unknown', type=str, default=None,
|
||||
help='path to the file of unknown paternity (default None)')
|
||||
|
||||
print(f'Macro-F1 = {macro_f1:.3f}')
|
||||
print(f'Micro-F1 = {micro_f1:.3f}')
|
||||
print()
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.positive == 'ALL':
|
||||
args.authors = list_authors(args.corpuspath, skip_prefix='Epistola')
|
||||
else:
|
||||
if (args.positive not in AUTHORS_CORPUS_I) and (args.positive in AUTHORS_CORPUS_II):
|
||||
print(f'warning: author {args.positive} is not in the known list of authors for CORPUS I nor CORPUS II')
|
||||
assert args.positive in list_authors(args.corpuspath, skip_prefix='Epistola'), 'unexpected author'
|
||||
args.authors = [args.positive]
|
||||
|
||||
assert args.unknown or args.loo, 'error: nor an unknown document, nor LOO have been requested. Nothing to do.'
|
||||
assert args.unknown is None or os.path.exists(args.unknown), 'unknown file does not exist'
|
||||
|
||||
main()
|
||||
|
|
@ -1,89 +0,0 @@
|
|||
from sklearn.linear_model import LogisticRegression
|
||||
from data.dante_loader import load_texts
|
||||
from data.features import *
|
||||
from model import AuthorshipVerificator, f1_from_counters
|
||||
from sklearn.svm import LinearSVC, SVC
|
||||
from util.color_visualization import color
|
||||
import pickle
|
||||
import os
|
||||
|
||||
# DONE: ngrams should contain punctuation marks according to Sapkota et al. [39] in the PAN 2015 overview
|
||||
# (More recently, it was shown that character
|
||||
# n-grams corresponding to word affixes and including punctuation marks are the most
|
||||
# significant features in cross-topic authorship attribution [57].) #we have cancelled the
|
||||
# TODO: inspect the impact of chi-squared correlations against positive-only (or positive and negative) correlations for feature selection
|
||||
# TODO: sentence length (Mendenhall-style) ?
|
||||
from src.data.features import FeatureExtractor
|
||||
|
||||
for epistola in [1,2]:
|
||||
|
||||
print('Epistola {}'.format(epistola))
|
||||
print('='*80)
|
||||
path = '../testi_{}'.format(epistola)
|
||||
if epistola==1:
|
||||
paragraphs = range(1, 14)
|
||||
if epistola==2:
|
||||
paragraphs = range(14, 91)
|
||||
|
||||
target = [f'EpistolaXIII_{epistola}.txt'] + [f'EpistolaXIII_{epistola}_new.txt'] + [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs]
|
||||
positive, negative, _, _, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target)
|
||||
|
||||
pickle_file = f'../dante_color/epistola{epistola}.pkl'
|
||||
if os.path.exists(pickle_file):
|
||||
print(f'loading pickle file {pickle_file}')
|
||||
probabilities = pickle.load(open(pickle_file, 'rb'))
|
||||
for prob,text in zip(probabilities,ep_texts):
|
||||
text = text.replace('\n','')
|
||||
print(f"{prob:.3f}:{text}")
|
||||
print(f'media={np.asarray(probabilities[1:]).mean()}')
|
||||
else:
|
||||
print(f'generating pickle file')
|
||||
n_full_docs = len(positive) + len(negative)
|
||||
|
||||
feature_extractor = FeatureExtractor(function_words_freq='latin',
|
||||
conjugations_freq='latin',
|
||||
features_Mendenhall=True,
|
||||
features_sentenceLengths=True,
|
||||
tfidf_feat_selection_ratio=0.1,
|
||||
wordngrams=True, n_wordngrams=(1, 2),
|
||||
charngrams=True, n_charngrams=(3, 4, 5),
|
||||
preserve_punctuation=False,
|
||||
split_documents=True, split_policy=split_by_sentences, window_size=3,
|
||||
normalize_features=True)
|
||||
|
||||
|
||||
Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative)
|
||||
print(ytr)
|
||||
|
||||
print('Fitting the Verificator')
|
||||
av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name='Dante')
|
||||
av.fit(Xtr,ytr,groups)
|
||||
|
||||
probabilities = []
|
||||
for i, target_text in enumerate(ep_texts):
|
||||
ep = feature_extractor.transform(target_text, avoid_splitting=True)
|
||||
prob, _ = av.predict_proba(ep, epistola_name=target[i])
|
||||
probabilities.append(prob)
|
||||
|
||||
pickle.dump(probabilities, open(pickle_file, 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
color(path=f'../dante_color/epistola{epistola}.html', texts=ep_texts, probabilities=probabilities, title=f'Epistola {("I" if epistola==1 else "II")}', paragraph_offset=paragraphs[0])
|
||||
|
||||
|
||||
# print('Predicting the Epistola {}'.format(epistola))
|
||||
# title = 'Epistola {}'.format('I' if epistola==1 else 'II')
|
||||
# av.predict(ep, title)
|
||||
# fulldoc_prob, fragment_probs = av.predict_proba(ep, title)
|
||||
# color(path='../dante_color/epistola{}.html'.format(epistola), texts=ep_fragments, probabilities=fragment_probs, title=title)
|
||||
|
||||
# score_ave, score_std = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=False)
|
||||
# print('LOO[full-and-fragments]={:.3f} +-{:.5f}'.format(score_ave, score_std))
|
||||
|
||||
# score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=True, counters=True)
|
||||
# print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std))
|
||||
# f1_ = f1_from_counters(tp, fp, fn, tn)
|
||||
# print('F1 = {:.3f}'.format(f1_))
|
||||
|
||||
# score_ave, score_std = av.leave_one_out(Xtr, ytr, None)
|
||||
# print('LOO[w/o groups]={:.3f} +-{:.5f}'.format(score_ave, score_std))
|
||||
|
||||
|
|
@ -1,75 +0,0 @@
|
|||
from sklearn.linear_model import LogisticRegression
|
||||
from data.dante_loader import load_texts
|
||||
from data.features import *
|
||||
from model import AuthorshipVerificator, f1_from_counters
|
||||
from sklearn.svm import LinearSVC, SVC
|
||||
from util.color_visualization import color
|
||||
import pickle
|
||||
import os
|
||||
|
||||
for epistola in [1,2,3]: #3 means "both Ep1 and Ep2 corpora"
|
||||
|
||||
print('Epistola {}'.format(epistola))
|
||||
print('='*80)
|
||||
path = '../testiXIV_{}'.format(epistola)
|
||||
paragraphs = range(1, 6)
|
||||
|
||||
target = [f'Epistola_ArigoVII.txt'] + [f'Epistola_ArigoVII_{paragraph}.txt' for paragraph in paragraphs]
|
||||
positive, negative, _, _, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
|
||||
|
||||
pickle_file = f'../dante_color/epistola{epistola}_xiv.pkl'
|
||||
if os.path.exists(pickle_file):
|
||||
print(f'loading pickle file {pickle_file}')
|
||||
probabilities = pickle.load(open(pickle_file, 'rb'))
|
||||
else:
|
||||
print(f'generating pickle file')
|
||||
n_full_docs = len(positive) + len(negative)
|
||||
|
||||
feature_extractor = FeatureExtractor(function_words_freq='latin',
|
||||
conjugations_freq='latin',
|
||||
features_Mendenhall=True,
|
||||
features_sentenceLengths=True,
|
||||
tfidf_feat_selection_ratio=0.1,
|
||||
wordngrams=True, n_wordngrams=(1, 2),
|
||||
charngrams=True, n_charngrams=(3, 4, 5),
|
||||
preserve_punctuation=False,
|
||||
split_documents=True, split_policy=split_by_sentences, window_size=3,
|
||||
normalize_features=True)
|
||||
|
||||
Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative)
|
||||
print(ytr)
|
||||
|
||||
print('Fitting the Verificator')
|
||||
av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name='Dante')
|
||||
av.fit(Xtr,ytr,groups)
|
||||
|
||||
probabilities = []
|
||||
for i, target_text in enumerate(ep_texts):
|
||||
ep = feature_extractor.transform(target_text, avoid_splitting=True)
|
||||
prob, _ = av.predict_proba(ep, epistola_name=target[i])
|
||||
probabilities.append(prob)
|
||||
|
||||
pickle.dump(probabilities, open(pickle_file, 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
color(path=f'../dante_color/epistola{epistola}_xiv.html', texts=ep_texts,
|
||||
probabilities=probabilities, title=f'Epistola {epistola}',
|
||||
paragraph_offset=paragraphs[0])
|
||||
|
||||
|
||||
# print('Predicting the Epistola {}'.format(epistola))
|
||||
# title = 'Epistola {}'.format('I' if epistola==1 else 'II')
|
||||
# av.predict(ep, title)
|
||||
# fulldoc_prob, fragment_probs = av.predict_proba(ep, title)
|
||||
# color(path='../dante_color/epistola{}.html'.format(epistola), texts=ep_fragments, probabilities=fragment_probs, title=title)
|
||||
|
||||
# score_ave, score_std = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=False)
|
||||
# print('LOO[full-and-fragments]={:.3f} +-{:.5f}'.format(score_ave, score_std))
|
||||
|
||||
# score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=True, counters=True)
|
||||
# print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std))
|
||||
# f1_ = f1_from_counters(tp, fp, fn, tn)
|
||||
# print('F1 = {:.3f}'.format(f1_))
|
||||
|
||||
# score_ave, score_std = av.leave_one_out(Xtr, ytr, None)
|
||||
# print('LOO[w/o groups]={:.3f} +-{:.5f}'.format(score_ave, score_std))
|
||||
|
||||
|
|
@ -1,14 +1,11 @@
|
|||
import os
|
||||
from os.path import join
|
||||
import re
|
||||
import collections
|
||||
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------
|
||||
# document loading routine
|
||||
# ------------------------------------------------------------------------
|
||||
|
||||
def remove_pattern(doc, start_symbol, end_symbol, counter):
|
||||
assert counter[start_symbol] == counter[end_symbol], 'wrong number of {}{} found'.format(start_symbol,end_symbol)
|
||||
search = True
|
||||
|
|
@ -21,6 +18,7 @@ def remove_pattern(doc, start_symbol, end_symbol, counter):
|
|||
search = False
|
||||
return doc
|
||||
|
||||
|
||||
# removes citations in format:
|
||||
# *latino*
|
||||
# {volgare}
|
||||
|
|
@ -30,16 +28,30 @@ def remove_citations(doc):
|
|||
doc = remove_pattern(doc, start_symbol='{', end_symbol='}', counter=counter)
|
||||
return doc
|
||||
|
||||
def load_texts(path, positive_author='Dante', unknown_target=None, train_skip_prefix='EpistolaXIII_'):
|
||||
|
||||
def load_latin_corpus(path, positive_author='Dante', unknown_target=None, train_skip_prefix='Epistola'):
|
||||
"""
|
||||
Function used to load the Corpus I and Corpus II for authorship verification (and validation) of the Epistola XIII.
|
||||
The corpus is assumed to contain files named according to <author>_<text_name>.txt.
|
||||
:param path: the path containing the texts, each named as <author>_<text_name>.txt
|
||||
:param positive_author: the author that defines the positive class for verification
|
||||
:param unknown_target: if specified, is the path to the unknown document whose paternity is to be check (w.r.t.
|
||||
the positive_author)
|
||||
:param train_skip_prefix: specify a prefix for documents that should be skipped
|
||||
:return: a tuple containing the positive documents, negative documents, paths to positive documents, paths to
|
||||
negative documents, and the unknown document if that was specified (otherwise an empty list)
|
||||
"""
|
||||
# load the training data (all documents but Epistolas 1 and 2)
|
||||
positive, negative = [], []
|
||||
files_positive, files_negative = [], []
|
||||
authors = []
|
||||
|
||||
authors = []
|
||||
ndocs=0
|
||||
for file in os.listdir(path):
|
||||
if file.startswith(train_skip_prefix): continue
|
||||
file_clean = file.replace('.txt','')
|
||||
author, textname = file_clean.split('_')[0],file_clean.split('_')[1]
|
||||
if f'{path}/{file}' == unknown_target: continue
|
||||
file_name = file.replace('.txt','')
|
||||
author, textname = file_name.split('_')
|
||||
text = open(join(path,file), encoding= "utf8").read()
|
||||
text = remove_citations(text)
|
||||
|
||||
|
|
@ -50,40 +62,21 @@ def load_texts(path, positive_author='Dante', unknown_target=None, train_skip_pr
|
|||
negative.append(text)
|
||||
files_negative.append(file)
|
||||
authors.append(author)
|
||||
ndocs+=1
|
||||
ndocs += 1
|
||||
|
||||
# load the test data (Epistolas 1 and 2)
|
||||
# load the unknown document (if requested))
|
||||
if unknown_target:
|
||||
if isinstance(unknown_target, str):
|
||||
unknown_target = [unknown_target]
|
||||
unknowns = []
|
||||
for unknown_text in unknown_target:
|
||||
unknown = open(join(path, unknown_text), encoding="utf8").read()
|
||||
unknown = remove_citations(unknown)
|
||||
unknowns.append(unknown)
|
||||
if len(unknowns) == 1: unknowns = unknowns[0]
|
||||
return positive, negative, files_positive, files_negative, unknowns
|
||||
|
||||
unknown = open(unknown_target, encoding="utf8").read()
|
||||
unknown = [remove_citations(unknown)]
|
||||
else:
|
||||
return positive, negative, files_positive, files_negative
|
||||
|
||||
|
||||
def ___list_texts(path):
|
||||
authors = {}
|
||||
for file in os.listdir(path):
|
||||
if file.startswith('EpistolaXIII_'): continue
|
||||
file_clean = file.replace('.txt','')
|
||||
author, textname = file_clean.split('_')[0],file_clean.split('_')[1]
|
||||
if author not in authors:
|
||||
authors[author] = []
|
||||
authors[author].append(textname)
|
||||
|
||||
author_order = sorted(authors.keys())
|
||||
for author in author_order:
|
||||
print('{}:\t{}'.format(author,', '.join(authors[author])))
|
||||
|
||||
unknown = []
|
||||
return positive, negative, files_positive, files_negative, unknown
|
||||
|
||||
|
||||
def list_authors(path, skip_prefix, skip_authors=['Misc']):
|
||||
authors = [file.split('_')[0] for file in os.listdir(path) if not file.startswith(skip_prefix)]
|
||||
authors = [author for author in authors if author not in skip_authors]
|
||||
return sorted(set(authors))
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -39,64 +39,55 @@ latin_conjugations = ['o', 'eo', 'io', 'as', 'es', 'is', 'at', 'et', 'it', 'amus
|
|||
'sis', 'sit', 'simus', 'sitis', 'sint', 'essem', 'esses', 'esset', 'essemus', 'essetis', 'essent',
|
||||
'fui', 'fuisti', 'fuit', 'fuimus', 'fuistis', 'fuerunt', 'este', 'esto', 'estote', 'sunto']
|
||||
|
||||
spanish_conjugations = ['o','as','a','amos','áis','an','es','e','emos','éis','en','imos','ís','guir','ger','gir',
|
||||
'ar', 'er', 'ir', 'é', 'aste', 'ó','asteis','aron','í','iste','ió','isteis','ieron',
|
||||
'aba', 'abas', 'ábamos', 'aban', 'ía', 'ías', 'íamos', 'íais', 'ían', 'ás','á',
|
||||
'án','estoy','estás','está','estamos','estáis','están']
|
||||
|
||||
|
||||
def get_function_words(lang):
|
||||
if lang=='latin':
|
||||
if lang == 'latin':
|
||||
return latin_function_words
|
||||
elif lang in ['english','spanish']:
|
||||
return stopwords.words(lang)
|
||||
else:
|
||||
raise ValueError('{} not in scope!'.format(lang))
|
||||
|
||||
|
||||
def get_conjugations(lang):
|
||||
if lang == 'latin':
|
||||
return latin_conjugations
|
||||
elif lang == 'spanish':
|
||||
return spanish_conjugations
|
||||
else:
|
||||
raise ValueError('conjugations for languages other than Latin and Spanish are not yet supported')
|
||||
raise ValueError('conjugations for languages other than Latin are not yet supported')
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------
|
||||
# split policies
|
||||
# ------------------------------------------------------------------------
|
||||
# TODO: implement other split policies (e.g., overlapping ones, etc)
|
||||
def split_by_endline(text):
|
||||
return [t.strip() for t in text.split('\n') if t.strip()]
|
||||
|
||||
|
||||
def split_by_sentences(text):
|
||||
sentences = [t.strip() for t in nltk.tokenize.sent_tokenize(text) if t.strip()]
|
||||
#sentences= [t.strip() for t in re.split(r"\.|\?|\!\;", text) if t.strip()]
|
||||
|
||||
for i,sentence in enumerate(sentences):
|
||||
unmod_tokens = nltk.tokenize.word_tokenize(sentence)
|
||||
mod_tokens = ([token for token in unmod_tokens if any(char.isalpha() for char in token)])
|
||||
if len(mod_tokens)<8:
|
||||
if i<len(sentences)-1:
|
||||
if i < len(sentences)-1:
|
||||
sentences[i+1] = sentences[i] + ' ' + sentences[i+1]
|
||||
else:
|
||||
sentences[i-1] = sentences[i-1] + ' ' + sentences[i]
|
||||
sentences.pop(i)
|
||||
|
||||
return sentences
|
||||
|
||||
|
||||
def windows(text_fragments, window_size):
|
||||
new_fragments = []
|
||||
nbatches = len(text_fragments) // window_size
|
||||
if len(text_fragments) % window_size > 0:
|
||||
nbatches+=1
|
||||
# for i in range(len(text_fragments)-window_size+1):
|
||||
nbatches += 1
|
||||
for i in range(nbatches):
|
||||
offset = i*window_size
|
||||
new_fragments.append(' '.join(text_fragments[offset:offset+window_size]))
|
||||
return new_fragments
|
||||
|
||||
|
||||
def splitter(documents, authors=None, split_policy=split_by_sentences, window_size=1):
|
||||
fragments = []
|
||||
authors_fragments = []
|
||||
|
|
@ -117,7 +108,7 @@ def splitter(documents, authors=None, split_policy=split_by_sentences, window_si
|
|||
|
||||
def tokenize(text):
|
||||
unmod_tokens = nltk.word_tokenize(text)
|
||||
return ([token.lower() for token in unmod_tokens if any(char.isalpha() for char in token)])
|
||||
return [token.lower() for token in unmod_tokens if any(char.isalpha() for char in token)]
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------
|
||||
|
|
@ -125,7 +116,7 @@ def tokenize(text):
|
|||
# ------------------------------------------------------------------------
|
||||
def _features_function_words_freq(documents, lang):
|
||||
"""
|
||||
Extract features as the frequency (x1000) of the function words used in the documents
|
||||
Extract features as the frequency (L1x1000) of the function words used in the documents
|
||||
:param documents: a list where each element is the text (string) of a document
|
||||
:return: a np.array of shape (D,F) where D is len(documents) and F is len(function_words)
|
||||
"""
|
||||
|
|
@ -145,6 +136,12 @@ def _features_function_words_freq(documents, lang):
|
|||
|
||||
|
||||
def _features_conjugations_freq(documents, lang):
|
||||
"""
|
||||
Extract features as the frequency (L1x1000) of the conjugations used in the documents. The method is heuristic, and
|
||||
actually searches for suffixes contained in the conjugation list.
|
||||
:param documents: a list where each element is the text (string) of a document
|
||||
:return: a np.array of shape (D,F) where D is len(documents) and F is len(conjugations)
|
||||
"""
|
||||
features = []
|
||||
conjugations = get_conjugations(lang)
|
||||
|
||||
|
|
@ -152,7 +149,9 @@ def _features_conjugations_freq(documents, lang):
|
|||
mod_tokens = tokenize(text)
|
||||
conjugation_tokens = []
|
||||
for conjugation in conjugations:
|
||||
conjugation_tokens.extend([conjugation for token in mod_tokens if token.endswith(conjugation) and len(token)>len(conjugation)])
|
||||
conjugation_tokens.extend(
|
||||
[conjugation for token in mod_tokens if token.endswith(conjugation) and len(token) > len(conjugation)]
|
||||
)
|
||||
freqs = nltk.FreqDist(conjugation_tokens)
|
||||
nwords = len(mod_tokens)
|
||||
conjugation_freq = [1000. * freqs[conjugation] / nwords for conjugation in conjugations]
|
||||
|
|
@ -165,7 +164,7 @@ def _features_conjugations_freq(documents, lang):
|
|||
|
||||
def _features_Mendenhall(documents, upto=23):
|
||||
"""
|
||||
Extract features as the frequency (x1000) of the words' lengths used in the documents,
|
||||
Extract features as the frequency (L1x1000) of the words' lengths used in the documents,
|
||||
following the idea behind Mendenhall's Characteristic Curve of Composition
|
||||
:param documents: a list where each element is the text (string) of a document
|
||||
:return: a np.array of shape (D,F) where D is len(documents) and F is len(range of lengths considered)
|
||||
|
|
@ -213,7 +212,7 @@ def _features_sentenceLengths(documents, downto=3, upto=70):
|
|||
return np.array(features), f_names
|
||||
|
||||
|
||||
def _features_tfidf(documents, tfidf_vectorizer=None, min_df = 1, ngrams=(1,1)):
|
||||
def _features_tfidf(documents, tfidf_vectorizer=None, min_df=1, ngrams=(1, 1)):
|
||||
"""
|
||||
Extract features as tfidf matrix extracted from the documents
|
||||
:param documents: a list where each element is the text (string) of a document
|
||||
|
|
@ -229,14 +228,22 @@ def _features_tfidf(documents, tfidf_vectorizer=None, min_df = 1, ngrams=(1,1)):
|
|||
return features, tfidf_vectorizer
|
||||
|
||||
|
||||
# We have implemented ngrams extration generically, following Sapkota et al. (ref [39] in the PAN 2015 overview), i.e.,
|
||||
# containing punctuation marks. However, this does not apply to this study since punctuation marks are filtered-out in
|
||||
# editions of Latin texts.
|
||||
# More recently, it was shown that character n-grams corresponding to word affixes and including punctuation
|
||||
# marks are the most significant features in cross-topic authorship attribution [57].
|
||||
def _features_ngrams(documents, ns=[4, 5], ngrams_vectorizer=None, min_df = 10, preserve_punctuation=True):
|
||||
def _features_ngrams(documents, ns=[4, 5], ngrams_vectorizer=None, min_df=10, preserve_punctuation=True):
|
||||
"""
|
||||
Extract char-ngrams
|
||||
This implementation is generic, following Sapkota et al. (ref [39] in the PAN 2015 overview), i.e., containing
|
||||
punctuation marks. However, this does not apply to Latin texts in which punctuation marks are filtered-out. More
|
||||
recently, it was shown that character n-grams corresponding to word affixes and including punctuation marks are the
|
||||
most significant features in cross-topic authorship attribution [57].
|
||||
:param documents: a list where each element is the text (string) of a document
|
||||
:param ns: the lenghts (n) for which n-gram frequencies will be computed
|
||||
:param ngrams_vectorizer: the tfidf_vectorizer to use if already fit; if None, a new one will be instantiated and fit
|
||||
:param min_df: minumum number of occurrences needed for the ngram to be taken
|
||||
:param preserve_punctuation: whether or not to preserve punctuation marks
|
||||
:return: see _features_tfidf
|
||||
"""
|
||||
doc_ngrams = ngrams_extractor(documents, ns, preserve_punctuation)
|
||||
return _features_tfidf(doc_ngrams, tfidf_vectorizer=ngrams_vectorizer, min_df = min_df)
|
||||
return _features_tfidf(doc_ngrams, tfidf_vectorizer=ngrams_vectorizer, min_df=min_df)
|
||||
|
||||
|
||||
def ngrams_extractor(documents, ns=[4, 5], preserve_punctuation=True):
|
||||
|
|
@ -257,18 +264,29 @@ def ngrams_extractor(documents, ns=[4, 5], preserve_punctuation=True):
|
|||
|
||||
|
||||
def _feature_selection(X, y, tfidf_feat_selection_ratio):
|
||||
"""
|
||||
Filter-style feature selection based on Chi-squared as the term selection reduction function
|
||||
:param X: a document by (sparse) features matrix
|
||||
:param y: the supervised ndarray containing the class labels
|
||||
:param tfidf_feat_selection_ratio: a proportion of features to be taken
|
||||
:return: the reduced matrix and the feature selector fit
|
||||
"""
|
||||
nF = X.shape[1]
|
||||
num_feats = int(tfidf_feat_selection_ratio * nF)
|
||||
feature_selector = SelectKBest(chi2, k=num_feats)
|
||||
X = feature_selector.fit_transform(X, y)
|
||||
return X, feature_selector
|
||||
|
||||
|
||||
def _tocsr(X):
|
||||
""" Converts a dense matrix into a sparse one """
|
||||
return X if issparse(X) else csr_matrix(X)
|
||||
|
||||
|
||||
class FeatureExtractor:
|
||||
|
||||
"""
|
||||
A feature extractor for authorship analysis applications implemented as a transformer
|
||||
"""
|
||||
def __init__(self,
|
||||
function_words_freq=None,
|
||||
conjugations_freq=None,
|
||||
|
|
@ -281,21 +299,27 @@ class FeatureExtractor:
|
|||
n_charngrams=[4, 5],
|
||||
preserve_punctuation=True,
|
||||
split_documents=False,
|
||||
split_policy = split_by_endline,
|
||||
split_policy=split_by_endline,
|
||||
normalize_features=True,
|
||||
window_size = 5,
|
||||
window_size=5,
|
||||
verbose=True):
|
||||
"""
|
||||
Loads the documents contained in path applying a number of feature extraction policies. The directory is assumed to
|
||||
contain files named according to <author>_<text_name>.txt plus two special files EpistolaXIII_1.txt and
|
||||
EpistolaXIII_2.txt concerning the two documents whose authorship attribution is to be determined.
|
||||
:param path: the path containing the texts, each named as <author>_<text_name>.txt
|
||||
Applies stlystic feature extraction. Features include:
|
||||
:param function_words_freq: add the frequency of function words as features
|
||||
:param conjugations_freq: add the frequency of regular conjugations as features
|
||||
:param features_Mendenhall: add the frequencies of the words' lengths as features
|
||||
:param wordngrams: add the tfidf as features
|
||||
:param split_documents: whether to split text into smaller documents or not (currenty, the policy is to split by '\n').
|
||||
Currently, the fragments resulting from the split are added to the pool of documents (i.e., they do not replace the
|
||||
full documents, which are anyway retained).
|
||||
:param features_sentenceLengths: add the frequencies of the sentences' lengths as features
|
||||
:param wordngrams: add the words tfidf as features
|
||||
:param tfidf_feat_selection_ratio: if less than 1, indicates the ratio of most important features (according
|
||||
to chi-squared test) to be selected
|
||||
:param n_wordngrams: a tuple (min,max) indicating the range of lengths for word n-grams
|
||||
:param charngrams: add the char n-grams tfidf as features
|
||||
:param n_charngrams: a tuple (min,max) indicating the range of lengths for char n-grams
|
||||
:param preserve_punctuation: whether or not to preserver punctuation marks (should be deactivated for medieval
|
||||
Latin texts)
|
||||
:param split_documents: whether to split text into smaller documents or not (currently, the policy is to split by '\n').
|
||||
Currently, the fragments resulting from the split are added to the pool of documents (i.e., they do not replace
|
||||
the full documents, which are anyway retained).
|
||||
:param split_policy: a callable that implements the split to be applied (ignored if split_documents=False)
|
||||
:param window_size: the size of the window in case of sliding windows policy
|
||||
:param verbose: show information by stdout or not
|
||||
|
|
@ -321,18 +345,16 @@ class FeatureExtractor:
|
|||
self.verbose = verbose
|
||||
self.feature_names = None
|
||||
|
||||
|
||||
def fit_transform(self, positives, negatives):
|
||||
documents = positives + negatives
|
||||
authors = [1]*len(positives) + [0]*len(negatives)
|
||||
n_original_docs = len(documents)
|
||||
groups = list(range(n_original_docs))
|
||||
self.feature_names = []
|
||||
|
||||
if self.split_documents:
|
||||
doc_fragments, authors_fragments, groups_fragments = splitter(documents, authors,
|
||||
split_policy=self.split_policy,
|
||||
window_size=self.window_size)
|
||||
doc_fragments, authors_fragments, groups_fragments = splitter(
|
||||
documents, authors, split_policy=self.split_policy, window_size=self.window_size
|
||||
)
|
||||
documents.extend(doc_fragments)
|
||||
authors.extend(authors_fragments)
|
||||
groups.extend(groups_fragments)
|
||||
|
|
@ -342,261 +364,135 @@ class FeatureExtractor:
|
|||
y = np.array(authors)
|
||||
groups = np.array(groups)
|
||||
|
||||
# initialize the document-by-feature vector
|
||||
X = np.empty((len(documents), 0))
|
||||
X = self._transform(documents, y, fit=True)
|
||||
|
||||
# dense feature extraction functions
|
||||
if self.function_words_freq:
|
||||
F, f_names = _features_function_words_freq(documents, self.function_words_freq)
|
||||
X = self._addfeatures(X, F)
|
||||
self.feature_names.extend(f_names)
|
||||
self._print('adding function words features: {} features'.format(X.shape[1]))
|
||||
assert X.shape[1] == len(self.feature_names), f'wrong number of feature names, expected {X.shape[1]} found {len(self.feature_names)}'
|
||||
|
||||
if self.conjugations_freq:
|
||||
F, f_names = _features_conjugations_freq(documents, self.conjugations_freq)
|
||||
X = self._addfeatures(X, F)
|
||||
self.feature_names.extend(f_names)
|
||||
self._print('adding conjugation features: {} features'.format(X.shape[1]))
|
||||
assert X.shape[1] == len(self.feature_names), f'wrong number of feature names, expected {X.shape[1]} found {len(self.feature_names)}'
|
||||
|
||||
if self.features_Mendenhall:
|
||||
F, f_names = _features_Mendenhall(documents)
|
||||
X = self._addfeatures(X, F)
|
||||
self.feature_names.extend(f_names)
|
||||
self._print('adding Mendenhall words features: {} features'.format(X.shape[1]))
|
||||
assert X.shape[1] == len(self.feature_names), f'wrong number of feature names, expected {X.shape[1]} found {len(self.feature_names)}'
|
||||
|
||||
if self.features_sentenceLengths:
|
||||
F, f_names = _features_sentenceLengths(documents)
|
||||
X = self._addfeatures(X, F)
|
||||
self.feature_names.extend(f_names)
|
||||
self._print('adding sentence lengths features: {} features'.format(X.shape[1]))
|
||||
assert X.shape[1] == len(self.feature_names), f'wrong number of feature names, expected {X.shape[1]} found {len(self.feature_names)}'
|
||||
|
||||
# sparse feature extraction functions
|
||||
if self.tfidf:
|
||||
X_features, vectorizer = _features_tfidf(documents, ngrams=self.wordngrams)
|
||||
self.tfidf_vectorizer = vectorizer
|
||||
index2word = {i: w for w, i in vectorizer.vocabulary_.items()}
|
||||
f_names = [f'tfidf::{index2word[i]}' for i in range(len(index2word))]
|
||||
|
||||
if self.tfidf_feat_selection_ratio < 1.:
|
||||
if self.verbose: print('feature selection')
|
||||
X_features, feat_sel = _feature_selection(X_features, y, self.tfidf_feat_selection_ratio)
|
||||
self.feat_sel_tfidf = feat_sel
|
||||
f_names = [f_names[i] for i in feat_sel.get_support(indices=True)]
|
||||
|
||||
X = self._addfeatures(_tocsr(X), X_features)
|
||||
self.feature_names.extend(f_names)
|
||||
self._print('adding tfidf words features: {} features'.format(X.shape[1]))
|
||||
|
||||
assert X.shape[1] == len(self.feature_names), f'wrong number of feature names, expected {X.shape[1]} found {len(self.feature_names)}'
|
||||
if self.ngrams:
|
||||
X_features, vectorizer = _features_ngrams(documents, self.ns,
|
||||
preserve_punctuation=self.preserve_punctuation)
|
||||
self.ngrams_vectorizer = vectorizer
|
||||
index2word = {i: w for w, i in vectorizer.vocabulary_.items()}
|
||||
f_names = [f'ngram::{index2word[i]}' for i in range(len(index2word))]
|
||||
|
||||
if self.tfidf_feat_selection_ratio < 1.:
|
||||
if self.verbose: print('feature selection')
|
||||
X_features, feat_sel = _feature_selection(X_features, y, self.tfidf_feat_selection_ratio)
|
||||
self.feat_sel_ngrams = feat_sel
|
||||
f_names = [f_names[i] for i in feat_sel.get_support(indices=True)]
|
||||
|
||||
X = self._addfeatures(_tocsr(X), X_features)
|
||||
self.feature_names.extend(f_names)
|
||||
self._print('adding ngrams character features: {} features'.format(X.shape[1]))
|
||||
|
||||
self.feature_names = np.asarray(self.feature_names)
|
||||
|
||||
assert X.shape[1] == len(self.feature_names), f'wrong number of feature names, expected {X.shape[1]} found {len(self.feature_names)}'
|
||||
# print summary
|
||||
if self.verbose:
|
||||
print(
|
||||
'load_documents: function_words_freq={} features_Mendenhall={} tfidf={}, split_documents={}, split_policy={}'
|
||||
.format(self.function_words_freq, self.features_Mendenhall, self.tfidf, self.split_documents,
|
||||
self.split_policy.__name__))
|
||||
print('number of training (full) documents: {}'.format(n_original_docs))
|
||||
print('X shape (#documents,#features): {}'.format(X.shape))
|
||||
print('y prevalence: {}/{} {:.2f}%'.format(y.sum(),len(y),y.mean() * 100))
|
||||
f'load_documents: function_words_freq={self.function_words_freq} '
|
||||
f'features_Mendenhall={self.features_Mendenhall} tfidf={self.tfidf} '
|
||||
f'split_documents={self.split_documents}, split_policy={self.split_policy.__name__}'
|
||||
)
|
||||
print(f'number of training (full) documents: {n_original_docs}')
|
||||
print(f'y prevalence: {y.sum()}/{len(y)} {y.mean() * 100:.2f}%')
|
||||
print()
|
||||
|
||||
return X, y, groups
|
||||
|
||||
|
||||
def transform(self, test, return_fragments=False, window_size=-1, avoid_splitting=False):
|
||||
test = [test]
|
||||
if window_size==-1:
|
||||
if isinstance(test, str):
|
||||
test = [test]
|
||||
if window_size == -1:
|
||||
window_size = self.window_size
|
||||
|
||||
if self.split_documents and not avoid_splitting:
|
||||
tests, _ = splitter(test, split_policy=self.split_policy, window_size=window_size)
|
||||
test.extend(tests)
|
||||
|
||||
# initialize the document-by-feature vector
|
||||
TEST = np.empty((len(test), 0))
|
||||
|
||||
# dense feature extraction functions
|
||||
if self.function_words_freq:
|
||||
F,_=_features_function_words_freq(test, self.function_words_freq)
|
||||
TEST = self._addfeatures(TEST, F)
|
||||
self._print('adding function words features: {} features'.format(TEST.shape[1]))
|
||||
|
||||
if self.conjugations_freq:
|
||||
F,_=_features_conjugations_freq(test, self.conjugations_freq)
|
||||
TEST = self._addfeatures(TEST, F)
|
||||
self._print('adding conjugation features: {} features'.format(TEST.shape[1]))
|
||||
|
||||
if self.features_Mendenhall:
|
||||
F,_ = _features_Mendenhall(test)
|
||||
TEST = self._addfeatures(TEST, F)
|
||||
self._print('adding Mendenhall words features: {} features'.format(TEST.shape[1]))
|
||||
|
||||
if self.features_sentenceLengths:
|
||||
F, _ = _features_sentenceLengths(test)
|
||||
TEST = self._addfeatures(TEST, F)
|
||||
self._print('adding sentence lengths features: {} features'.format(TEST.shape[1]))
|
||||
|
||||
# sparse feature extraction functions
|
||||
if self.tfidf:
|
||||
ep1_features, _ = _features_tfidf(test, self.tfidf_vectorizer)
|
||||
|
||||
if self.tfidf_feat_selection_ratio < 1.:
|
||||
if self.verbose: print('feature selection')
|
||||
ep1_features = self.feat_sel_tfidf.transform(ep1_features)
|
||||
|
||||
TEST = self._addfeatures(_tocsr(TEST), ep1_features)
|
||||
self._print('adding tfidf words features: {} features'.format(TEST.shape[1]))
|
||||
|
||||
if self.ngrams:
|
||||
ep1_features, _ = _features_ngrams(test, self.ns, ngrams_vectorizer=self.ngrams_vectorizer,
|
||||
preserve_punctuation=self.preserve_punctuation)
|
||||
|
||||
if self.tfidf_feat_selection_ratio < 1.:
|
||||
if self.verbose: print('feature selection')
|
||||
ep1_features = self.feat_sel_ngrams.transform(ep1_features)
|
||||
|
||||
TEST = self._addfeatures(_tocsr(TEST), ep1_features)
|
||||
self._print('adding ngrams words features: {} features'.format(TEST.shape[1]))
|
||||
|
||||
# print summary
|
||||
if self.verbose:
|
||||
print(
|
||||
'load_documents: function_words_freq={} features_Mendenhall={} tfidf={}, split_documents={}, split_policy={}'
|
||||
.format(self.function_words_freq, self.features_Mendenhall, self.tfidf, self.split_documents,
|
||||
self.split_policy.__name__))
|
||||
print('test shape:', TEST.shape)
|
||||
print()
|
||||
old_verbose = self.verbose
|
||||
self.verbose = False
|
||||
TEST = self._transform(test, fit=False)
|
||||
self.verbose = old_verbose
|
||||
|
||||
if return_fragments:
|
||||
return TEST, test[1:]
|
||||
else:
|
||||
return TEST
|
||||
|
||||
|
||||
def _addfeatures(self, X, F):
|
||||
def _addfeatures(self, X, F, feat_names=None):
|
||||
if self.normalize_features:
|
||||
normalize(F, axis=1, copy=False)
|
||||
self._register_feature_names(feat_names)
|
||||
|
||||
if issparse(F):
|
||||
return hstack((X, F)) # sparse
|
||||
else:
|
||||
return np.hstack((X, F)) # dense
|
||||
|
||||
|
||||
def _print(self, msg):
|
||||
if self.verbose:
|
||||
print(msg)
|
||||
|
||||
def _register_feature_names(self, feat_names):
|
||||
""" keeps track of the feature names (for debugging and analysis) """
|
||||
if feat_names is None:
|
||||
return
|
||||
if self.feature_names is None:
|
||||
self.feature_names = []
|
||||
self.feature_names.extend(feat_names)
|
||||
|
||||
def _transform(self, documents, y=None, fit=False):
|
||||
# initialize the document-by-feature vector
|
||||
X = np.empty((len(documents), 0))
|
||||
|
||||
# dense feature extraction functions
|
||||
if self.function_words_freq:
|
||||
F, f_names = _features_function_words_freq(documents, self.function_words_freq)
|
||||
X = self._addfeatures(X, F, f_names if fit else None)
|
||||
self._print(f'adding function words features: {X.shape[1]} features')
|
||||
|
||||
if __name__=='__main__':
|
||||
from collections import Counter
|
||||
if self.conjugations_freq:
|
||||
F, f_names = _features_conjugations_freq(documents, self.conjugations_freq)
|
||||
X = self._addfeatures(X, F, f_names if fit else None)
|
||||
self._print(f'adding conjugation features: {X.shape[1]} features')
|
||||
|
||||
# text = 'Magnifico atque uictorioso domino, domino Cani Grandi de la Scala, sacratissimi cesarei principatus in urbe Uerona et ciuitate Uicentie uicario generali, deuotissimus suus Dantes Alagherii, Florentinus natione non moribus, uitam orat per tempora diuturna felicem, et gloriosi nominis perpetuum incrementum.'
|
||||
text = 'Magnifico atque uictorioso domino, domino Cani Grandi de la Scala, sacratissimi cesarei principatus in urbe Uerona et ciuitate Uicentie uicario generali, deuotissimus suus Dantes Alagherii, Florentinus natione non moribus, uitam orat per tempora diuturna felicem, et gloriosi nominis perpetuum incrementum. Inclita uestre magnificentie laus, quam fama uigil uolitando disseminat, sic distrahit in diuersa diuersos, ut hos in spem sue prosperitatis attollat, hos exterminii deiciat in terrorem. Huius quidem preconium, facta modernorum exsuperans, tanquam ueri existentia latius, arbitrabar aliquando superfluum. Uerum, ne diuturna me nimis incertitudo suspenderet, uelut Austri regina Ierusalem petiit, uelut Pallas petiit Elicona, Ueronam petii fidis oculis discussurus audita, ibique magnalia uestra uidi, uidi beneficia simul et tetigi; et quemadmodum prius dictorum ex parte suspicabar excessum, sic posterius ipsa facta excessiua cognoui. Quo factum est ut ex auditu solo cum quadam animi subiectione beniuolus prius exstiterim; sed ex uisu postmodum deuotissimus et amicus. Nec reor amici nomen assumens, ut nonnulli forsitan obiectarent, reatum presumptionis incurrere, cum non minus dispares connectantur quam pares amicitie sacramento. Nam si delectabiles et utiles amicitias inspicere libeat, illis persepius inspicienti patebit, preheminentes inferioribus coniugari personas. Et si ad ueram ac per se amicitiam torqueatur intuitus, nonne illustrium summorumque principum plerunque uiros fortuna obscuros, honestate preclaros, amicos fuisse constabit? Quidni, cum etiam Dei et hominis amicitia nequaquam impediatur excessu? Quod si cuiquam, quod asseritur, nunc uideretur indignum, Spiritum Sanctum audiat, amicitie sue participes quosdam homines profitentem. Nam in Sapientia de sapientia legitur, quoniam *infinitus thesaurus est hominibus, quo qui usi sunt, participes facti sunt amicitie Dei*. 
Sed habet imperitia uulgi sine discretione iudicium; et quemadmodum solem pedalis magnitudinis arbitratur, sic et circa mores uana credulitate decipitur. Nos autem, quibus optimum quod est in nobis noscere datum est, gregum uestigia sectari non decet, quin ymo suis erroribus obuiare tenemur. Nam intellectu ac ratione degentes, diuina quadam libertate dotati, nullis consuetudinibus astringuntur; nec mirum, cum non ipsi legibus, sed ipsis leges potius dirigantur. Liquet igitur, quod superius dixi, me scilicet esse deuotissimum et amicum, nullatenus esse presumptum. Preferens ergo amicitiam uestram quasi thesaurum carissimum, prouidentia diligenti et accurata solicitudine illam seruare desidero. Itaque, cum in dogmatibus moralis negotii amicitiam adequari et saluari analogo doceatur, ad retribuendum pro collatis beneficiis plus quam semel analogiam sequi michi uotiuum est; et propter hoc munuscula mea sepe multum conspexi et ab inuicem segregaui, nec non segregata percensui, dignius gratiusque uobis inquirens. Neque ipsi preheminentie uestre congruum magis comperi magis quam Comedie sublimem canticam, que decoratur titulo Paradisi; et illam sub presenti epistola, tanquam sub epigrammate proprio dedicatam, uobis ascribo, uobis offero, uobis denique recommendo. Illud quoque preterire silentio simpliciter inardescens non sinit affectus, quod in hac donatione plus dono quam domino et honoris et fame conferri potest uideri.Quidni cum eius titulum iam presagiam de gloria uestri nominis ampliandum? Satis actenus uidebar expressisse quod de proposito fuit; sed zelus gratie uestre, quam sitio quasi uitam paruipendens, a primordio metam prefixam urget ulterius. Itaque, formula consumata epistole, ad introductionem oblati operis aliquid sub lectoris officio compendiose aggrediar.'
|
||||
print(text)
|
||||
if self.features_Mendenhall:
|
||||
F, f_names = _features_Mendenhall(documents)
|
||||
X = self._addfeatures(X, F, f_names if fit else None)
|
||||
self._print(f'adding Mendenhall words features: {X.shape[1]} features')
|
||||
|
||||
# char n-grams
|
||||
w=3
|
||||
ngrams = [text[i:i+w].replace(' ', '_') for i in range(len(text)-w + 1)]
|
||||
print('ngrams')
|
||||
print(', '.join(ngrams))
|
||||
print(Counter(ngrams).most_common())
|
||||
if self.features_sentenceLengths:
|
||||
F, f_names = _features_sentenceLengths(documents)
|
||||
X = self._addfeatures(X, F, f_names if fit else None)
|
||||
self._print(f'adding sentence lengths features: {X.shape[1]} features')
|
||||
|
||||
# word n-grams
|
||||
w = 2
|
||||
words = text.split()
|
||||
wngrams = ['_'.join(words[i:i + w]).replace(',','') for i in range(len(words) - w + 1)]
|
||||
print('\nwngrams')
|
||||
print(', '.join(wngrams))
|
||||
print(Counter(wngrams).most_common())
|
||||
# sparse feature extraction functions
|
||||
if self.tfidf:
|
||||
if fit:
|
||||
X_features, self.tfidf_vectorizer = _features_tfidf(documents, ngrams=self.wordngrams)
|
||||
index2word = {i: w for w, i in self.tfidf_vectorizer.vocabulary_.items()}
|
||||
f_names = [f'tfidf::{index2word[i]}' for i in range(len(index2word))]
|
||||
else:
|
||||
X_features, _ = _features_tfidf(documents, self.tfidf_vectorizer)
|
||||
f_names = None
|
||||
|
||||
fn_words = [w if w not in latin_function_words else f"{w}(*)" for w in words]
|
||||
print('\nfunction words')
|
||||
print(' '.join(fn_words))
|
||||
if self.tfidf_feat_selection_ratio < 1.:
|
||||
if self.verbose: print('feature selection')
|
||||
if fit:
|
||||
X_features, self.feat_sel_tfidf = _feature_selection(X_features, y, self.tfidf_feat_selection_ratio)
|
||||
f_names = [f_names[i] for i in self.feat_sel_tfidf.get_support(indices=True)]
|
||||
else:
|
||||
X_features = self.feat_sel_tfidf.transform(X_features)
|
||||
X = self._addfeatures(_tocsr(X), X_features, f_names)
|
||||
self._print(f'adding tfidf words features: {X.shape[1]} features')
|
||||
|
||||
verbal_words = []
|
||||
for w in words:
|
||||
lcs = sorted(latin_conjugations, key=lambda x: -len(x))
|
||||
toadd = w
|
||||
for lc in lcs:
|
||||
if len(w) <= len(lc): continue
|
||||
if w.endswith(lc):
|
||||
toadd = w[:-len(lc)] + f'[{lc}]'
|
||||
break
|
||||
verbal_words.append(toadd)
|
||||
print('\nverbal endings')
|
||||
print(' '.join(verbal_words))
|
||||
if self.ngrams:
|
||||
if fit:
|
||||
X_features, self.ngrams_vectorizer = _features_ngrams(
|
||||
documents, self.ns, preserve_punctuation=self.preserve_punctuation
|
||||
)
|
||||
index2word = {i: w for w, i in self.ngrams_vectorizer.vocabulary_.items()}
|
||||
f_names = [f'ngram::{index2word[i]}' for i in range(len(index2word))]
|
||||
else:
|
||||
X_features, _ = _features_ngrams(
|
||||
documents, self.ns, ngrams_vectorizer=self.ngrams_vectorizer,
|
||||
preserve_punctuation=self.preserve_punctuation
|
||||
)
|
||||
f_names = None
|
||||
|
||||
print('\nword lengths')
|
||||
counter = Counter([len(w.replace(',','')) for w in words])
|
||||
total = len(words)
|
||||
x,y=[],[]
|
||||
cum_req = 0
|
||||
print(f'words length\tcount\tfrequency\tcumulative')
|
||||
for i in range(1,24):
|
||||
x.append(i)
|
||||
c = counter[i]
|
||||
freq = c / total
|
||||
cum_req += freq
|
||||
y.append(cum_req)
|
||||
if c > 0:
|
||||
print(f'{i}\t{c}\t{freq:.2f}\t{cum_req:.2f}')
|
||||
if self.tfidf_feat_selection_ratio < 1.:
|
||||
if self.verbose: print('feature selection')
|
||||
if fit:
|
||||
X_features, self.feat_sel_ngrams = _feature_selection(X_features, y, self.tfidf_feat_selection_ratio)
|
||||
f_names = [f_names[i] for i in self.feat_sel_ngrams.get_support(indices=True)]
|
||||
else:
|
||||
X_features = self.feat_sel_ngrams.transform(X_features)
|
||||
|
||||
# import matplotlib.pyplot as plt
|
||||
# import seaborn as sns
|
||||
# plt.plot(x, y, 'o-')
|
||||
# plt.xlabel('word length')
|
||||
# plt.ylabel('cumulative frequency')
|
||||
# plt.title('')
|
||||
# plt.grid()
|
||||
# plt.show()
|
||||
X = self._addfeatures(_tocsr(X), X_features, f_names)
|
||||
self._print(f'adding ngrams character features: {X.shape[1]} features')
|
||||
|
||||
if fit:
|
||||
self.feature_names = np.asarray(self.feature_names)
|
||||
|
||||
print('\nsentence length')
|
||||
sentences = split_by_sentences(text)
|
||||
counter = Counter([len(s.split()) for s in sentences])
|
||||
total = len(sentences)
|
||||
cum_req = 0
|
||||
print(f'words length\tcount\tfrequency\tcumulative')
|
||||
dots=True
|
||||
rows=0
|
||||
for i in range(1,70):
|
||||
x.append(i)
|
||||
c = counter[i]
|
||||
freq = c / total
|
||||
cum_req += freq
|
||||
if c > 0:
|
||||
print(f'{i}\t{c}\t{freq:.3f}\t{cum_req:.2f}')
|
||||
dots=True
|
||||
rows+=1
|
||||
else:
|
||||
if dots:
|
||||
print(f'...\t...\t...\t...')
|
||||
dots=False
|
||||
print(counter)
|
||||
print('rows',rows)
|
||||
self._print(f'X shape (#documents,#features): {X.shape}')
|
||||
|
||||
return X
|
||||
|
|
|
|||
|
|
@ -1,51 +0,0 @@
|
|||
import itertools
|
||||
import os
|
||||
from os.path import join, isdir
|
||||
|
||||
PATH_PAN2015 = '../pan2015'
|
||||
PAN2015_TRAIN = 'pan15-authorship-verification-training-dataset-2015-04-19'
|
||||
PAN2015_TEST = 'pan15-authorship-verification-test-dataset2-2015-04-19'
|
||||
|
||||
class Pan2015:
    """Bundle of a PAN-2015 verification request and its ground truth.

    problem  : dict mapping problem id -> {'known': [texts], 'unknown': text}
    solution : dict mapping problem id -> 1 (same author) / 0 (different author)
    """

    def __init__(self, problem, solution):
        self.problem = problem
        self.solution = solution

    def __repr__(self):
        # Debug-friendly summary; deliberately avoids dumping full document texts.
        return f'{type(self).__name__}(problems={len(self.problem)}, solutions={len(self.solution)})'
|
||||
|
||||
def fetch_PAN2015(corpus, lang, base_path=PATH_PAN2015):
    """Load one PAN-2015 authorship-verification corpus from disk.

    corpus    : 'train' or 'test' — selects which dataset directory to read.
    lang      : substring matched against each language directory's name.
    base_path : root folder containing the PAN-2015 dataset directories.

    Returns a Pan2015 whose .problem maps problem id ->
    {'known': [texts], 'unknown': text} and whose .solution maps
    problem id -> 1 if truth.txt says 'Y' else 0.
    """
    assert corpus in ['train', 'test'], 'unexpected corpus request'

    corpus_path = join(base_path, PAN2015_TRAIN if corpus == 'train' else PAN2015_TEST)
    print(corpus_path)

    def _read(path):
        # Context manager so file handles are closed deterministically
        # (the original left every open() unclosed).
        with open(path, 'rt') as f:
            return f.read()

    request = {}
    truth = {}
    for entry in os.listdir(corpus_path):  # avoid shadowing builtin `dir`
        dir_path = join(corpus_path, entry)
        if not (isdir(dir_path) and lang in entry):
            continue
        # truth.txt lines look like "<problem-id> <Y|N>"
        pairs = [x.split() for x in _read(join(dir_path, 'truth.txt')).splitlines()]
        truth = {problem: 1 if decision == 'Y' else 0 for problem, decision in pairs}
        for problem_name in os.listdir(dir_path):
            problem_dir = join(dir_path, problem_name)
            if not isdir(problem_dir):
                continue
            request[problem_name] = {'known': []}
            for doc_name in os.listdir(problem_dir):
                doc_path = join(problem_dir, doc_name)
                if doc_name == 'unknown.txt':
                    request[problem_name]['unknown'] = _read(doc_path)
                else:
                    request[problem_name]['known'].append(_read(doc_path))

    return Pan2015(request, truth)
|
||||
|
||||
def TaskGenerator(request_dict):
    """Yield one verification task per PAN problem, in sorted id order.

    Each task is a 5-tuple (problem_id, known_texts,
    known_texts_of_every_other_problem, unknown_text, truth_label).
    """
    problems = request_dict.problem
    ordered = sorted(problems)
    for target in ordered:
        entry = problems[target]
        # Negative examples: the known documents of all *other* problems.
        negatives = []
        for other in ordered:
            if other != target:
                negatives.extend(problems[other]['known'])
        yield target, entry['known'], negatives, entry['unknown'], request_dict.solution[target]
|
||||
|
||||
|
||||
|
||||
48
src/model.py
48
src/model.py
|
|
@ -1,44 +1,15 @@
|
|||
from util import disable_sklearn_warnings
|
||||
from sklearn.metrics import f1_score
|
||||
from sklearn.metrics import make_scorer
|
||||
from sklearn.model_selection import GridSearchCV, LeaveOneOut, LeaveOneGroupOut, cross_val_score, GroupKFold, KFold, \
|
||||
StratifiedKFold
|
||||
from sklearn.model_selection import GridSearchCV, LeaveOneOut, LeaveOneGroupOut, cross_val_score, StratifiedKFold
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.svm import *
|
||||
from data.features import *
|
||||
|
||||
class RandomVerificator:
    """Trivial baseline: learns nothing and scores every test at random."""

    def __init__(self):
        pass

    def fit(self, positives, negatives):
        # Training-free baseline — both argument sets are ignored.
        pass

    def predict(self, test):
        # Uniform random score in [0, 1); the input is deliberately unused.
        return np.random.rand()
|
||||
|
||||
def get_counters(true_labels, predicted_labels):
    """Return the confusion-matrix counts (tp, fp, fn, tn).

    Assumes both arguments are binary (0/1) numpy arrays of equal length;
    the boolean masking below would fail on plain lists — TODO confirm
    callers always pass arrays.
    """
    assert len(true_labels) == len(predicted_labels), "Format not consistent between true and predicted labels."
    nd = len(true_labels)
    # Predicted positives among actual positives / actual negatives.
    tp = np.sum(predicted_labels[true_labels == 1])
    fp = np.sum(predicted_labels[true_labels == 0])
    # Actual positives the classifier labelled negative.
    fn = np.sum(true_labels[predicted_labels == 0])
    # Everything not counted above must be a true negative.
    tn = nd - (tp+fp+fn)
    return tp,fp,fn,tn
|
||||
|
||||
def f1_from_counters(tp,fp,fn,tn):
    """Compute the F1 measure directly from confusion-matrix counts."""
    num = 2.0 * tp
    den = 2.0 * tp + fp + fn
    if den > 0: return num / den
    # we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative
    return 1.0
|
||||
|
||||
def f1(true_labels, predicted_labels):
    """F1 of binary predictions; see f1_from_counters for the den==0 convention."""
    tp, fp, fn, tn = get_counters(true_labels,predicted_labels)
    return f1_from_counters(tp, fp, fn, tn )
|
||||
from util.evaluation import f1, get_counters
|
||||
|
||||
|
||||
class AuthorshipVerificator:
|
||||
|
||||
def __init__(self, nfolds=10,
|
||||
params = {'C': np.logspace(-4,+4,9), 'class_weight':['balanced',None]},
|
||||
params={'C': np.logspace(-4, +4, 9), 'class_weight': ['balanced', None]},
|
||||
estimator=SVC,
|
||||
author_name=None):
|
||||
self.nfolds = nfolds
|
||||
|
|
@ -68,11 +39,8 @@ class AuthorshipVerificator:
|
|||
self.estimator.fit(X, y)
|
||||
|
||||
if isinstance(self.estimator, GridSearchCV):
|
||||
print('Best params: {}'.format(self.estimator.best_params_))
|
||||
print('computing the cross-val score')
|
||||
f1scores = self.estimator.best_score_
|
||||
f1_mean, f1_std = f1scores.mean(), f1scores.std()
|
||||
print('F1-measure={:.3f} (+-{:.3f} cv={})\n'.format(f1_mean, f1_std, f1scores))
|
||||
f1_mean = self.estimator.best_score_.mean()
|
||||
print(f'Best params: {self.estimator.best_params_} (cross-validation F1={f1_mean:.3f})')
|
||||
self.estimator = self.estimator.best_estimator_
|
||||
|
||||
return self
|
||||
|
|
@ -81,11 +49,11 @@ class AuthorshipVerificator:
|
|||
|
||||
if groups is None:
|
||||
print('Computing LOO without groups')
|
||||
folds = list(LeaveOneOut().split(X,y))
|
||||
folds = list(LeaveOneOut().split(X, y))
|
||||
else:
|
||||
print('Computing LOO with groups')
|
||||
logo = LeaveOneGroupOut()
|
||||
folds=list(logo.split(X,y,groups))
|
||||
folds = list(logo.split(X, y, groups))
|
||||
if test_lowest_index_only:
|
||||
print('ignoring fragments')
|
||||
folds = [(train, np.min(test, keepdims=True)) for train, test in folds]
|
||||
|
|
@ -116,7 +84,7 @@ class AuthorshipVerificator:
|
|||
return full_doc_prediction, None
|
||||
|
||||
def predict_proba(self, test, epistola_name=''):
|
||||
assert self.probability, 'svm is not calibrated'
|
||||
assert hasattr(self, 'predict_proba'), 'the classifier is not calibrated'
|
||||
pred = self.estimator.predict_proba(test)
|
||||
full_doc_prediction = pred[0,1]
|
||||
print(f'{epistola_name} is from {self.author_name} with Probability {full_doc_prediction:.3f}')
|
||||
|
|
|
|||
|
|
@ -1,78 +0,0 @@
|
|||
from joblib import Parallel
|
||||
from joblib import delayed
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from util import disable_sklearn_warnings
|
||||
from sklearn.svm import LinearSVC, SVC
|
||||
from data.features import FeatureExtractor
|
||||
from data.pan2015 import fetch_PAN2015, TaskGenerator
|
||||
from model import AuthorshipVerificator
|
||||
import numpy as np
|
||||
from sklearn.metrics import f1_score, roc_auc_score
|
||||
|
||||
def evaluation(y_pred, y_prob, y_true):
    """Print accuracy/F1/AUC over a batch of verification decisions and
    return the PAN-2015 score (accuracy * AUC)."""
    predictions = np.array(y_pred)
    probabilities = np.array(y_prob)
    labels = np.array(y_true)

    acc = (predictions == labels).mean()
    macro_f1 = f1_score(labels, predictions)
    auc = roc_auc_score(labels, probabilities)
    pan_eval = acc * auc

    for line in ('Accuracy = {:.3f}'.format(acc),
                 'F1 = {:.3f}'.format(macro_f1),
                 'AUC = {:.3f}'.format(auc),
                 'Acc*AUC = {:.3f}'.format(pan_eval)):
        print(line)
    print('true:', y_true)
    print('pred:', y_pred)

    return pan_eval
|
||||
|
||||
|
||||
def doall(problem, pos, neg, test, truth):
    """Run one PAN-2015 verification task end to end.

    Extracts features from pos/neg training texts, fits a verificator,
    and scores the unknown `test` document.
    NOTE(review): reads the module-global `lang` set in __main__ — confirm.
    Returns (problem, probability, prediction, truth).
    """
    print('[Start]{}'.format(problem))
    extractor = FeatureExtractor(function_words_freq=lang,
                                 conjugations_freq=lang,
                                 features_Mendenhall=True,
                                 wordngrams=False, tfidf_feat_selection_ratio=0.1,
                                 charngrams=True, n_charngrams=[3, 4, 5],
                                 split_documents=False,
                                 normalize_features=True,
                                 verbose=True)

    # method = AuthorshipVerificator(nfolds=3, estimator=LogisticRegression)
    verifier = AuthorshipVerificator(nfolds=3, estimator=LinearSVC)

    X, y = extractor.fit_transform(pos, neg)
    unknown = extractor.transform(test)

    verifier.fit(X, y)
    prediction = verifier.predict(unknown)
    # Fall back to the hard prediction when the model is not calibrated.
    probability = verifier.predict_proba(unknown) if verifier.probability else prediction

    print('[End]{}'.format(problem))
    return problem, probability, prediction, truth
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Configuration: which PAN-2015 split and language directory to evaluate.
    split = 'train'
    lang = 'spanish'
    request = fetch_PAN2015(split, lang=lang)

    with open('results_ngrams.csv', 'wt') as fo:
        # One parallel job per verification problem; each job returns
        # (problem, probability, prediction, truth).
        outcomes = Parallel(n_jobs=-1)(delayed(doall)(problem,pos,neg,test,truth) for problem,pos,neg,test,truth in TaskGenerator(request))
        y_pred, y_prob, y_true = [], [], []
        for problem, probability, prediction, truth in outcomes:
            fo.write('{} {}\n'.format(problem, probability))
            y_pred.append(prediction)
            y_prob.append(probability)
            y_true.append(truth)
        # Aggregate PAN score: accuracy * AUC across all problems.
        acc_auc = evaluation(y_pred, y_prob, y_true)
        print('ACC * AUC = {:.3f}'.format(acc_auc))

    print('done')
|
||||
|
|
@ -1,33 +0,0 @@
|
|||
from matplotlib.cm import get_cmap
|
||||
|
||||
def color_tag(index, text, probability, cmap):
    """Render one paragraph as an HTML anchor whose background colour
    encodes its attribution probability (scaled by 0.6)."""
    # Damp the probability so the darkest end of the colormap is never used.
    damped = probability * 0.6
    r, g, b, _alpha = cmap(damped)
    red, green, blue = (f'{channel * 255:.0f}' for channel in (r, g, b))
    return f'<b> P{index}:</b> <a style="background-color:rgb({red},{green},{blue});">{text} </a>'
|
||||
|
||||
def color(path, texts, probabilities, title, paragraph_offset=1):
    """Write an HTML page where each paragraph's grey-scale background
    encodes its attribution probability.

    path             : output .html file.
    texts            : sequence of paragraph strings.
    probabilities    : parallel sequence of probabilities in [0, 1].
    title            : page title and heading.
    paragraph_offset : number assigned to the first paragraph.
    """
    cmap = get_cmap('Greys')

    # The generated page declares <meta charset="UTF-8">, so write the file
    # as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(path, 'wt', encoding='utf-8') as fo:
        fo.write("""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>{}</title>
</head>
<body>
<h1>{}</h1>
""".format(title, title))
        for i, (line, probability) in enumerate(zip(texts, probabilities)):
            fo.write(color_tag(paragraph_offset + i, line, probability, cmap))
        fo.write("""
</body>
</html>
""")
|
||||
|
||||
|
|
@ -1,35 +0,0 @@
|
|||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
# Have colormaps separated into categories:
# http://matplotlib.org/examples/color/colormaps_reference.html
# Only one colormap is inspected here; it is listed twice so the subplot
# grid below has at least two rows (axes must be an array).
cmaps = [('Diverging', ['RdYlGn','RdYlGn']),]

# Number of subplot rows = longest colormap list of any category.
nrows = max(len(cmap_list) for cmap_category, cmap_list in cmaps)
# 2 x 256 horizontal gradient restricted to [0.25, 0.75] of the colormap range.
gradient = np.linspace(0.25, 0.75, 256)
gradient = np.vstack((gradient, gradient))
|
||||
|
||||
|
||||
def plot_color_gradients(cmap_category, cmap_list, nrows):
    """Render one horizontal gradient strip per colormap in `cmap_list`.

    Reads the module-level `gradient` array; each strip is labelled with
    its colormap name to the left of the axes.
    """
    fig, axes = plt.subplots(nrows=nrows)
    fig.subplots_adjust(top=0.95, bottom=0.01, left=0.2, right=0.99)
    axes[0].set_title(cmap_category + ' colormaps', fontsize=14)

    for ax, name in zip(axes, cmap_list):
        ax.imshow(gradient, aspect='auto', cmap=plt.get_cmap(name))
        # Place the colormap name just left of the strip, vertically centred.
        pos = list(ax.get_position().bounds)
        x_text = pos[0] - 0.01
        y_text = pos[1] + pos[3]/2.
        fig.text(x_text, y_text, name, va='center', ha='right', fontsize=10)

    # Turn off *all* ticks & spines, not just the ones with colormaps.
    for ax in axes:
        ax.set_axis_off()
|
||||
|
||||
|
||||
# Draw every category's gradient strips, then show the figure window.
for cmap_category, cmap_list in cmaps:
    plot_color_gradients(cmap_category, cmap_list, nrows)

plt.show()
|
||||
|
|
@ -1,3 +0,0 @@
|
|||
def warn(*args, **kwargs): pass  # no-op stand-in for warnings.warn
import warnings
# Globally silence all warnings by monkey-patching warnings.warn.
warnings.warn = warn
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
import numpy as np
|
||||
|
||||
|
||||
def get_counters(true_labels, predicted_labels):
    """Return confusion-matrix counts (tp, fp, fn, tn).

    Both arguments are expected to be binary (0/1) numpy arrays of equal
    length (boolean masking is used) — TODO confirm callers never pass lists.
    """
    assert len(true_labels) == len(predicted_labels), "Format not consistent between true and predicted labels."
    n_docs = len(true_labels)
    # Predicted positives among the actual positives / actual negatives.
    tp = np.sum(predicted_labels[true_labels == 1])
    fp = np.sum(predicted_labels[true_labels == 0])
    # Actual positives that the classifier labelled negative.
    fn = np.sum(true_labels[predicted_labels == 0])
    # Whatever remains must be a true negative.
    tn = n_docs - (tp + fp + fn)
    return tp, fp, fn, tn
|
||||
|
||||
|
||||
def f1_from_counters(tp, fp, fn, tn):
    """F1 from confusion counts; defined as 1.0 when the denominator is zero.

    With tp == fp == fn == 0 the classifier labelled every instance negative
    and was right every time, hence the perfect score by convention.
    """
    denominator = 2.0 * tp + fp + fn
    if denominator > 0:
        return 2.0 * tp / denominator
    return 1.0
|
||||
|
||||
|
||||
def f1(true_labels, predicted_labels):
    """F1 score of binary predictions (see f1_from_counters for edge cases)."""
    counters = get_counters(true_labels, predicted_labels)
    return f1_from_counters(*counters)
|
||||
|
|
@ -1,59 +0,0 @@
|
|||
from sklearn.linear_model import LogisticRegression
|
||||
from data.dante_loader import load_texts
|
||||
from data.features import *
|
||||
from model import AuthorshipVerificator, f1_from_counters
|
||||
import numpy as np
|
||||
import matplotlib
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
for epistola in [2]:

    # NOTE(review): author_attribution is never used below — dead variable?
    author_attribution = []
    print(f'Epistola {epistola}')
    print('='*80)
    # Corpus directory; Epistola 2 uses the extended "_tutti" collection.
    path = f'../testi_{epistola}'
    if epistola==2: path+='_tutti'

    author = 'Dante'
    print('=' * 80)
    print('Corpus of Epistola {}'.format(epistola))
    print('=' * 80)

    positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=f'EpistolaXIII_{epistola}.txt')

    n_full_docs = len(positive) + len(negative)

    # Latin stylometric features; documents split into 3-sentence windows.
    feature_extractor = FeatureExtractor(function_words_freq='latin',
                                         conjugations_freq='latin',
                                         features_Mendenhall=True,
                                         features_sentenceLengths=True,
                                         tfidf_feat_selection_ratio=0.1,
                                         wordngrams=True, n_wordngrams=(1, 2),
                                         charngrams=True, n_charngrams=(3, 4, 5),
                                         preserve_punctuation=False,
                                         split_documents=True, split_policy=split_by_sentences, window_size=3,
                                         normalize_features=True)

    Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)

    print('Fitting the Verificator')
    av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name=author)
    av.fit(Xtr, ytr, groups)

    # Rank features by the fitted logistic-regression coefficients:
    # most positive = most indicative of Dante, most negative = least.
    feat_rank = np.argsort(av.estimator.coef_[0])
    coef_ordered = av.estimator.coef_[0][feat_rank]
    feat_name_ordered = feature_extractor.feature_names[feat_rank]

    print('Most Dantesque features::')
    for i in range(100):
        print(f'{i}: {feat_name_ordered[::-1][i]} {coef_ordered[::-1][i]:.3f}')

    print('\nMost Non-Dantesque features::')
    for i in range(100):
        print(f'{i}: {feat_name_ordered[i]} {coef_ordered[i]:.3f}')


print('done')
|
||||
|
||||
|
||||
|
||||
Loading…
Reference in New Issue