# dante-verification/src/author_identification_loo.py
# Executable Python script.

#import util._hide_sklearn_warnings
from data.dante_loader import load_latin_corpus, list_authors
from data.features import *
from model import AuthorshipVerificator, RangeFeatureSelector, leave_one_out
from util.evaluation import f1_from_counters
import argparse
from sklearn.pipeline import Pipeline
# Author names known for the first corpus.
AUTHORS_CORPUS_I = [
    'Dante',
    'ClaraAssisiensis',
    'GiovanniBoccaccio',
    'GuidoFaba',
    'PierDellaVigna',
]

# Author names known for the second corpus.
AUTHORS_CORPUS_II = [
    'Dante',
    'BeneFlorentinus',
    'BenvenutoDaImola',
    'BoncompagnoDaSigna',
    'ClaraAssisiensis',
    'FilippoVillani',
    'GiovanniBoccaccio',
    'GiovanniDelVirgilio',
    'GrazioloBambaglioli',
    'GuidoDaPisa',
    'GuidoDeColumnis',
    'GuidoFaba',
    'IacobusDeVaragine',
    'IohannesDeAppia',
    'IohannesDePlanoCarpini',
    'IulianusDeSpira',
    'NicolaTrevet',
    'PierDellaVigna',
    'PietroAlighieri',
    'RaimundusLullus',
    'RyccardusDeSanctoGermano',
    'ZonoDeMagnalis',
]

# When true, main() keeps only 5% of the extracted features, searches a
# two-value grid for C, and prints a 'DEBUG_MODE ON' reminder at the end.
# NOTE(review): looks like this should be False for real experiments -- confirm.
DEBUG_MODE = True
def main():
    """Run leave-one-out authorship verification for each author in ``args.authors``.

    Reads the module-level ``args`` namespace:

    * ``args.corpuspath`` -- corpus location passed to ``load_latin_corpus``;
    * ``args.authors``    -- names to use, one at a time, as the positive author;
    * ``args.log``        -- path of the results file; when ``None`` it falls
      back to ``./results_<last component of corpuspath>``.

    For every author with at least two positive documents, features are
    extracted, a verifier pipeline is validated with ``leave_one_out`` and the
    resulting F1 is printed and logged. Macro- and micro-averaged F1 over all
    evaluated authors are reported at the end.
    """
    from pathlib import Path  # local import: only needed for the default log name

    corpus_path = args.corpuspath
    authors = args.authors

    # open(None) raises TypeError, so resolve the default documented in --log's help.
    log_path = args.log
    if log_path is None:
        log_path = f'./results_{Path(corpus_path).name}'

    discarded = 0
    f1_scores = []
    counters = []

    # The context manager guarantees the log is closed even if an author fails.
    with open(log_path, 'wt') as log:
        for i, author in enumerate(authors):
            print('=' * 80)
            print(f'Authorship Identification for {author} (complete {i}/{len(authors)})')
            print(f'Corpus {corpus_path}')
            print('-' * 80)

            positive, negative, pos_files, neg_files, _ = load_latin_corpus(
                corpus_path, positive_author=author
            )

            if len(positive) < 2:
                discarded += 1
                print(f'discarding analysis for {author} which has only {len(positive)} documents')
                continue

            files = np.asarray(pos_files + neg_files)
            n_full_docs = len(positive) + len(negative)
            print(f'read {n_full_docs} documents from {corpus_path}')

            feature_extractor = FeatureExtractor(
                function_words_freq='latin',
                conjugations_freq='latin',
                features_Mendenhall=True,
                features_sentenceLengths=True,
                feature_selection_ratio=0.05 if DEBUG_MODE else 1,
                wordngrams=True, n_wordngrams=(1, 2),
                charngrams=True, n_charngrams=(3, 4, 5),
                preserve_punctuation=False,
                split_documents=True,
                split_policy=split_by_sentences,
                window_size=3,
                normalize_features=True,
            )
            Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)

            print('Fitting the Verificator')
            if DEBUG_MODE:
                params = {'C': np.logspace(0, 1, 2)}
            else:
                # Same values as logspace(-3, 3, 7), listed starting from C=1.
                params = {'C': [1, 10, 100, 1000, 0.1, 0.01, 0.001]}

            # Order the char-n-gram and word-n-gram column ranges by start index.
            slice_charngrams = feature_extractor.feature_range['_cngrams_task']
            slice_wordngrams = feature_extractor.feature_range['_wngrams_task']
            if slice_charngrams.start < slice_wordngrams.start:
                slice_first, slice_second = slice_charngrams, slice_wordngrams
            else:
                slice_first, slice_second = slice_wordngrams, slice_charngrams

            # The selector for the later range runs first -- presumably so that
            # dropping its columns does not shift the earlier range; verify
            # against RangeFeatureSelector. The step names are fixed labels and
            # do not necessarily match the kind of n-grams each step selects.
            av = Pipeline([
                ('featsel_cngrams', RangeFeatureSelector(slice_second, 0.05)),
                ('featsel_wngrams', RangeFeatureSelector(slice_first, 0.05)),
                ('av', AuthorshipVerificator(C=1, param_grid=params)),
            ])

            print('Validating the Verificator (Leave-One-Out)')
            _, _, tp, fp, fn, tn = leave_one_out(
                av, Xtr, ytr, files, groups, test_lowest_index_only=True, counters=True
            )

            f1_scores.append(f1_from_counters(tp, fp, fn, tn))
            counters.append((tp, fp, fn, tn))
            tee(f'F1 for {author} = {f1_scores[-1]:.3f}', log)
            print(f'TP={tp} FP={fp} FN={fn} TN={tn}')

        print(f'Computing macro- and micro-averages (discarded {discarded}/{len(authors)})')
        if f1_scores:
            macro_f1 = np.array(f1_scores).mean()
            # Micro-F1 is computed from the contingency counts summed over authors.
            micro_f1 = f1_from_counters(*np.array(counters).sum(axis=0).tolist())
            tee(f'LOO Macro-F1 = {macro_f1:.3f}', log)
            tee(f'LOO Micro-F1 = {micro_f1:.3f}', log)
        else:
            # Every author was discarded: averaging an empty set would give
            # NaN and the summed counters could not be unpacked.
            tee('No author could be evaluated; macro- and micro-F1 not computed', log)
        print()

    if DEBUG_MODE:
        print('DEBUG_MODE ON')
def tee(msg, log):
    """Print ``msg`` to stdout and append it, newline-terminated, to ``log``.

    ``log`` is any object with ``write`` and ``flush`` methods; it is flushed
    after every message.
    """
    print(msg)
    line = '{}\n'.format(msg)
    log.write(line)
    log.flush()
if __name__ == '__main__':
    # Script entry point: parse the command line, validate it, resolve the
    # list of authors to evaluate, and hand over to main() through the
    # module-level ``args`` namespace.
    import os
    from pathlib import Path

    # Training settings
    parser = argparse.ArgumentParser(description='Authorship verification for MedLatin '
                                                 'submit each binary classifier to leave-one-out validation')
    parser.add_argument('corpuspath', type=str, metavar='CORPUSPATH',
                        help=f'Path to the directory containing the corpus (documents must be named '
                             f'<author>_<texname>.txt)')
    # nargs='?' makes the documented default effective; without it the
    # positional is mandatory and default="Dante" can never apply.
    parser.add_argument('positive', type=str, nargs='?', default="Dante", metavar='AUTHOR',
                        help= f'Positive author for the hypothesis (default "Dante"); set to "ALL" to check '
                              f'every author')
    parser.add_argument('--log', type=str, metavar='PATH', default=None,
                        help='path to the log file where to write the results '
                             '(if not specified, then ./results_{corpuspath.name})')
    args = parser.parse_args()

    # Validate the corpus path before anything tries to list its authors.
    # parser.error is used instead of assert because asserts vanish under -O.
    if not os.path.exists(args.corpuspath):
        parser.error(f'corpus path {args.corpuspath} does not exist')

    # Apply the default promised by the --log help text.
    if args.log is None:
        args.log = f'./results_{Path(args.corpuspath).name}'

    if args.positive == 'ALL':
        args.authors = list_authors(args.corpuspath, skip_prefix='Epistola')
    else:
        # Warn only when the author is in neither known list, as the message says.
        if (args.positive not in AUTHORS_CORPUS_I) and (args.positive not in AUTHORS_CORPUS_II):
            print(f'warning: author {args.positive} is not in the known list of authors for CORPUS I nor CORPUS II')
        if args.positive not in list_authors(args.corpuspath, skip_prefix='Epistola'):
            parser.error(f'unexpected author {args.positive!r} (not found in {args.corpuspath})')
        args.authors = [args.positive]

    main()