From ce5d4ab84322780a49e67559b9e1231db6ef84f5 Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Wed, 1 Apr 2020 19:06:39 +0200 Subject: [PATCH] bugfix --- src/author_identification.py | 20 ++++++++++++-------- src/util/epistole_split.py | 25 ------------------------- 2 files changed, 12 insertions(+), 33 deletions(-) delete mode 100755 src/util/epistole_split.py diff --git a/src/author_identification.py b/src/author_identification.py index 5764988..30fb6e4 100755 --- a/src/author_identification.py +++ b/src/author_identification.py @@ -12,6 +12,7 @@ AUTHORS_CORPUS_II = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'Boncompagn 'IulianusDeSpira', 'NicolaTrevet', 'PierDellaVigna', 'PietroAlighieri', 'RaimundusLullus', 'RyccardusDeSanctoGermano', 'ZonoDeMagnalis'] + def main(): discarded = 0 f1_scores = [] @@ -21,7 +22,7 @@ def main(): print('='*80) print(f'Authorship Identification for {author} (complete {i}/{len(args.authors)})') print(f'Corpus {path}') - print('='*80) + print('-'*80) positive, negative, pos_files, neg_files, ep_text = load_latin_corpus( path, positive_author=author, unknown_target=args.unknown @@ -50,13 +51,14 @@ def main(): ) Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative) + + print('Fitting the Verificator') av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression) + av.fit(Xtr, ytr, groups) + if args.unknown: print(f'Checking for the hypothesis that {author} was the author of {args.unknown}') ep, ep_fragments = feature_extractor.transform(ep_text, return_fragments=True, window_size=3) - - print('Fitting the Verificator') - av.fit(Xtr, ytr, groups) av.predict_proba(ep, args.unknown) if args.loo: @@ -87,12 +89,14 @@ if __name__ == '__main__': # Training settings parser = argparse.ArgumentParser(description='Authorship verification for Epistola XIII') parser.add_argument('corpuspath', type=str, metavar='PATH', - help=f'Path to the directory containing the corpus (documents must be named _.txt') + help=f'Path to the directory containing the corpus (documents must be named ' + f'_.txt') parser.add_argument('positive', type=str, default="Dante", - help= f'Positive author for the hypothesis (default "Dante"); set to "ALL" to check every author') + help= f'Positive author for the hypothesis (default "Dante"); set to "ALL" to check ' + f'every author') parser.add_argument('--loo', default=False, action='store_true', help='submit each binary classifier to leave-one-out validation') - parser.add_argument('--unknown', type=str, default=None, + parser.add_argument('--unknown', type=str, metavar='PATH', default=None, help='path to the file of unknown paternity (default None)') args = parser.parse_args() @@ -108,4 +112,4 @@ if __name__ == '__main__': assert args.unknown or args.loo, 'error: nor an unknown document, nor LOO have been requested. Nothing to do.' assert args.unknown is None or os.path.exists(args.unknown), 'unknown file does not exist' - main() \ No newline at end of file + main() diff --git a/src/util/epistole_split.py b/src/util/epistole_split.py deleted file mode 100755 index 5f458e5..0000000 --- a/src/util/epistole_split.py +++ /dev/null @@ -1,25 +0,0 @@ -import os - -dir = '../../testi_1' -author = 'Misc' -file=author+'_Epistole.txt' - - - -order = 0 -epistola=[] -for line in open(os.path.join(dir,file), 'rt').readlines(): - line = line.strip() - if line: - epistola.append(line) - else: - epistola = '\n'.join(epistola) - open(os.path.join(dir,'{}_epistola{}.txt'.format(author,order)), 'wt').write(epistola) - order += 1 - epistola = [] - -if epistola: - epistola = '\n'.join(epistola) - open(os.path.join(dir, '{}_epistola{}.txt'.format(author,order)), 'wt').write(epistola) - -