This commit is contained in:
Alejandro Moreo Fernandez 2020-04-01 19:06:39 +02:00
parent d36e4ce9da
commit ce5d4ab843
2 changed files with 12 additions and 33 deletions

View File

@ -12,6 +12,7 @@ AUTHORS_CORPUS_II = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'Boncompagn
'IulianusDeSpira', 'NicolaTrevet', 'PierDellaVigna', 'PietroAlighieri', 'RaimundusLullus', 'IulianusDeSpira', 'NicolaTrevet', 'PierDellaVigna', 'PietroAlighieri', 'RaimundusLullus',
'RyccardusDeSanctoGermano', 'ZonoDeMagnalis'] 'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
def main(): def main():
discarded = 0 discarded = 0
f1_scores = [] f1_scores = []
@ -21,7 +22,7 @@ def main():
print('='*80) print('='*80)
print(f'Authorship Identification for {author} (complete {i}/{len(args.authors)})') print(f'Authorship Identification for {author} (complete {i}/{len(args.authors)})')
print(f'Corpus {path}') print(f'Corpus {path}')
print('='*80) print('-'*80)
positive, negative, pos_files, neg_files, ep_text = load_latin_corpus( positive, negative, pos_files, neg_files, ep_text = load_latin_corpus(
path, positive_author=author, unknown_target=args.unknown path, positive_author=author, unknown_target=args.unknown
@ -50,13 +51,14 @@ def main():
) )
Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative) Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
print('Fitting the Verificator')
av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression) av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
av.fit(Xtr, ytr, groups)
if args.unknown: if args.unknown:
print(f'Checking for the hypothesis that {author} was the author of {args.unknown}') print(f'Checking for the hypothesis that {author} was the author of {args.unknown}')
ep, ep_fragments = feature_extractor.transform(ep_text, return_fragments=True, window_size=3) ep, ep_fragments = feature_extractor.transform(ep_text, return_fragments=True, window_size=3)
print('Fitting the Verificator')
av.fit(Xtr, ytr, groups)
av.predict_proba(ep, args.unknown) av.predict_proba(ep, args.unknown)
if args.loo: if args.loo:
@ -87,12 +89,14 @@ if __name__ == '__main__':
# Training settings # Training settings
parser = argparse.ArgumentParser(description='Authorship verification for Epistola XIII') parser = argparse.ArgumentParser(description='Authorship verification for Epistola XIII')
parser.add_argument('corpuspath', type=str, metavar='PATH', parser.add_argument('corpuspath', type=str, metavar='PATH',
help=f'Path to the directory containing the corpus (documents must be named <author>_<texname>.txt') help=f'Path to the directory containing the corpus (documents must be named '
f'<author>_<texname>.txt')
parser.add_argument('positive', type=str, default="Dante", parser.add_argument('positive', type=str, default="Dante",
help= f'Positive author for the hypothesis (default "Dante"); set to "ALL" to check every author') help= f'Positive author for the hypothesis (default "Dante"); set to "ALL" to check '
f'every author')
parser.add_argument('--loo', default=False, action='store_true', parser.add_argument('--loo', default=False, action='store_true',
help='submit each binary classifier to leave-one-out validation') help='submit each binary classifier to leave-one-out validation')
parser.add_argument('--unknown', type=str, default=None, parser.add_argument('--unknown', type=str, metavar='PATH', default=None,
help='path to the file of unknown paternity (default None)') help='path to the file of unknown paternity (default None)')
args = parser.parse_args() args = parser.parse_args()
@ -108,4 +112,4 @@ if __name__ == '__main__':
assert args.unknown or args.loo, 'error: nor an unknown document, nor LOO have been requested. Nothing to do.' assert args.unknown or args.loo, 'error: nor an unknown document, nor LOO have been requested. Nothing to do.'
assert args.unknown is None or os.path.exists(args.unknown), 'unknown file does not exist' assert args.unknown is None or os.path.exists(args.unknown), 'unknown file does not exist'
main() main()

View File

@ -1,25 +0,0 @@
import os
# Split the single '<author>_Epistole.txt' file into one file per letter.
#
# The input is read line by line; every line is stripped of surrounding
# whitespace. Non-empty lines are accumulated into the current letter, and a
# blank line terminates it: the accumulated lines are joined with '\n' and
# written to '<author>_epistola<order>.txt' in the same directory, after which
# the counter is incremented and the buffer is reset.
#
# NOTE(review): every blank line triggers a write, even when nothing has been
# accumulated, so consecutive (or leading) blank lines produce empty output
# files and consume an index. This is preserved from the original behavior --
# confirm whether it is intended.

# Module-level names are kept as in the original script ('dir' shadows the
# builtin of the same name).
dir = '../../testi_1'
author = 'Misc'
file = author + '_Epistole.txt'

order = 0      # index of the next output file
epistola = []  # stripped, non-empty lines of the letter being collected

# Context managers guarantee that every handle is closed (and output flushed)
# deterministically instead of relying on garbage collection.
with open(os.path.join(dir, file), 'rt') as source:
    # Iterate the file object directly rather than materializing all lines.
    for line in source:
        line = line.strip()
        if line:
            epistola.append(line)
        else:
            # Blank line: flush the current letter to its own file.
            target = os.path.join(dir, '{}_epistola{}.txt'.format(author, order))
            with open(target, 'wt') as out:
                out.write('\n'.join(epistola))
            order += 1
            epistola = []

# The input may not end with a blank line; flush whatever is left over.
if epistola:
    target = os.path.join(dir, '{}_epistola{}.txt'.format(author, order))
    with open(target, 'wt') as out:
        out.write('\n'.join(epistola))