This commit is contained in:
Alejandro Moreo Fernandez 2020-04-01 19:06:39 +02:00
parent d36e4ce9da
commit ce5d4ab843
2 changed files with 12 additions and 33 deletions

View File

@ -12,6 +12,7 @@ AUTHORS_CORPUS_II = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'Boncompagn
'IulianusDeSpira', 'NicolaTrevet', 'PierDellaVigna', 'PietroAlighieri', 'RaimundusLullus',
'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
def main():
discarded = 0
f1_scores = []
@ -21,7 +22,7 @@ def main():
print('='*80)
print(f'Authorship Identification for {author} (complete {i}/{len(args.authors)})')
print(f'Corpus {path}')
print('='*80)
print('-'*80)
positive, negative, pos_files, neg_files, ep_text = load_latin_corpus(
path, positive_author=author, unknown_target=args.unknown
@ -50,13 +51,14 @@ def main():
)
Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
print('Fitting the Verificator')
av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
av.fit(Xtr, ytr, groups)
if args.unknown:
print(f'Checking for the hypothesis that {author} was the author of {args.unknown}')
ep, ep_fragments = feature_extractor.transform(ep_text, return_fragments=True, window_size=3)
print('Fitting the Verificator')
av.fit(Xtr, ytr, groups)
av.predict_proba(ep, args.unknown)
if args.loo:
@ -87,12 +89,14 @@ if __name__ == '__main__':
# Training settings
parser = argparse.ArgumentParser(description='Authorship verification for Epistola XIII')
parser.add_argument('corpuspath', type=str, metavar='PATH',
help=f'Path to the directory containing the corpus (documents must be named <author>_<texname>.txt')
help=f'Path to the directory containing the corpus (documents must be named '
f'<author>_<texname>.txt')
parser.add_argument('positive', type=str, default="Dante",
help= f'Positive author for the hypothesis (default "Dante"); set to "ALL" to check every author')
help= f'Positive author for the hypothesis (default "Dante"); set to "ALL" to check '
f'every author')
parser.add_argument('--loo', default=False, action='store_true',
help='submit each binary classifier to leave-one-out validation')
parser.add_argument('--unknown', type=str, default=None,
parser.add_argument('--unknown', type=str, metavar='PATH', default=None,
help='path to the file of unknown paternity (default None)')
args = parser.parse_args()
@ -108,4 +112,4 @@ if __name__ == '__main__':
assert args.unknown or args.loo, 'error: nor an unknown document, nor LOO have been requested. Nothing to do.'
assert args.unknown is None or os.path.exists(args.unknown), 'unknown file does not exist'
main()
main()

View File

@ -1,25 +0,0 @@
import os
# Split a multi-letter corpus file into one file per letter.
#
# The input file is read line by line; every run of non-blank lines is
# treated as one letter and written to its own file, named
# '<author>_epistola<N>.txt' (N counting up from 0), in the same directory
# as the input file.

# NOTE: `dir` shadows the builtin of the same name; the name is kept so the
# module-level variables of this script stay unchanged.
dir = '../../testi_1'
author = 'Misc'
file = author + '_Epistole.txt'
order = 0
epistola = []

# Every file is opened through a context manager so that its handle is
# closed deterministically instead of being left to the garbage collector.
with open(os.path.join(dir, file), 'rt') as corpus:
    for line in corpus:
        line = line.strip()
        if line:
            epistola.append(line)
        else:
            # A blank line terminates the current letter: flush it to disk
            # and start collecting the next one. Consecutive blank lines
            # still produce (empty) output files, so numbering is unchanged.
            out_path = os.path.join(dir, '{}_epistola{}.txt'.format(author, order))
            with open(out_path, 'wt') as out:
                out.write('\n'.join(epistola))
            order += 1
            epistola = []

# Flush the trailing letter when the input does not end with a blank line.
if epistola:
    out_path = os.path.join(dir, '{}_epistola{}.txt'.format(author, order))
    with open(out_path, 'wt') as out:
        out.write('\n'.join(epistola))