dante-verification/src/author_identification_loo.py

#import util._hide_sklearn_warnings
from data.dante_loader import load_latin_corpus, list_authors
from data.features import *
from model import AuthorshipVerificator, RangeFeatureSelector, leave_one_out
from util.evaluation import f1_from_counters
import argparse
from sklearn.pipeline import Pipeline

# Known author names for CORPUS I; only consulted to warn about an unknown
# positive author on the command line.
AUTHORS_CORPUS_I = [
    'Dante',
    'ClaraAssisiensis',
    'GiovanniBoccaccio',
    'GuidoFaba',
    'PierDellaVigna',
]

# Known author names for CORPUS II; used for the same warning as above.
AUTHORS_CORPUS_II = [
    'Dante',
    'BeneFlorentinus',
    'BenvenutoDaImola',
    'BoncompagnoDaSigna',
    'ClaraAssisiensis',
    'FilippoVillani',
    'GiovanniBoccaccio',
    'GiovanniDelVirgilio',
    'GrazioloBambaglioli',
    'GuidoDaPisa',
    'GuidoDeColumnis',
    'GuidoFaba',
    'IacobusDeVaragine',
    'IohannesDeAppia',
    'IohannesDePlanoCarpini',
    'IulianusDeSpira',
    'NicolaTrevet',
    'PierDellaVigna',
    'PietroAlighieri',
    'RaimundusLullus',
    'RyccardusDeSanctoGermano',
    'ZonoDeMagnalis',
]


# When True, main() uses a reduced feature-selection ratio and a smaller
# hyper-parameter grid, and prints a reminder at the end of the run.
DEBUG_MODE = True


def main():
    """Run leave-one-out authorship identification for each author in ``args.authors``.

    Reads the module-level ``args`` namespace (``corpuspath``, ``authors``,
    ``log``) populated by the command-line entry point. For every author the
    corpus is loaded with that author as the positive class, features are
    extracted, and a feature-selection + ``AuthorshipVerificator`` pipeline is
    scored with ``leave_one_out``. The per-author F1 and the final macro- and
    micro-averaged F1 are printed and written to the log file ``args.log``.

    Authors with fewer than two positive documents are skipped and counted as
    discarded. If every author is discarded, no averages are computed.
    """
    discarded = 0
    f1_scores = []
    counters = []

    # The context manager closes the log even if processing an author raises.
    with open(args.log, 'wt') as log:
        for i, author in enumerate(args.authors):
            path = args.corpuspath
            print('='*80)
            print(f'Authorship Identification for {author} (complete {i}/{len(args.authors)})')
            print(f'Corpus {path}')
            print('-'*80)

            positive, negative, pos_files, neg_files, _ep_text = load_latin_corpus(path, positive_author=author)
            if len(positive) < 2:
                discarded += 1
                print(f'discarding analysis for {author} which has only {len(positive)} documents')
                continue
            files = np.asarray(pos_files + neg_files)

            n_full_docs = len(positive) + len(negative)
            print(f'read {n_full_docs} documents from {path}')

            feature_extractor = FeatureExtractor(
                function_words_freq='latin',
                conjugations_freq='latin',
                features_Mendenhall=True,
                features_sentenceLengths=True,
                feature_selection_ratio=0.05 if DEBUG_MODE else 1,
                wordngrams=True, n_wordngrams=(1, 2),
                charngrams=True, n_charngrams=(3, 4, 5),
                preserve_punctuation=False,
                split_documents=True,
                split_policy=split_by_sentences,
                window_size=3,
                normalize_features=True
            )

            Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)

            print('Fitting the Verificator')
            # A tiny grid keeps debug runs fast; the full grid spans 1e-3..1e3.
            params = {'C': np.logspace(0, 1, 2)} if DEBUG_MODE else {'C': [1,10,100,1000,0.1,0.01,0.001]}

            slice_charngrams = feature_extractor.feature_range['_cngrams_task']
            slice_wordngrams = feature_extractor.feature_range['_wngrams_task']
            if slice_charngrams.start < slice_wordngrams.start:
                slice_first, slice_second = slice_charngrams, slice_wordngrams
            else:
                slice_first, slice_second = slice_wordngrams, slice_charngrams
            # The later column range is reduced first -- presumably so that the
            # earlier range's indices are still valid when the second selector runs.
            # NOTE(review): the step names do not necessarily match the range they select.
            av = Pipeline([
                ('featsel_cngrams', RangeFeatureSelector(slice_second, 0.05)),
                ('featsel_wngrams', RangeFeatureSelector(slice_first, 0.05)),
                ('av', AuthorshipVerificator(C=1, param_grid=params))
            ])

            print('Validating the Verificator (Leave-One-Out)')
            score_ave, score_std, tp, fp, fn, tn = leave_one_out(
                av, Xtr, ytr, files, groups, test_lowest_index_only=True, counters=True
            )
            f1_scores.append(f1_from_counters(tp, fp, fn, tn))
            counters.append((tp, fp, fn, tn))
            tee(f'F1 for {author} = {f1_scores[-1]:.3f}', log)
            print(f'TP={tp} FP={fp} FN={fn} TN={tn}')

        print(f'Computing macro- and micro-averages (discarded {discarded}/{len(args.authors)})')
        if f1_scores:
            macro_f1 = np.array(f1_scores).mean()
            # Micro-average: pool the (tp, fp, fn, tn) counts over all authors first.
            micro_f1 = f1_from_counters(*np.array(counters).sum(axis=0).tolist())

            tee(f'LOO Macro-F1 = {macro_f1:.3f}', log)
            tee(f'LOO Micro-F1 = {micro_f1:.3f}', log)
        else:
            # With no evaluated author the averages are undefined (the mean of
            # an empty array is NaN and the summed counters are not unpackable).
            tee('No author was evaluated; macro- and micro-averages are undefined', log)
        print()

    if DEBUG_MODE:
        print('DEBUG_MODE ON')


def tee(msg, log):
    """Echo ``msg`` to stdout and append it, newline-terminated, to ``log``.

    ``log`` is flushed after the write so the line reaches the file promptly.
    """
    line = '{}\n'.format(msg)
    print(msg)
    log.write(line)
    log.flush()


if __name__ == '__main__':
    import os

    # Command-line entry point: parse the arguments, validate them, resolve the
    # list of authors to evaluate, and hand over to main() (which reads `args`).
    parser = argparse.ArgumentParser(description='Authorship verification for MedLatin '
                                                 'submit each binary classifier to leave-one-out validation')
    parser.add_argument('corpuspath', type=str, metavar='CORPUSPATH',
                        help=f'Path to the directory containing the corpus (documents must be named '
                             f'<author>_<texname>.txt)')
    # nargs='?' is what makes the documented default take effect; without it
    # argparse treats the positional as required and ignores `default`.
    parser.add_argument('positive', type=str, nargs='?', default="Dante", metavar='AUTHOR',
                        help= f'Positive author for the hypothesis (default "Dante"); set to "ALL" to check '
                              f'every author')
    parser.add_argument('--log', type=str, metavar='PATH', default=None,
                        help='path to the log file where to write the results '
                             '(if not specified, then ./results_{corpuspath.name})')

    args = parser.parse_args()

    # Validate the corpus path before anything tries to list its contents.
    if not os.path.exists(args.corpuspath):
        parser.error(f'corpus path {args.corpuspath} does not exist')

    # Apply the default log location promised in the --log help text.
    if args.log is None:
        corpus_name = os.path.basename(os.path.abspath(args.corpuspath))
        args.log = f'./results_{corpus_name}'

    if args.positive == 'ALL':
        args.authors = list_authors(args.corpuspath, skip_prefix='Epistola')
    else:
        # Warn only when the author is unknown to BOTH reference lists.
        if (args.positive not in AUTHORS_CORPUS_I) and (args.positive not in AUTHORS_CORPUS_II):
            print(f'warning: author {args.positive} is not in the known list of authors for CORPUS I nor CORPUS II')
        # The author must actually occur in the corpus directory.
        if args.positive not in list_authors(args.corpuspath, skip_prefix='Epistola'):
            parser.error('unexpected author')
        args.authors = [args.positive]

    main()