diff --git a/src/author_identification.py b/src/author_identification.py
index 1cbe4ca..21bd6d3 100755
--- a/src/author_identification.py
+++ b/src/author_identification.py
@@ -2,15 +2,6 @@
 from sklearn.linear_model import LogisticRegression
 from data.dante_loader import load_texts
 from data.features import *
 from model import AuthorshipVerificator, f1_from_counters
-from sklearn.svm import LinearSVC, SVC
-from util.color_visualization import color
-
-# DONE: ngrams should contain punctuation marks according to Sapkota et al. [39] in the PAN 2015 overview
-# (More recently, it was shown that character
-# n-grams corresponding to word affixes and including punctuation marks are the most
-# significant features in cross-topic authorship attribution [57].) #we have cancelled the
-# TODO: inspect the impact of chi-squared correlations against positive-only (or positive and negative) correlations for feature selection
-# TODO: sentence length (Mendenhall-style) ?
 
 for epistola in [1]:
@@ -37,7 +28,9 @@ for epistola in [1]:
         if epistola==2:
             path+='_interaEpistola'
-        positive, negative, pos_files, neg_files, ep_text = load_texts(path, positive_author=author, unknown_target='EpistolaXIII_{}.txt'.format(epistola))
+        positive, negative, pos_files, neg_files, ep_text = load_texts(
+            path, positive_author=author, unknown_target='EpistolaXIII_{}.txt'.format(epistola)
+        )
 
         files = np.asarray(pos_files + neg_files)
         if len(positive) < 2:
             discarded+=1
@@ -45,16 +38,20 @@ for epistola in [1]:
         n_full_docs = len(positive) + len(negative)
 
-        feature_extractor = FeatureExtractor(function_words_freq='latin',
-                                             conjugations_freq='latin',
-                                             features_Mendenhall=True,
-                                             features_sentenceLengths=True,
-                                             tfidf_feat_selection_ratio=0.1,
-                                             wordngrams=True, n_wordngrams=(1, 2),
-                                             charngrams=True, n_charngrams=(3, 4, 5),
-                                             preserve_punctuation=False,
-                                             split_documents=True, split_policy=split_by_sentences, window_size=3,
-                                             normalize_features=True)
+        feature_extractor = FeatureExtractor(
+            function_words_freq='latin',
+            conjugations_freq='latin',
+            features_Mendenhall=True,
+            features_sentenceLengths=True,
+            tfidf_feat_selection_ratio=0.1,
+            wordngrams=True, n_wordngrams=(1, 2),
+            charngrams=True, n_charngrams=(3, 4, 5),
+            preserve_punctuation=False,
+            split_documents=True,
+            split_policy=split_by_sentences,
+            window_size=3,
+            normalize_features=True
+        )
 
         Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative)
 
 
@@ -66,22 +63,23 @@ for epistola in [1]:
         av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
         av.fit(Xtr,ytr,groups)
 
-        score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, files, groups, test_lowest_index_only=True, counters=True)
-        # print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std))
+        score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(
+            Xtr, ytr, files, groups, test_lowest_index_only=True, counters=True
+        )
 
         f1_scores.append(f1_from_counters(tp, fp, fn, tn))
         counters.append((tp, fp, fn, tn))
 
-        print('F1 for {} = {:.3f}'.format(author,f1_scores[-1]))
+        print(f'F1 for {author} = {f1_scores[-1]:.3f}')
 
-    print('Computing macro- and micro-averages (discarded {}/{})'.format(discarded,len(authors)))
+    print(f'Computing macro- and micro-averages (discarded {discarded}/{len(authors)})')
     f1_scores = np.array(f1_scores)
     counters = np.array(counters)
     macro_f1 = f1_scores.mean()
     micro_f1 = f1_from_counters(*counters.sum(axis=0).tolist())
-    print('Macro-F1 = {:.3f}'.format(macro_f1))
-    print('Micro-F1 = {:.3f}'.format(micro_f1))
+    print(f'Macro-F1 = {macro_f1:.3f}')
+    print(f'Micro-F1 = {micro_f1:.3f}')
     print()
 
 
 
diff --git a/src/data/features.py b/src/data/features.py
index 5b742b0..e36418a 100755
--- a/src/data/features.py
+++ b/src/data/features.py
@@ -229,6 +229,11 @@ def _features_tfidf(documents, tfidf_vectorizer=None, min_df = 1, ngrams=(1,1)):
     return features, tfidf_vectorizer
 
 
+# We have implemented ngrams extraction generically, following Sapkota et al. (ref [39] in the PAN 2015 overview), i.e.,
+# containing punctuation marks. However, this does not apply to this study since punctuation marks are filtered-out in
+# editions of Latin texts.
+# More recently, it was shown that character n-grams corresponding to word affixes and including punctuation
+# marks are the most significant features in cross-topic authorship attribution [57].
 def _features_ngrams(documents, ns=[4, 5], ngrams_vectorizer=None, min_df = 10, preserve_punctuation=True):
     doc_ngrams = ngrams_extractor(documents, ns, preserve_punctuation)
     return _features_tfidf(doc_ngrams, tfidf_vectorizer=ngrams_vectorizer, min_df = min_df)
@@ -507,6 +512,7 @@
 
 
+
 if __name__=='__main__':
     from collections import Counter
 
 
@@ -594,4 +600,3 @@
             dots=False
     print(counter)
     print('rows',rows)
-