"""Authorship-identification experiment over the Epistola corpora.

For every candidate author of a corpus, the author's texts are taken as the
positive class and all remaining texts as the negative class; an
AuthorshipVerificator is fitted and evaluated by leave-one-out, and the
per-author F1 scores are finally aggregated into macro- and micro-averages.
"""
import numpy as np  # imported explicitly: it was only reachable through the star import below
from sklearn.linear_model import LogisticRegression
from data.dante_loader import load_texts
from data.features import *
from model import AuthorshipVerificator, f1_from_counters
from sklearn.svm import LinearSVC, SVC
from util.color_visualization import color

# DONE: ngrams should contain punctuation marks according to Sapkota et al. [39] in the PAN 2015 overview
# (More recently, it was shown that character
# n-grams corresponding to word affixes and including punctuation marks are the most
# significant features in cross-topic authorship attribution [57].) #we have cancelled the
# TODO: inspect the impact of chi-squared correlations against positive-only (or positive and negative) correlations for feature selection
# TODO: sentence length (Mendenhall-style) ?


for epistola in [2]:
    if epistola == 1:
        authors = ['Dante', 'GiovanniBoccaccio', 'PierDellaVigna']
    else:
        authors = ['Dante', 'BenvenutoDaImola', 'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
                   'GrazioloBambaglioli', 'GuidoDaPisa', 'PietroAlighieri', 'ZonoDeMagnalis']

    # The corpus location depends only on the epistola, so it is computed once per corpus.
    path = '../testi_{}'.format(epistola)
    if epistola == 2:
        path += '_with_GuidoDaPisa'

    discarded = 0    # authors skipped for lack of positive documents
    f1_scores = []   # one F1 value per evaluated author
    counters = []    # one (tp, fp, fn, tn) tuple per evaluated author
    for i, author in enumerate(authors):
        print('=' * 80)
        print('Authorship Identification for {} (complete {}/{})'.format(author, i, len(authors)))
        print('Corpus of Epistola {}'.format(epistola))
        print('=' * 80)

        positive, negative, ep_text = load_texts(path, positive_author=author,
                                                 unknown_target='EpistolaXIII_{}.txt'.format(epistola))
        # Authors with fewer than two positive documents are not evaluated
        # (presumably leave-one-out needs a positive left for training -- TODO confirm).
        if len(positive) < 2:
            discarded += 1
            continue

        feature_extractor = FeatureExtractor(function_words_freq='latin',
                                             conjugations_freq='latin',
                                             features_Mendenhall=True,
                                             tfidf_feat_selection_ratio=0.1,
                                             wordngrams=False, n_wordngrams=(1, 2),
                                             charngrams=True, n_charngrams=(3, 4, 5), preserve_punctuation=False,
                                             split_documents=True, split_policy=split_by_sentences, window_size=3,
                                             normalize_features=True)

        Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
        print(ytr)

        # NOTE(review): the transformed epistola is not used below; the call is kept
        # because it may print diagnostics -- confirm before removing.
        ep, ep_fragments = feature_extractor.transform(ep_text, return_fragments=True, window_size=3)

        print('Fitting the Verificator')
        av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
        av.fit(Xtr, ytr, groups)

        score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, groups,
                                                                test_lowest_index_only=True, counters=True)
        # print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std))
        f1_scores.append(f1_from_counters(tp, fp, fn, tn))
        counters.append((tp, fp, fn, tn))
        print('F1 for {} = {:.3f}'.format(author, f1_scores[-1]))

    print('Computing macro- and micro-averages (discarded {}/{})'.format(discarded, len(authors)))
    if not f1_scores:
        # Every author was discarded: the mean of an empty array is NaN and the summed
        # counters collapse to a scalar that cannot be unpacked, so report and move on.
        print('No author could be evaluated; macro- and micro-averages are undefined')
        print()
        continue

    f1_scores = np.array(f1_scores)
    counters = np.array(counters)

    macro_f1 = f1_scores.mean()
    # Micro-F1 pools the contingency counts of all authors before computing F1.
    micro_f1 = f1_from_counters(*counters.sum(axis=0).tolist())

    print('Macro-F1 = {:.3f}'.format(macro_f1))
    print('Micro-F1 = {:.3f}'.format(micro_f1))
    print()
sentence length (Mendenhall-style) ? + for epistola in [1, 2]: + print('Epistola {}'.format(epistola)) print('='*80) path = '../testi_{}'.format(epistola) if epistola==2: path+='_with_GuidoDaPisa' - positive, negative, ep_text = load_texts(path, unknown_target='EpistolaXIII_{}.txt'.format(epistola)) + positive, negative, ep_text = load_texts(path, positive_author='Dante', unknown_target='EpistolaXIII_{}.txt'.format(epistola)) n_full_docs = len(positive) + len(negative) feature_extractor = FeatureExtractor(function_words_freq='latin', @@ -27,7 +29,7 @@ for epistola in [1, 2]: features_Mendenhall=True, tfidf_feat_selection_ratio=0.1, wordngrams=False, n_wordngrams=(1, 2), - charngrams=True, n_charngrams=(3, 4, 5), preserve_punctuation=False, + charngrams=True, n_charngrams=(2, 3, 4), preserve_punctuation=False, split_documents=True, split_policy=split_by_sentences, window_size=3, normalize_features=True) @@ -46,12 +48,14 @@ for epistola in [1, 2]: fulldoc_prob, fragment_probs = av.predict_proba(ep, title) # color(path='../dante_color/epistola{}.html'.format(epistola), texts=ep_fragments, probabilities=fragment_probs, title=title) - score_ave, score_std = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=False) - print('LOO[full-and-fragments]={:.3f} +-{:.5f}'.format(score_ave, score_std)) + # score_ave, score_std = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=False) + # print('LOO[full-and-fragments]={:.3f} +-{:.5f}'.format(score_ave, score_std)) - score_ave, score_std = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=True) - print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std)) + score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=True, counters=True) + # print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std)) + f1_ = f1_from_counters(tp, fp, fn, tn) + print('F1 = {:.3f}'.format(f1_)) - score_ave, score_std = av.leave_one_out(Xtr, ytr, None) - print('LOO[w/o 
groups]={:.3f} +-{:.5f}'.format(score_ave, score_std)) + # score_ave, score_std = av.leave_one_out(Xtr, ytr, None) + # print('LOO[w/o groups]={:.3f} +-{:.5f}'.format(score_ave, score_std)) diff --git a/src/data/__pycache__/dante_loader.cpython-36.pyc b/src/data/__pycache__/dante_loader.cpython-36.pyc deleted file mode 100644 index a7b3e68..0000000 Binary files a/src/data/__pycache__/dante_loader.cpython-36.pyc and /dev/null differ diff --git a/src/data/__pycache__/features.cpython-36.pyc b/src/data/__pycache__/features.cpython-36.pyc deleted file mode 100644 index e47739a..0000000 Binary files a/src/data/__pycache__/features.cpython-36.pyc and /dev/null differ diff --git a/src/data/features.py b/src/data/features.py index 29a549f..576753f 100644 --- a/src/data/features.py +++ b/src/data/features.py @@ -9,7 +9,6 @@ import collections from nltk.corpus import stopwords - latin_function_words = ['et', 'in', 'de', 'ad', 'non', 'vt', 'cvm', 'per', 'a', 'sed', 'qve', 'qvia', 'ex', 'sic', 'si', 'etiam', 'idest', 'nam', 'vnde', 'ab', 'vel', 'sicvt', 'ita', 'enim', 'scilicet', 'nec', 'pro', 'avtem', 'ibi', 'dvm', 'vero', 'tamen', 'inter', 'ideo', 'propter', 'contra', 'svb', @@ -18,15 +17,6 @@ latin_function_words = ['et', 'in', 'de', 'ad', 'non', 'vt', 'cvm', 'per', 'qvidem', 'svpra', 'ante', 'adhvc', 'sev' , 'apvd', 'olim', 'statim', 'satis', 'ob', 'qvoniam', 'postea', 'nvnqvam'] -def get_function_words(lang): - if lang=='latin': - return latin_function_words - elif lang in ['english','spanish']: - return stopwords.words(lang) - else: - raise ValueError('{} not in scope!'.format(lang)) - - latin_conjugations = ['o', 'eo', 'io', 'as', 'es', 'is', 'at', 'et', 'it', 'amvs', 'emvs', 'imvs', 'atis', 'etis', 'itis', 'ant', 'ent', 'vnt', 'ivnt', 'or', 'eor', 'ior', 'aris', 'eris', 'iris', 'atvr', 'etvr', 'itvr', 'amvr', 'emvr', 'imvr', 'amini', 'emini', 'imini', 'antvr', 'entvr', 'vntvr', 'ivntvr', @@ -55,11 +45,22 @@ spanish_conjugations = 
def get_counters(true_labels, predicted_labels):
    """Return the contingency counts ``(tp, fp, fn, tn)`` of a binary prediction.

    :param true_labels: sequence of gold labels, each 0 or 1
    :param predicted_labels: sequence of predicted labels, each 0 or 1, same length
    :return: tuple ``(tp, fp, fn, tn)``
    :raises AssertionError: if the two sequences differ in length
    """
    # Coerce to arrays so that plain lists are handled too: with a list,
    # ``true_labels == 1`` is the scalar False and would silently index element 0.
    true_labels = np.asarray(true_labels)
    predicted_labels = np.asarray(predicted_labels)
    assert len(true_labels) == len(predicted_labels), "Format not consistent between true and predicted labels."
    nd = len(true_labels)
    tp = np.sum(predicted_labels[true_labels == 1])   # predicted 1 among the true 1s
    fp = np.sum(predicted_labels[true_labels == 0])   # predicted 1 among the true 0s
    fn = np.sum(true_labels[predicted_labels == 0])   # true 1s among the predicted 0s
    tn = nd - (tp + fp + fn)                          # whatever remains
    return tp, fp, fn, tn


def f1_from_counters(tp, fp, fn, tn):
    """Compute F1 = 2*tp / (2*tp + fp + fn) from contingency counts.

    ``tn`` does not enter the formula; it is accepted so that the output of
    ``get_counters`` can be unpacked directly into this function.

    :return: the F1 score, or 1.0 when ``tp``, ``fp`` and ``fn`` are all zero
    """
    num = 2.0 * tp
    den = 2.0 * tp + fp + fn
    if den > 0:
        return num / den
    # we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative
    return 1.0


def f1(true_labels, predicted_labels):
    """Compute the F1 score of ``predicted_labels`` against ``true_labels`` (binary 0/1 labels)."""
    tp, fp, fn, tn = get_counters(true_labels, predicted_labels)
    return f1_from_counters(tp, fp, fn, tn)
+32,7 @@ def evaluation(y_pred, y_prob, y_true): def doall(problem,pos,neg,test,truth): print('[Start]{}'.format(problem)) feature_extractor = FeatureExtractor(function_words_freq=lang, + conjugations_freq=lang, features_Mendenhall=True, wordngrams=False, tfidf_feat_selection_ratio=0.1, charngrams=True, n_charngrams=[3, 4, 5],