identification vs attribution, macro-f1 and micro-f1
parent 14d5f6e531
commit 1387ef2c59
@@ -0,0 +1,78 @@
import numpy as np  # used below for the macro/micro averaging
from sklearn.linear_model import LogisticRegression
from data.dante_loader import load_texts
from data.features import *
from model import AuthorshipVerificator, f1_from_counters
from sklearn.svm import LinearSVC, SVC
from util.color_visualization import color

# DONE: ngrams should contain punctuation marks according to Sapkota et al. [39] in the PAN 2015 overview
# (More recently, it was shown that character n-grams corresponding to word affixes and including
# punctuation marks are the most significant features in cross-topic authorship attribution [57].)  # we have cancelled the
# TODO: inspect the impact of chi-squared correlations against positive-only (or positive and negative) correlations for feature selection
# TODO: sentence length (Mendenhall-style)?

for epistola in [2]:
    if epistola==1:
        authors = ['Dante','GiovanniBoccaccio','PierDellaVigna']
    else:
        authors = ['Dante', 'BenvenutoDaImola', 'FilippoVillani','GiovanniBoccaccio','GiovanniDelVirgilio',
                   'GrazioloBambaglioli','GuidoDaPisa','PietroAlighieri','ZonoDeMagnalis']

    discarded = 0
    f1_scores = []
    counters = []
    for i,author in enumerate(authors):
        print('='*80)
        print('Authorship Identification for {} (complete {}/{})'.format(author, i, len(authors)))
        print('Corpus of Epistola {}'.format(epistola))
        print('='*80)
        path = '../testi_{}'.format(epistola)
        if epistola==2:
            path+='_with_GuidoDaPisa'

        positive, negative, ep_text = load_texts(path, positive_author=author, unknown_target='EpistolaXIII_{}.txt'.format(epistola))
        if len(positive) < 2:
            discarded+=1
            continue

        n_full_docs = len(positive) + len(negative)

        feature_extractor = FeatureExtractor(function_words_freq='latin',
                                             conjugations_freq='latin',
                                             features_Mendenhall=True,
                                             tfidf_feat_selection_ratio=0.1,
                                             wordngrams=False, n_wordngrams=(1, 2),
                                             charngrams=True, n_charngrams=(3, 4, 5), preserve_punctuation=False,
                                             split_documents=True, split_policy=split_by_sentences, window_size=3,
                                             normalize_features=True)

        Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
        print(ytr)

        ep, ep_fragments = feature_extractor.transform(ep_text, return_fragments=True, window_size=3)

        print('Fitting the Verificator')
        av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
        av.fit(Xtr, ytr, groups)

        score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=True, counters=True)
        # print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std))
        f1_scores.append(f1_from_counters(tp, fp, fn, tn))
        counters.append((tp, fp, fn, tn))
        print('F1 for {} = {:.3f}'.format(author, f1_scores[-1]))

    print('Computing macro- and micro-averages (discarded {}/{})'.format(discarded, len(authors)))
    f1_scores = np.array(f1_scores)
    counters = np.array(counters)

    macro_f1 = f1_scores.mean()
    micro_f1 = f1_from_counters(*counters.sum(axis=0).tolist())

    print('Macro-F1 = {:.3f}'.format(macro_f1))
    print('Micro-F1 = {:.3f}'.format(micro_f1))
    print()
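As a reference for the averaging step above, here is a minimal, self-contained sketch of how macro-F1 (mean of the per-author F1 scores) and micro-F1 (F1 of the pooled counters) relate; the counter values are hypothetical, and the local f1_from_counters simply mirrors the helper defined in src/model.py further down.

import numpy as np

def f1_from_counters(tp, fp, fn, tn):
    # mirrors src/model.py: F1 is defined as 1.0 when there is nothing to retrieve
    den = 2.0 * tp + fp + fn
    return (2.0 * tp) / den if den > 0 else 1.0

# hypothetical per-author counters (tp, fp, fn, tn)
counters = np.array([(3, 1, 0, 5), (1, 0, 2, 6), (2, 2, 1, 4)])
f1_scores = np.array([f1_from_counters(*c) for c in counters])

macro_f1 = f1_scores.mean()                          # average of per-author F1 scores
micro_f1 = f1_from_counters(*counters.sum(axis=0))   # F1 computed on the pooled counters
print('Macro-F1 = {:.3f}, Micro-F1 = {:.3f}'.format(macro_f1, micro_f1))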
@@ -1,7 +1,7 @@
 from sklearn.linear_model import LogisticRegression
 from data.dante_loader import load_texts
 from data.features import *
-from model import AuthorshipVerificator
+from model import AuthorshipVerificator, f1_from_counters
 from sklearn.svm import LinearSVC, SVC
 from util.color_visualization import color

@@ -12,14 +12,16 @@ from util.color_visualization import color
 # TODO: inspect the impact of chi-squared correlations against positive-only (or positive and negative) correlations for feature selection
 # TODO: sentence length (Mendenhall-style) ?


 for epistola in [1, 2]:

     print('Epistola {}'.format(epistola))
     print('='*80)
     path = '../testi_{}'.format(epistola)
     if epistola==2:
         path+='_with_GuidoDaPisa'

-    positive, negative, ep_text = load_texts(path, unknown_target='EpistolaXIII_{}.txt'.format(epistola))
+    positive, negative, ep_text = load_texts(path, positive_author='Dante', unknown_target='EpistolaXIII_{}.txt'.format(epistola))
     n_full_docs = len(positive) + len(negative)

     feature_extractor = FeatureExtractor(function_words_freq='latin',

@@ -27,7 +29,7 @@ for epistola in [1, 2]:
                                          features_Mendenhall=True,
                                          tfidf_feat_selection_ratio=0.1,
                                          wordngrams=False, n_wordngrams=(1, 2),
-                                         charngrams=True, n_charngrams=(3, 4, 5), preserve_punctuation=False,
+                                         charngrams=True, n_charngrams=(2, 3, 4), preserve_punctuation=False,
                                          split_documents=True, split_policy=split_by_sentences, window_size=3,
                                          normalize_features=True)

@@ -46,12 +48,14 @@ for epistola in [1, 2]:
     fulldoc_prob, fragment_probs = av.predict_proba(ep, title)
     # color(path='../dante_color/epistola{}.html'.format(epistola), texts=ep_fragments, probabilities=fragment_probs, title=title)

-    score_ave, score_std = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=False)
-    print('LOO[full-and-fragments]={:.3f} +-{:.5f}'.format(score_ave, score_std))
+    # score_ave, score_std = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=False)
+    # print('LOO[full-and-fragments]={:.3f} +-{:.5f}'.format(score_ave, score_std))

-    score_ave, score_std = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=True)
-    print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std))
+    score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=True, counters=True)
+    # print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std))
+    f1_ = f1_from_counters(tp, fp, fn, tn)
+    print('F1 = {:.3f}'.format(f1_))

-    score_ave, score_std = av.leave_one_out(Xtr, ytr, None)
-    print('LOO[w/o groups]={:.3f} +-{:.5f}'.format(score_ave, score_std))
+    # score_ave, score_std = av.leave_one_out(Xtr, ytr, None)
+    # print('LOO[w/o groups]={:.3f} +-{:.5f}'.format(score_ave, score_std))
Binary file not shown.
Binary file not shown.
@@ -9,7 +9,6 @@ import collections
 from nltk.corpus import stopwords


-
 latin_function_words = ['et', 'in', 'de', 'ad', 'non', 'vt', 'cvm', 'per', 'a', 'sed', 'qve', 'qvia', 'ex', 'sic',
                         'si', 'etiam', 'idest', 'nam', 'vnde', 'ab', 'vel', 'sicvt', 'ita', 'enim', 'scilicet', 'nec',
                         'pro', 'avtem', 'ibi', 'dvm', 'vero', 'tamen', 'inter', 'ideo', 'propter', 'contra', 'svb',

@@ -18,15 +17,6 @@ latin_function_words = ['et', 'in', 'de', 'ad', 'non', 'vt', 'cvm', 'per',
                         'qvidem', 'svpra', 'ante', 'adhvc', 'sev', 'apvd', 'olim', 'statim', 'satis', 'ob', 'qvoniam',
                         'postea', 'nvnqvam']
-
-def get_function_words(lang):
-    if lang=='latin':
-        return latin_function_words
-    elif lang in ['english','spanish']:
-        return stopwords.words(lang)
-    else:
-        raise ValueError('{} not in scope!'.format(lang))
-

 latin_conjugations = ['o', 'eo', 'io', 'as', 'es', 'is', 'at', 'et', 'it', 'amvs', 'emvs', 'imvs', 'atis', 'etis',
                       'itis', 'ant', 'ent', 'vnt', 'ivnt', 'or', 'eor', 'ior', 'aris', 'eris', 'iris', 'atvr', 'etvr',
                       'itvr', 'amvr', 'emvr', 'imvr', 'amini', 'emini', 'imini', 'antvr', 'entvr', 'vntvr', 'ivntvr',

@@ -55,11 +45,22 @@ spanish_conjugations = ['o','as','a','amos','áis','an','es','e','emos','éis','
                         'aba', 'abas', 'ábamos', 'aban', 'ía', 'ías', 'íamos', 'íais', 'ían', 'ás','á',
                         'án','estoy','estás','está','estamos','estáis','están']


+def get_function_words(lang):
+    if lang=='latin':
+        return latin_function_words
+    elif lang in ['english','spanish']:
+        return stopwords.words(lang)
+    else:
+        raise ValueError('{} not in scope!'.format(lang))
+
+
 def get_conjugations(lang):
     if lang == 'latin':
         return latin_conjugations
+    elif lang == 'spanish':
+        return spanish_conjugations
     else:
-        raise ValueError('conjugations for languages other than latin are not yet supported')
+        raise ValueError('conjugations for languages other than Latin and Spanish are not yet supported')


 # ------------------------------------------------------------------------

@@ -411,7 +412,7 @@ class FeatureExtractor:
             'load_documents: function_words_freq={} features_Mendenhall={} tfidf={}, split_documents={}, split_policy={}'
             .format(self.function_words_freq, self.features_Mendenhall, self.tfidf, self.split_documents,
                     self.split_policy.__name__))
-        print('Epistola 1 shape:', TEST.shape)
+        print('test shape:', TEST.shape)
         print()

         if return_fragments:
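To make the role of these word lists concrete, here is a minimal sketch of how function-word frequencies could be turned into a feature vector. The helper function_word_frequencies is hypothetical and for illustration only; the actual FeatureExtractor may tokenise and normalise differently, and the import path assumes the module is data.features, as the experiment scripts above import it.

import numpy as np
from data.features import get_function_words  # assumed module path, as used by the scripts above

def function_word_frequencies(text, lang='latin'):
    # hypothetical sketch: frequency of each function word per 1000 tokens
    tokens = text.lower().split()
    fw = get_function_words(lang)
    counts = np.array([tokens.count(w) for w in fw], dtype=float)
    return counts * 1000.0 / max(len(tokens), 1)

vec = function_word_frequencies('et in principio erat verbum et verbum erat in principio')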
src/model.py (29 changed lines)

@@ -14,23 +14,31 @@ class RandomVerificator:
     def predict(self,test):
         return np.random.rand()

-def f1(true_labels, predicted_labels):
-    assert len(true_labels)==len(predicted_labels), "Format not consistent between true and predicted labels."
+def get_counters(true_labels, predicted_labels):
+    assert len(true_labels) == len(predicted_labels), "Format not consistent between true and predicted labels."
     nd = len(true_labels)
-    tp = np.sum(predicted_labels[true_labels==1])
+    tp = np.sum(predicted_labels[true_labels == 1])
     fp = np.sum(predicted_labels[true_labels == 0])
     fn = np.sum(true_labels[predicted_labels == 0])
+    tn = nd - (tp+fp+fn)
+    return tp, fp, fn, tn
+
+
+def f1_from_counters(tp, fp, fn, tn):
     num = 2.0 * tp
     den = 2.0 * tp + fp + fn
     if den > 0: return num / den
     # we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative
     return 1.0
+
+
+def f1(true_labels, predicted_labels):
+    tp, fp, fn, tn = get_counters(true_labels, predicted_labels)
+    return f1_from_counters(tp, fp, fn, tn)
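A quick sanity check of the three helpers above on hypothetical 0/1 label arrays (assuming src/model.py is importable as model, as the experiment scripts already do):

import numpy as np
from model import get_counters, f1_from_counters, f1

y_true = np.array([1, 1, 0, 0, 1])
y_pred = np.array([1, 0, 0, 1, 1])

tp, fp, fn, tn = get_counters(y_true, y_pred)   # (2, 1, 1, 1)
print(f1_from_counters(tp, fp, fn, tn))         # 2*2 / (2*2 + 1 + 1) = 0.667
print(f1(y_true, y_pred))                       # same value, computed in one call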


 class AuthorshipVerificator:

     def __init__(self, nfolds=10,
-                 params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced']},
+                 params = {'C': np.logspace(-4,+4,9), 'class_weight':['balanced',None]},
                  estimator=SVC):
         self.nfolds = nfolds
         self.params = params
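Note on the new grid: np.logspace(-4, +4, 9) reproduces the previous hand-written list of C values and extends it by one decade (1e4), and class_weight now also tries None. A quick check of what the expression expands to:

import numpy as np
print(np.logspace(-4, +4, 9))
# [1.e-04 1.e-03 1.e-02 1.e-01 1.e+00 1.e+01 1.e+02 1.e+03 1.e+04]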
@@ -70,7 +78,7 @@ class AuthorshipVerificator:

         return self

-    def leave_one_out(self, X, y, groups=None, test_lowest_index_only=True):
+    def leave_one_out(self, X, y, groups=None, test_lowest_index_only=True, counters=False):

         if groups is None:
             print('Computing LOO without groups')

@@ -85,8 +93,15 @@

         scores = cross_val_score(self.estimator, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
         print(scores)
-        return scores.mean(), scores.std()
+        if counters and test_lowest_index_only:
+            yfull_true = y[:len(folds)]
+            yfull_predict = np.zeros_like(yfull_true)
+            yfull_predict[scores == 1] = yfull_true[scores == 1]
+            yfull_predict[scores != 1] = 1-yfull_true[scores != 1]
+            tp, fp, fn, tn = get_counters(yfull_true, yfull_predict)
+            return scores.mean(), scores.std(), tp, fp, fn, tn
+        else:
+            return scores.mean(), scores.std()
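A note on the counters branch above, as I read it: with groups and test_lowest_index_only=True each fold tests a single full document, so the per-fold F1 in scores is 1 exactly when that document was classified correctly (the den==0 convention makes a correct negative also score 1); the predicted labels can therefore be reconstructed from the scores and then pooled with get_counters. A small illustration with hypothetical values:

import numpy as np

scores     = np.array([1., 0., 1., 1.])   # hypothetical per-document LOO scores
yfull_true = np.array([1,  1,  0,  0])    # hypothetical true labels of the full documents

yfull_predict = np.zeros_like(yfull_true)
yfull_predict[scores == 1] = yfull_true[scores == 1]       # correct fold: prediction equals truth
yfull_predict[scores != 1] = 1 - yfull_true[scores != 1]   # wrong fold: prediction is flipped
print(yfull_predict)                                        # [1 0 0 0]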

     def predict(self, test, epistola_name=''):
         pred = self.estimator.predict(test)
@@ -32,6 +32,7 @@ def evaluation(y_pred, y_prob, y_true):
 def doall(problem,pos,neg,test,truth):
     print('[Start]{}'.format(problem))
     feature_extractor = FeatureExtractor(function_words_freq=lang,
+                                         conjugations_freq=lang,
                                          features_Mendenhall=True,
                                          wordngrams=False, tfidf_feat_selection_ratio=0.1,
                                          charngrams=True, n_charngrams=[3, 4, 5],