some cleaning

Commit: b4796f4882
Parent: c0de604df7
@@ -2,15 +2,6 @@ from sklearn.linear_model import LogisticRegression
 from data.dante_loader import load_texts
 from data.features import *
 from model import AuthorshipVerificator, f1_from_counters
-from sklearn.svm import LinearSVC, SVC
-from util.color_visualization import color
 
-# DONE: ngrams should contain punctuation marks according to Sapkota et al. [39] in the PAN 2015 overview
-# (More recently, it was shown that character
-# n-grams corresponding to word affixes and including punctuation marks are the most
-# significant features in cross-topic authorship attribution [57].) #we have cancelled the
-# TODO: inspect the impact of chi-squared correlations against positive-only (or positive and negative) correlations for feature selection
-# TODO: sentence length (Mendenhall-style) ?
 
 
 for epistola in [1]:
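The removed TODO above asks about chi-squared feature selection. For reference, a minimal sketch of ratio-based chi-squared selection with scikit-learn's SelectKBest; the 0.1 cut-off mirrors the tfidf_feat_selection_ratio=0.1 used further down, and this is an illustration, not the repository's implementation:

```python
# Illustrative sketch only: chi-squared feature selection as raised in the removed TODO.
# SelectKBest and chi2 are real scikit-learn APIs; the ratio-based cut-off is an
# assumption mirroring tfidf_feat_selection_ratio=0.1 used below.
from sklearn.feature_selection import SelectKBest, chi2

def select_top_features(X, y, ratio=0.1):
    """Keep the top `ratio` fraction of features by chi-squared score
    (X must be non-negative, e.g. tf-idf)."""
    k = max(1, int(X.shape[1] * ratio))
    selector = SelectKBest(chi2, k=k)
    return selector.fit_transform(X, y), selector
```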
@@ -37,7 +28,9 @@ for epistola in [1]:
         if epistola==2:
             path+='_interaEpistola'
 
-        positive, negative, pos_files, neg_files, ep_text = load_texts(path, positive_author=author, unknown_target='EpistolaXIII_{}.txt'.format(epistola))
+        positive, negative, pos_files, neg_files, ep_text = load_texts(
+            path, positive_author=author, unknown_target='EpistolaXIII_{}.txt'.format(epistola)
+        )
         files = np.asarray(pos_files + neg_files)
         if len(positive) < 2:
             discarded+=1
@@ -45,16 +38,20 @@ for epistola in [1]:
 
         n_full_docs = len(positive) + len(negative)
 
-        feature_extractor = FeatureExtractor(function_words_freq='latin',
-                                             conjugations_freq='latin',
-                                             features_Mendenhall=True,
-                                             features_sentenceLengths=True,
-                                             tfidf_feat_selection_ratio=0.1,
-                                             wordngrams=True, n_wordngrams=(1, 2),
-                                             charngrams=True, n_charngrams=(3, 4, 5),
-                                             preserve_punctuation=False,
-                                             split_documents=True, split_policy=split_by_sentences, window_size=3,
-                                             normalize_features=True)
+        feature_extractor = FeatureExtractor(
+            function_words_freq='latin',
+            conjugations_freq='latin',
+            features_Mendenhall=True,
+            features_sentenceLengths=True,
+            tfidf_feat_selection_ratio=0.1,
+            wordngrams=True, n_wordngrams=(1, 2),
+            charngrams=True, n_charngrams=(3, 4, 5),
+            preserve_punctuation=False,
+            split_documents=True,
+            split_policy=split_by_sentences,
+            window_size=3,
+            normalize_features=True
+        )
 
 
         Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative)
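fit_transform returns a groups array because split_documents=True turns each text into sentence-window fragments; fragments of one source document must be held out together during leave-one-out validation. A minimal sketch of that grouping with scikit-learn's LeaveOneGroupOut, assumed to mirror what AuthorshipVerificator.leave_one_out does internally:

```python
# Minimal sketch (assumption: AuthorshipVerificator.leave_one_out behaves like
# this internally). All fragments sharing a group id are held out together, so
# no fragment of the test document leaks into the training folds.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneGroupOut

X = np.random.rand(6, 4)               # toy fragment features
y = np.array([1, 1, 1, 0, 0, 0])       # positive-author vs negative-author labels
groups = np.array([0, 0, 1, 2, 2, 3])  # fragment -> source document

for train_idx, test_idx in LeaveOneGroupOut().split(X, y, groups):
    clf = LogisticRegression().fit(X[train_idx], y[train_idx])
    print('held-out doc', groups[test_idx][0], '->', clf.predict(X[test_idx]))
```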
@@ -66,22 +63,23 @@ for epistola in [1]:
         av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
         av.fit(Xtr,ytr,groups)
 
-        score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, files, groups, test_lowest_index_only=True, counters=True)
-        # print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std))
+        score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(
+            Xtr, ytr, files, groups, test_lowest_index_only=True, counters=True
+        )
         f1_scores.append(f1_from_counters(tp, fp, fn, tn))
         counters.append((tp, fp, fn, tn))
-        print('F1 for {} = {:.3f}'.format(author,f1_scores[-1]))
+        print(f'F1 for {author} = {f1_scores[-1]:.3f}')
 
 
-    print('Computing macro- and micro-averages (discarded {}/{})'.format(discarded,len(authors)))
+    print(f'Computing macro- and micro-averages (discarded {discarded}/{len(authors)})')
     f1_scores = np.array(f1_scores)
     counters = np.array(counters)
 
     macro_f1 = f1_scores.mean()
     micro_f1 = f1_from_counters(*counters.sum(axis=0).tolist())
 
-    print('Macro-F1 = {:.3f}'.format(macro_f1))
-    print('Micro-F1 = {:.3f}'.format(micro_f1))
+    print(f'Macro-F1 = {macro_f1:.3f}')
+    print(f'Micro-F1 = {micro_f1:.3f}')
     print()
 
 
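Macro-F1 above averages the per-author F1 scores, while micro-F1 first sums the (tp, fp, fn, tn) counters across authors and computes a single F1. A hedged sketch of the presumed f1_from_counters semantics; the real function lives in model.py and may handle edge cases differently:

```python
# Presumed semantics of f1_from_counters (assumption: the real implementation
# in model.py may differ in edge-case handling).
def f1_from_counters(tp, fp, fn, tn):
    """F1 = 2*tp / (2*tp + fp + fn); taken as 1.0 when there is nothing to find or flag."""
    denom = 2 * tp + fp + fn
    return 2 * tp / denom if denom > 0 else 1.0

# Macro averages per-author scores; micro pools the counters first.
per_author = [(3, 1, 0, 10), (1, 0, 2, 12)]  # (tp, fp, fn, tn) per author
macro = sum(f1_from_counters(*c) for c in per_author) / len(per_author)
micro = f1_from_counters(*[sum(col) for col in zip(*per_author)])
print(f'Macro-F1 = {macro:.3f}, Micro-F1 = {micro:.3f}')
```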
@@ -229,6 +229,11 @@ def _features_tfidf(documents, tfidf_vectorizer=None, min_df = 1, ngrams=(1,1)):
     return features, tfidf_vectorizer
 
 
+# We have implemented ngrams extraction generically, following Sapkota et al. (ref [39] in the PAN 2015 overview), i.e.,
+# containing punctuation marks. However, this does not apply to this study, since punctuation marks are filtered out in
+# editions of Latin texts.
+# More recently, it was shown that character n-grams corresponding to word affixes and including punctuation
+# marks are the most significant features in cross-topic authorship attribution [57].
 def _features_ngrams(documents, ns=[4, 5], ngrams_vectorizer=None, min_df = 10, preserve_punctuation=True):
     doc_ngrams = ngrams_extractor(documents, ns, preserve_punctuation)
     return _features_tfidf(doc_ngrams, tfidf_vectorizer=ngrams_vectorizer, min_df = min_df)
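The comments added here concern whether character n-grams should keep punctuation (Sapkota et al. [39]); since punctuation in editions of Latin texts is editorial, it is stripped in this study. A minimal illustration of the difference, with the punctuation stripping done inline; the repository's actual ngrams_extractor is not reproduced here:

```python
# Illustrative sketch only: the repo's ngrams_extractor is not reproduced here.
import string

def char_ngrams(text, ns=(4, 5), preserve_punctuation=True):
    """Return character n-grams, optionally stripping punctuation first
    (as done for Latin editions, where punctuation is editorial)."""
    if not preserve_punctuation:
        text = text.translate(str.maketrans('', '', string.punctuation))
    return [text[i:i + n] for n in ns for i in range(len(text) - n + 1)]

print(char_ngrams('vulgari, et', ns=(4,), preserve_punctuation=False)[:3])
```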
@@ -507,6 +512,7 @@ class FeatureExtractor:
 
 
 
 
+
 if __name__=='__main__':
     from collections import Counter
@@ -594,4 +600,3 @@ if __name__=='__main__':
     dots=False
     print(counter)
     print('rows',rows)
-