diff --git a/src/dante_eval.py b/src/dante_eval.py
index b1dd7be..daa7c1b 100644
--- a/src/dante_eval.py
+++ b/src/dante_eval.py
@@ -12,7 +12,7 @@ from util.color_visualization import color
 # TODO: inspect the impact of chi-squared correlations against positive-only (or positive and negative) correlations for feature selection
 # TODO: sentence length (Mendenhall-style) ?
 
-for epistola in [1,2]:
+for epistola in [2]:
     print('Epistola {}'.format(epistola))
     print('='*80)
     path = '../testi_{}'.format(epistola)
@@ -20,6 +20,7 @@ for epistola in [1,2]:
         path+='_with_GuidoDaPisa'
 
     positive, negative, ep_text = load_texts(path, unknown_target='EpistolaXIII_{}.txt'.format(epistola))
+    n_full_docs = len(positive) + len(negative)
 
     feature_extractor = FeatureExtractor(function_words_freq='latin',
                                          conjugations_freq='latin',
@@ -27,15 +28,17 @@ for epistola in [1,2]:
                                          tfidf_feat_selection_ratio=0.1,
                                          wordngrams=False, n_wordngrams=(1, 2),
                                          charngrams=True, n_charngrams=(3, 4, 5),
                                          preserve_punctuation=False,
-                                         split_documents=False, split_policy=split_by_sentences, window_size=3,
+                                         split_documents=True, split_policy=split_by_sentences, window_size=3,
                                          normalize_features=True)
 
-    Xtr,ytr = feature_extractor.fit_transform(positive, negative)
+    Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative)
+    print(ytr)
+
     ep, ep_fragments = feature_extractor.transform(ep_text, return_fragments=True, window_size=3)
 
     print('Fitting the Verificator')
     av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression)
-    av.fit(Xtr,ytr)
+    av.fit(Xtr,ytr,groups)
     print('Predicting the Epistola {}'.format(epistola))
     title = 'Epistola {}'.format('I' if epistola==1 else 'II')
@@ -43,8 +46,12 @@ for epistola in [1,2]:
 
     fulldoc_prob, fragment_probs = av.predict_proba(ep, title)
     # color(path='../dante_color/epistola{}.html'.format(epistola), texts=ep_fragments, probabilities=fragment_probs, title=title)
 
-    param = 'All'
-    # with open('features{}.csv'.format(epistola), 'at') as fo:
-    #     validation=av.estimator.best_score_.mean()
-    #     nfeatures = Xtr.shape[1]
-    #     fo.write('{}\t{}\t{:.0f}\t{:.3f}\t{:.3f}\n'.format(param, value, nfeatures, validation, fulldoc_prob))
+    score_ave, score_std = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=False)
+    print('LOO[full-and-fragments]={:.3f} +-{:.5f}'.format(score_ave, score_std))
+
+    score_ave, score_std = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=True)
+    print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std))
+
+    score_ave, score_std = av.leave_one_out(Xtr, ytr, None)
+    print('LOO[w/o groups]={:.3f} +-{:.5f}'.format(score_ave, score_std))
+
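Note (annotation, not part of the patch): the three leave_one_out runs above differ only in how the folds are built. A minimal synthetic sketch of the group-aware variant, assuming only numpy and scikit-learn — fragments carry the id of their source document, so LeaveOneGroupOut never puts a document on one side of the split and its fragments on the other:

import numpy as np
from sklearn.model_selection import LeaveOneGroupOut

# Synthetic stand-ins: 3 documents (rows 0-2), each followed by two fragments.
X = np.random.rand(9, 5)
y = np.array([1, 0, 1, 1, 1, 0, 0, 1, 1])
groups = np.array([0, 1, 2, 0, 0, 1, 1, 2, 2])  # fragments share their document's id

for train, test in LeaveOneGroupOut().split(X, y, groups):
    print('test indices:', test)  # e.g. [0 3 4]: document 0 together with its fragments

Without the groups argument, a plain LeaveOneOut leaks near-duplicate fragments between train and test, which is exactly the optimistic baseline the LOO[w/o groups] line measures for comparison.
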
diff --git a/src/data/features.py b/src/data/features.py
index 8c69924..9f254e3 100644
--- a/src/data/features.py
+++ b/src/data/features.py
@@ -49,6 +49,10 @@ latin_conjugations = ['o', 'eo', 'io', 'as', 'es', 'is', 'at', 'et', 'it', 'amus',
                       'sim', 'sis', 'sit', 'simus', 'sitis', 'sint', 'essem', 'esses', 'esset', 'essemus', 'essetis',
                       'essent', 'fui', 'fuisti', 'fuit', 'fuimus', 'fuistis', 'fuerunt', 'este', 'esto', 'estote', 'sunto']
 
+spanish_conjugations = ['o','as','a','amos','áis','an','es','e','emos','éis','en','imos','ís','guir','ger','gir',
+                        'ar','er','ir','é','aste','ó','asteis','aron','í','iste','ió','isteis','ieron',
+                        'aba', 'abas', 'ábamos', 'aban', 'ía', 'ías', 'íamos', 'íais', 'ían', 'ás','á',
+                        'án','estoy','estás','está','estamos','estáis','están']
 
 def get_conjugations(lang):
     if lang == 'latin':
@@ -95,17 +99,19 @@ def windows(text_fragments, window_size):
 def splitter(documents, authors=None,
              split_policy=split_by_sentences, window_size=1):
     fragments = []
     authors_fragments = []
+    groups = []
     for i, text in enumerate(documents):
         text_fragments = split_policy(text)
         text_fragments = windows(text_fragments, window_size=window_size)
         fragments.extend(text_fragments)
+        groups.extend([i]*len(text_fragments))
         if authors is not None:
             authors_fragments.extend([authors[i]] * len(text_fragments))
 
     if authors is not None:
-        return fragments, authors_fragments
+        return fragments, authors_fragments, groups
 
-    return fragments
+    return fragments, groups
 
 def tokenize(text):
@@ -280,17 +286,20 @@ class FeatureExtractor:
         documents = positives + negatives
         authors = [1]*len(positives) + [0]*len(negatives)
         n_original_docs = len(documents)
+        groups = list(range(n_original_docs))
 
         if self.split_documents:
-            doc_fragments, authors_fragments = splitter(documents, authors,
+            doc_fragments, authors_fragments, groups_fragments = splitter(documents, authors,
                                                         split_policy=self.split_policy, window_size=self.window_size)
            documents.extend(doc_fragments)
            authors.extend(authors_fragments)
+            groups.extend(groups_fragments)
            self._print('splitting documents: {} documents'.format(len(doc_fragments)))
 
         # represent the target vector
         y = np.array(authors)
+        groups = np.array(groups)
 
         # initialize the document-by-feature vector
         X = np.empty((len(documents), 0))
 
@@ -345,7 +354,7 @@
         print('y prevalence: {}/{} {:.2f}%'.format(y.sum(),len(y),y.mean() * 100))
         print()
 
-        return X, y
+        return X, y, groups
 
 
     def transform(self, test, return_fragments=False, window_size=-1):
@@ -354,7 +363,8 @@ def transform(self, test, return_fragments=False, window_size=-1):
             window_size = self.window_size
 
         if self.split_documents:
-            test.extend(splitter(test, split_policy=self.split_policy, window_size=window_size))
+            tests, _ = splitter(test, split_policy=self.split_policy, window_size=window_size)
+            test.extend(tests)
 
         # initialize the document-by-feature vector
         TEST = np.empty((len(test), 0))
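Note (annotation, not part of the patch): the bookkeeping added to splitter and fit_transform maintains one invariant — every fragment is aligned with the index of the document it came from. A toy mirror of that logic, with a hypothetical one-line split policy standing in for the repository's split_by_sentences:

# Toy mirror of the splitter bookkeeping (hypothetical split policy).
def split_by_sentences(text):
    return [s.strip() for s in text.split('.') if s.strip()]

docs = ['First doc. It has two sentences.', 'Second doc.']
fragments, groups = [], []
for i, text in enumerate(docs):
    parts = split_by_sentences(text)
    fragments.extend(parts)
    groups.extend([i] * len(parts))

assert len(fragments) == len(groups)  # each fragment knows its source document
print(groups)  # [0, 0, 1]

In fit_transform the full documents take group ids 0..n-1 first and their fragments reuse those same ids, so fragment rows can later be collapsed back onto their source documents.
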
diff --git a/src/model.py b/src/model.py
index a7e4c59..a6b06a1 100644
--- a/src/model.py
+++ b/src/model.py
@@ -1,7 +1,8 @@
 from util import disable_sklearn_warnings
 from sklearn.metrics import f1_score
 from sklearn.metrics import make_scorer
-from sklearn.model_selection import GridSearchCV, LeaveOneOut
+from sklearn.model_selection import GridSearchCV, LeaveOneOut, LeaveOneGroupOut, cross_val_score, GroupKFold, KFold, \
+    StratifiedKFold
 from sklearn.linear_model import LogisticRegression
 from sklearn.svm import *
 from data.features import *
@@ -29,7 +30,7 @@ def f1(true_labels, predicted_labels):
 class AuthorshipVerificator:
 
     def __init__(self, nfolds=10,
-                 params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced',None]},
+                 params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced']},
                  estimator=SVC):
         self.nfolds = nfolds
         self.params = params
@@ -44,13 +45,16 @@
             self.probability = True
             self.svm = LogisticRegression()
 
-    def fit(self,X,y):
+    def fit(self,X,y,groups=None):
         if not isinstance(y,np.ndarray): y=np.array(y)
         positive_examples = y.sum()
-        if True or positive_examples >= self.nfolds:
+        if positive_examples >= self.nfolds:
             print('optimizing {}'.format(self.svm.__class__.__name__))
-            self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=LeaveOneOut(), scoring=make_scorer(f1), n_jobs=-1, verbose=10)
-            # self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=self.nfolds, scoring=make_scorer(f1), n_jobs=-1, verbose=10)
+            # if groups is None or len(np.unique(groups[y==1])):
+            folds = list(StratifiedKFold(n_splits=self.nfolds).split(X, y))
+            # folds = list(GroupKFold(n_splits=self.nfolds).split(X,y,groups))
+
+            self.estimator = GridSearchCV(self.svm, param_grid=self.params, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
         else:
             self.estimator = self.svm
 
@@ -62,9 +66,28 @@
             f1scores = self.estimator.best_score_
             f1_mean, f1_std = f1scores.mean(), f1scores.std()
             print('F1-measure={:.3f} (+-{:.3f} cv={})\n'.format(f1_mean, f1_std, f1scores))
+            self.estimator = self.estimator.best_estimator_
 
         return self
 
+    def leave_one_out(self, X, y, groups=None, test_lowest_index_only=True):
+
+        if groups is None:
+            print('Computing LOO without groups')
+            folds = list(LeaveOneOut().split(X,y))
+        else:
+            print('Computing LOO with groups')
+            logo = LeaveOneGroupOut()
+            folds=list(logo.split(X,y,groups))
+            if test_lowest_index_only:
+                print('ignoring fragments')
+                folds = [(train, np.min(test, keepdims=True)) for train, test in folds]
+
+        scores = cross_val_score(self.estimator, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
+        print(scores)
+
+        return scores.mean(), scores.std()
+
     def predict(self, test, epistola_name=''):
         pred = self.estimator.predict(test)
         full_doc_prediction = pred[0]
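Note (annotation, not part of the patch): test_lowest_index_only relies on the full documents occupying the lowest row indices (fit_transform appends fragments after the originals). np.min(test, keepdims=True) therefore shrinks each leave-one-group-out test fold to the full document alone, kept as a 1-element index array so that cross_val_score still accepts it as a fold. A synthetic sketch:

import numpy as np

# Hypothetical LOGO folds for two groups: each test side holds a full
# document (the lowest index) plus its fragments.
folds = [(np.array([1, 2, 4, 5]), np.array([0, 3])),
         (np.array([0, 2, 3, 5]), np.array([1, 4]))]

folds = [(train, np.min(test, keepdims=True)) for train, test in folds]
print([test for _, test in folds])  # [array([0]), array([1])]

Training still uses everything outside the held-out group, so fragments keep contributing signal; only the scoring is restricted to the full documents.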