diff --git a/src/dante_eval.py b/src/dante_eval.py index 0eec487..2ffe9b1 100644 --- a/src/dante_eval.py +++ b/src/dante_eval.py @@ -18,7 +18,8 @@ positive, negative, ep1_text, ep2_text = load_texts(path) feature_extractor = FeatureExtractor(function_words_freq='latin', features_Mendenhall=True, tfidf=False, tfidf_feat_selection_ratio=0.1, - ngrams=True, ns=[3,4,5], + wordngrams=(4,5), + ngrams=True, ns=[4,5], split_documents=True, split_policy=split_by_sentences, window_size=3, diff --git a/src/data/features.py b/src/data/features.py index 06ac20e..085e3a8 100644 --- a/src/data/features.py +++ b/src/data/features.py @@ -124,7 +124,7 @@ def _features_Mendenhall(documents, upto=23): return np.array(features) -def _features_tfidf(documents, tfidf_vectorizer=None, min_df = 1): +def _features_tfidf(documents, tfidf_vectorizer=None, min_df = 1, ngrams=(1,1)): """ Extract features as tfidf matrix extracted from the documents :param documents: a list where each element is the text (string) of a document @@ -132,7 +132,7 @@ def _features_tfidf(documents, tfidf_vectorizer=None, min_df = 1): distinct words; and V is the TfidfVectorizer already fit """ if tfidf_vectorizer is None: - tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=min_df) + tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=min_df, ngram_range=ngrams) tfidf_vectorizer.fit(documents) features = tfidf_vectorizer.transform(documents) @@ -179,6 +179,7 @@ class FeatureExtractor: features_Mendenhall=True, tfidf=False, tfidf_feat_selection_ratio=1., + wordngrams=(1,1), ngrams=False, ns=[4, 5], split_documents=False, @@ -209,6 +210,7 @@ class FeatureExtractor: self.features_Mendenhall = features_Mendenhall self.tfidf = tfidf self.tfidf_feat_selection_ratio = tfidf_feat_selection_ratio + self.wordngrams = wordngrams self.ngrams = ngrams self.ns = ns self.split_documents = split_documents @@ -248,7 +250,7 @@ class FeatureExtractor: # sparse feature extraction functions if self.tfidf: - X_features, vectorizer = _features_tfidf(documents) + X_features, vectorizer = _features_tfidf(documents, ngrams=self.wordngrams) self.tfidf_vectorizer = vectorizer if self.tfidf_feat_selection_ratio < 1.: @@ -260,7 +262,7 @@ class FeatureExtractor: self._print('adding tfidf words features: {} features'.format(X.shape[1])) if self.ngrams: - X_features, vectorizer = _features_ngrams(documents, self.ns, min_df=5 * self.window_size) + X_features, vectorizer = _features_ngrams(documents, self.ns, min_df=5) self.ngrams_vectorizer = vectorizer if self.tfidf_feat_selection_ratio < 1.: @@ -269,7 +271,7 @@ class FeatureExtractor: self.feat_sel_ngrams = feat_sel X = self._addfeatures(_tocsr(X), X_features) - self._print('adding ngrams words features: {} features'.format(X.shape[1])) + self._print('adding ngrams character features: {} features'.format(X.shape[1])) # print summary if self.verbose: diff --git a/src/pan2015_eval.py b/src/pan2015_eval.py index 6dcb4c9..b1b57f5 100644 --- a/src/pan2015_eval.py +++ b/src/pan2015_eval.py @@ -54,15 +54,6 @@ def doall(problem,pos,neg,test,truth): print('[End]{}'.format(problem)) return problem, probability, prediction, truth - # print('{}-->{:.3f} decision={}'.format(problem, probability, prediction)) - # print('pred={} truth={}'.format(prediction, truth)) - # - # y_prob.append(probability) - # y_pred.append(prediction) - # y_true.append(truth) - # - # acc_auc = evaluation(y_pred, y_prob, y_true) - if __name__ == '__main__': @@ -74,7 +65,7 @@ if __name__ == '__main__': outcomes = Parallel(n_jobs=-1)(delayed(doall)(problem,pos,neg,test,truth) for problem,pos,neg,test,truth in TaskGenerator(request)) y_pred, y_prob, y_true = [], [], [] for problem, probability, prediction, truth in outcomes: - fo.write('{} {:.3f}\n'.format(problem, probability)) + fo.write('{} {}\n'.format(problem, probability)) y_pred.append(prediction) y_prob.append(probability) y_true.append(truth)