word ngrams

This commit is contained in:
Alejandro Moreo Fernandez 2018-11-29 17:50:14 +01:00
parent 893cc31225
commit 4dd578bc60
3 changed files with 10 additions and 16 deletions

View File

@@ -18,7 +18,8 @@ positive, negative, ep1_text, ep2_text = load_texts(path)
 feature_extractor = FeatureExtractor(function_words_freq='latin', features_Mendenhall=True,
                                      tfidf=False, tfidf_feat_selection_ratio=0.1,
-                                     ngrams=True, ns=[3,4,5],
+                                     wordngrams=(4,5),
+                                     ngrams=True, ns=[4,5],
                                      split_documents=True,
                                      split_policy=split_by_sentences,
                                      window_size=3,

View File

@@ -124,7 +124,7 @@ def _features_Mendenhall(documents, upto=23):
     return np.array(features)

-def _features_tfidf(documents, tfidf_vectorizer=None, min_df = 1):
+def _features_tfidf(documents, tfidf_vectorizer=None, min_df = 1, ngrams=(1,1)):
     """
     Extract features as tfidf matrix extracted from the documents
     :param documents: a list where each element is the text (string) of a document
@@ -132,7 +132,7 @@ def _features_tfidf(documents, tfidf_vectorizer=None, min_df = 1):
     distinct words; and V is the TfidfVectorizer already fit
     """
     if tfidf_vectorizer is None:
-        tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=min_df)
+        tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=min_df, ngram_range=ngrams)
         tfidf_vectorizer.fit(documents)
     features = tfidf_vectorizer.transform(documents)
@@ -179,6 +179,7 @@ class FeatureExtractor:
                  features_Mendenhall=True,
                  tfidf=False,
                  tfidf_feat_selection_ratio=1.,
+                 wordngrams=(1,1),
                  ngrams=False,
                  ns=[4, 5],
                  split_documents=False,
@@ -209,6 +210,7 @@ class FeatureExtractor:
         self.features_Mendenhall = features_Mendenhall
         self.tfidf = tfidf
         self.tfidf_feat_selection_ratio = tfidf_feat_selection_ratio
+        self.wordngrams = wordngrams
         self.ngrams = ngrams
         self.ns = ns
         self.split_documents = split_documents
@@ -248,7 +250,7 @@ class FeatureExtractor:
         # sparse feature extraction functions
         if self.tfidf:
-            X_features, vectorizer = _features_tfidf(documents)
+            X_features, vectorizer = _features_tfidf(documents, ngrams=self.wordngrams)
             self.tfidf_vectorizer = vectorizer
             if self.tfidf_feat_selection_ratio < 1.:
@@ -260,7 +262,7 @@ class FeatureExtractor:
             self._print('adding tfidf words features: {} features'.format(X.shape[1]))
         if self.ngrams:
-            X_features, vectorizer = _features_ngrams(documents, self.ns, min_df=5 * self.window_size)
+            X_features, vectorizer = _features_ngrams(documents, self.ns, min_df=5)
             self.ngrams_vectorizer = vectorizer
             if self.tfidf_feat_selection_ratio < 1.:
@@ -269,7 +271,7 @@ class FeatureExtractor:
             self.feat_sel_ngrams = feat_sel
             X = self._addfeatures(_tocsr(X), X_features)
-            self._print('adding ngrams words features: {} features'.format(X.shape[1]))
+            self._print('adding ngrams character features: {} features'.format(X.shape[1]))
         # print summary
         if self.verbose:

View File

@@ -54,15 +54,6 @@ def doall(problem,pos,neg,test,truth):
     print('[End]{}'.format(problem))
     return problem, probability, prediction, truth
-
-    # print('{}-->{:.3f} decision={}'.format(problem, probability, prediction))
-    # print('pred={} truth={}'.format(prediction, truth))
-    #
-    # y_prob.append(probability)
-    # y_pred.append(prediction)
-    # y_true.append(truth)
-    #
-    # acc_auc = evaluation(y_pred, y_prob, y_true)

 if __name__ == '__main__':
@@ -74,7 +65,7 @@ if __name__ == '__main__':
     outcomes = Parallel(n_jobs=-1)(delayed(doall)(problem,pos,neg,test,truth) for problem,pos,neg,test,truth in TaskGenerator(request))
     y_pred, y_prob, y_true = [], [], []
     for problem, probability, prediction, truth in outcomes:
-        fo.write('{} {:.3f}\n'.format(problem, probability))
+        fo.write('{} {}\n'.format(problem, probability))
         y_pred.append(prediction)
         y_prob.append(probability)
         y_true.append(truth)