word ngrams
This commit is contained in:
parent
893cc31225
commit
4dd578bc60
|
|
@@ -18,7 +18,8 @@ positive, negative, ep1_text, ep2_text = load_texts(path)
|
|||
|
||||
feature_extractor = FeatureExtractor(function_words_freq='latin', features_Mendenhall=True,
|
||||
tfidf=False, tfidf_feat_selection_ratio=0.1,
|
||||
ngrams=True, ns=[3,4,5],
|
||||
wordngrams=(4,5),
|
||||
ngrams=True, ns=[4,5],
|
||||
split_documents=True,
|
||||
split_policy=split_by_sentences,
|
||||
window_size=3,
|
||||
|
|
|
|||
|
|
@@ -124,7 +124,7 @@ def _features_Mendenhall(documents, upto=23):
|
|||
return np.array(features)
|
||||
|
||||
|
||||
def _features_tfidf(documents, tfidf_vectorizer=None, min_df = 1):
|
||||
def _features_tfidf(documents, tfidf_vectorizer=None, min_df = 1, ngrams=(1,1)):
|
||||
"""
|
||||
Extract features as tfidf matrix extracted from the documents
|
||||
:param documents: a list where each element is the text (string) of a document
|
||||
|
|
@@ -132,7 +132,7 @@ def _features_tfidf(documents, tfidf_vectorizer=None, min_df = 1):
|
|||
distinct words; and V is the TfidfVectorizer already fit
|
||||
"""
|
||||
if tfidf_vectorizer is None:
|
||||
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=min_df)
|
||||
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=min_df, ngram_range=ngrams)
|
||||
tfidf_vectorizer.fit(documents)
|
||||
|
||||
features = tfidf_vectorizer.transform(documents)
|
||||
|
|
@@ -179,6 +179,7 @@ class FeatureExtractor:
|
|||
features_Mendenhall=True,
|
||||
tfidf=False,
|
||||
tfidf_feat_selection_ratio=1.,
|
||||
wordngrams=(1,1),
|
||||
ngrams=False,
|
||||
ns=[4, 5],
|
||||
split_documents=False,
|
||||
|
|
@@ -209,6 +210,7 @@ class FeatureExtractor:
|
|||
self.features_Mendenhall = features_Mendenhall
|
||||
self.tfidf = tfidf
|
||||
self.tfidf_feat_selection_ratio = tfidf_feat_selection_ratio
|
||||
self.wordngrams = wordngrams
|
||||
self.ngrams = ngrams
|
||||
self.ns = ns
|
||||
self.split_documents = split_documents
|
||||
|
|
@@ -248,7 +250,7 @@ class FeatureExtractor:
|
|||
|
||||
# sparse feature extraction functions
|
||||
if self.tfidf:
|
||||
X_features, vectorizer = _features_tfidf(documents)
|
||||
X_features, vectorizer = _features_tfidf(documents, ngrams=self.wordngrams)
|
||||
self.tfidf_vectorizer = vectorizer
|
||||
|
||||
if self.tfidf_feat_selection_ratio < 1.:
|
||||
|
|
@@ -260,7 +262,7 @@ class FeatureExtractor:
|
|||
self._print('adding tfidf words features: {} features'.format(X.shape[1]))
|
||||
|
||||
if self.ngrams:
|
||||
X_features, vectorizer = _features_ngrams(documents, self.ns, min_df=5 * self.window_size)
|
||||
X_features, vectorizer = _features_ngrams(documents, self.ns, min_df=5)
|
||||
self.ngrams_vectorizer = vectorizer
|
||||
|
||||
if self.tfidf_feat_selection_ratio < 1.:
|
||||
|
|
@@ -269,7 +271,7 @@ class FeatureExtractor:
|
|||
self.feat_sel_ngrams = feat_sel
|
||||
|
||||
X = self._addfeatures(_tocsr(X), X_features)
|
||||
self._print('adding ngrams words features: {} features'.format(X.shape[1]))
|
||||
self._print('adding ngrams character features: {} features'.format(X.shape[1]))
|
||||
|
||||
# print summary
|
||||
if self.verbose:
|
||||
|
|
|
|||
|
|
@@ -54,15 +54,6 @@ def doall(problem,pos,neg,test,truth):
|
|||
print('[End]{}'.format(problem))
|
||||
return problem, probability, prediction, truth
|
||||
|
||||
# print('{}-->{:.3f} decision={}'.format(problem, probability, prediction))
|
||||
# print('pred={} truth={}'.format(prediction, truth))
|
||||
#
|
||||
# y_prob.append(probability)
|
||||
# y_pred.append(prediction)
|
||||
# y_true.append(truth)
|
||||
#
|
||||
# acc_auc = evaluation(y_pred, y_prob, y_true)
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
@@ -74,7 +65,7 @@ if __name__ == '__main__':
|
|||
outcomes = Parallel(n_jobs=-1)(delayed(doall)(problem,pos,neg,test,truth) for problem,pos,neg,test,truth in TaskGenerator(request))
|
||||
y_pred, y_prob, y_true = [], [], []
|
||||
for problem, probability, prediction, truth in outcomes:
|
||||
fo.write('{} {:.3f}\n'.format(problem, probability))
|
||||
fo.write('{} {}\n'.format(problem, probability))
|
||||
y_pred.append(prediction)
|
||||
y_prob.append(probability)
|
||||
y_true.append(truth)
|
||||
|
|
|
|||
Loading…
Reference in New Issue