word ngrams
This commit is contained in:
parent
893cc31225
commit
4dd578bc60
|
|
@ -18,7 +18,8 @@ positive, negative, ep1_text, ep2_text = load_texts(path)
|
||||||
|
|
||||||
feature_extractor = FeatureExtractor(function_words_freq='latin', features_Mendenhall=True,
|
feature_extractor = FeatureExtractor(function_words_freq='latin', features_Mendenhall=True,
|
||||||
tfidf=False, tfidf_feat_selection_ratio=0.1,
|
tfidf=False, tfidf_feat_selection_ratio=0.1,
|
||||||
ngrams=True, ns=[3,4,5],
|
wordngrams=(4,5),
|
||||||
|
ngrams=True, ns=[4,5],
|
||||||
split_documents=True,
|
split_documents=True,
|
||||||
split_policy=split_by_sentences,
|
split_policy=split_by_sentences,
|
||||||
window_size=3,
|
window_size=3,
|
||||||
|
|
|
||||||
|
|
@ -124,7 +124,7 @@ def _features_Mendenhall(documents, upto=23):
|
||||||
return np.array(features)
|
return np.array(features)
|
||||||
|
|
||||||
|
|
||||||
def _features_tfidf(documents, tfidf_vectorizer=None, min_df = 1):
|
def _features_tfidf(documents, tfidf_vectorizer=None, min_df = 1, ngrams=(1,1)):
|
||||||
"""
|
"""
|
||||||
Extract features as tfidf matrix extracted from the documents
|
Extract features as tfidf matrix extracted from the documents
|
||||||
:param documents: a list where each element is the text (string) of a document
|
:param documents: a list where each element is the text (string) of a document
|
||||||
|
|
@ -132,7 +132,7 @@ def _features_tfidf(documents, tfidf_vectorizer=None, min_df = 1):
|
||||||
distinct words; and V is the TfidfVectorizer already fit
|
distinct words; and V is the TfidfVectorizer already fit
|
||||||
"""
|
"""
|
||||||
if tfidf_vectorizer is None:
|
if tfidf_vectorizer is None:
|
||||||
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=min_df)
|
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=min_df, ngram_range=ngrams)
|
||||||
tfidf_vectorizer.fit(documents)
|
tfidf_vectorizer.fit(documents)
|
||||||
|
|
||||||
features = tfidf_vectorizer.transform(documents)
|
features = tfidf_vectorizer.transform(documents)
|
||||||
|
|
@ -179,6 +179,7 @@ class FeatureExtractor:
|
||||||
features_Mendenhall=True,
|
features_Mendenhall=True,
|
||||||
tfidf=False,
|
tfidf=False,
|
||||||
tfidf_feat_selection_ratio=1.,
|
tfidf_feat_selection_ratio=1.,
|
||||||
|
wordngrams=(1,1),
|
||||||
ngrams=False,
|
ngrams=False,
|
||||||
ns=[4, 5],
|
ns=[4, 5],
|
||||||
split_documents=False,
|
split_documents=False,
|
||||||
|
|
@ -209,6 +210,7 @@ class FeatureExtractor:
|
||||||
self.features_Mendenhall = features_Mendenhall
|
self.features_Mendenhall = features_Mendenhall
|
||||||
self.tfidf = tfidf
|
self.tfidf = tfidf
|
||||||
self.tfidf_feat_selection_ratio = tfidf_feat_selection_ratio
|
self.tfidf_feat_selection_ratio = tfidf_feat_selection_ratio
|
||||||
|
self.wordngrams = wordngrams
|
||||||
self.ngrams = ngrams
|
self.ngrams = ngrams
|
||||||
self.ns = ns
|
self.ns = ns
|
||||||
self.split_documents = split_documents
|
self.split_documents = split_documents
|
||||||
|
|
@ -248,7 +250,7 @@ class FeatureExtractor:
|
||||||
|
|
||||||
# sparse feature extraction functions
|
# sparse feature extraction functions
|
||||||
if self.tfidf:
|
if self.tfidf:
|
||||||
X_features, vectorizer = _features_tfidf(documents)
|
X_features, vectorizer = _features_tfidf(documents, ngrams=self.wordngrams)
|
||||||
self.tfidf_vectorizer = vectorizer
|
self.tfidf_vectorizer = vectorizer
|
||||||
|
|
||||||
if self.tfidf_feat_selection_ratio < 1.:
|
if self.tfidf_feat_selection_ratio < 1.:
|
||||||
|
|
@ -260,7 +262,7 @@ class FeatureExtractor:
|
||||||
self._print('adding tfidf words features: {} features'.format(X.shape[1]))
|
self._print('adding tfidf words features: {} features'.format(X.shape[1]))
|
||||||
|
|
||||||
if self.ngrams:
|
if self.ngrams:
|
||||||
X_features, vectorizer = _features_ngrams(documents, self.ns, min_df=5 * self.window_size)
|
X_features, vectorizer = _features_ngrams(documents, self.ns, min_df=5)
|
||||||
self.ngrams_vectorizer = vectorizer
|
self.ngrams_vectorizer = vectorizer
|
||||||
|
|
||||||
if self.tfidf_feat_selection_ratio < 1.:
|
if self.tfidf_feat_selection_ratio < 1.:
|
||||||
|
|
@ -269,7 +271,7 @@ class FeatureExtractor:
|
||||||
self.feat_sel_ngrams = feat_sel
|
self.feat_sel_ngrams = feat_sel
|
||||||
|
|
||||||
X = self._addfeatures(_tocsr(X), X_features)
|
X = self._addfeatures(_tocsr(X), X_features)
|
||||||
self._print('adding ngrams words features: {} features'.format(X.shape[1]))
|
self._print('adding ngrams character features: {} features'.format(X.shape[1]))
|
||||||
|
|
||||||
# print summary
|
# print summary
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
|
|
|
||||||
|
|
@ -54,15 +54,6 @@ def doall(problem,pos,neg,test,truth):
|
||||||
print('[End]{}'.format(problem))
|
print('[End]{}'.format(problem))
|
||||||
return problem, probability, prediction, truth
|
return problem, probability, prediction, truth
|
||||||
|
|
||||||
# print('{}-->{:.3f} decision={}'.format(problem, probability, prediction))
|
|
||||||
# print('pred={} truth={}'.format(prediction, truth))
|
|
||||||
#
|
|
||||||
# y_prob.append(probability)
|
|
||||||
# y_pred.append(prediction)
|
|
||||||
# y_true.append(truth)
|
|
||||||
#
|
|
||||||
# acc_auc = evaluation(y_pred, y_prob, y_true)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
@ -74,7 +65,7 @@ if __name__ == '__main__':
|
||||||
outcomes = Parallel(n_jobs=-1)(delayed(doall)(problem,pos,neg,test,truth) for problem,pos,neg,test,truth in TaskGenerator(request))
|
outcomes = Parallel(n_jobs=-1)(delayed(doall)(problem,pos,neg,test,truth) for problem,pos,neg,test,truth in TaskGenerator(request))
|
||||||
y_pred, y_prob, y_true = [], [], []
|
y_pred, y_prob, y_true = [], [], []
|
||||||
for problem, probability, prediction, truth in outcomes:
|
for problem, probability, prediction, truth in outcomes:
|
||||||
fo.write('{} {:.3f}\n'.format(problem, probability))
|
fo.write('{} {}\n'.format(problem, probability))
|
||||||
y_pred.append(prediction)
|
y_pred.append(prediction)
|
||||||
y_prob.append(probability)
|
y_prob.append(probability)
|
||||||
y_true.append(truth)
|
y_true.append(truth)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue