gFun/refactor/data/text_preprocessor.py

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from data.languages import NLTK_LANGMAP


def preprocess_documents(documents, lang):
    """Tokenize and stem each document, then drop those stems that appear in the
    NLTK stopword list for the given language."""
    tokenizer = NLTKStemTokenizer(lang, verbose=True)
    sw = stopwords.words(NLTK_LANGMAP[lang])
    return [' '.join([w for w in tokenizer(doc) if w not in sw]) for doc in documents]
class NLTKStemTokenizer(object):
    """Callable that word-tokenizes a document and stems every token with NLTK's
    SnowballStemmer, caching the stem of each distinct token."""

    def __init__(self, lang, verbose=False):
        if lang not in NLTK_LANGMAP:
            raise ValueError('Language %s is not supported in NLTK' % lang)
        self.verbose = verbose
        self.called = 0
        self.wnl = SnowballStemmer(NLTK_LANGMAP[lang])  # despite the name, this is a Snowball stemmer
        self.cache = {}  # token -> stem

    def __call__(self, doc):
        self.called += 1
        if self.verbose:
            print("\r\t\t[documents processed %d]" % (self.called), end="")
        tokens = word_tokenize(doc)
        stems = []
        for t in tokens:
            if t not in self.cache:
                self.cache[t] = self.wnl.stem(t)
            stems.append(self.cache[t])
        return stems
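

# A minimal usage sketch (illustration only, not part of the original module). It
# assumes NLTK_LANGMAP maps language codes such as 'en' to NLTK names like 'english',
# and that the NLTK 'punkt' and 'stopwords' resources have already been downloaded.
if __name__ == '__main__':
    docs = ["The cats were running quickly."]
    print(preprocess_documents(docs, 'en'))
    # Prints something like: ['cat run quick .']
    # Stopwords are removed, the remaining tokens are stemmed, and punctuation
    # tokens produced by word_tokenize are kept as-is.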