from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from data.languages import NLTK_LANGMAP

def preprocess_documents(documents, lang):
    """Tokenize and stem every document in `documents`, dropping stopwords.

    Returns one space-joined string of stemmed tokens per input document.
    """
    tokenize = NLTKStemTokenizer(lang, verbose=True)
    sw = stopwords.words(NLTK_LANGMAP[lang])
    # note: stopword filtering is applied to the already-stemmed tokens
    return [' '.join([w for w in tokenize(doc) if w not in sw]) for doc in documents]

class NLTKStemTokenizer(object):
    """Callable that splits a document into word tokens and stems each token
    with the NLTK Snowball stemmer for the given language."""

    def __init__(self, lang, verbose=False):
        if lang not in NLTK_LANGMAP:
            raise ValueError('Language %s is not supported in NLTK' % lang)
        self.verbose = verbose
        self.called = 0  # number of documents processed so far
        self.wnl = SnowballStemmer(NLTK_LANGMAP[lang])
        self.cache = {}  # token -> stem, so each distinct token is stemmed only once

    def __call__(self, doc):
        self.called += 1
        if self.verbose:
            # \r keeps the progress counter on a single console line
            print("\r\t\t[documents processed %d]" % self.called, end="")
        tokens = word_tokenize(doc)
        stems = []
        for t in tokens:
            if t not in self.cache:
                self.cache[t] = self.wnl.stem(t)
            stems.append(self.cache[t])
        return stems
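

# Usage sketch (not part of the original module): it assumes NLTK_LANGMAP maps
# a code such as 'en' to an NLTK language name such as 'english', and that the
# NLTK 'punkt' tokenizer models and 'stopwords' corpus are installed, e.g. via
# nltk.download('punkt') and nltk.download('stopwords').
if __name__ == '__main__':
    docs = [
        "The cats are running faster than the dogs.",
        "Stemming reduces inflected words to their root forms.",
    ]
    processed = preprocess_documents(docs, 'en')  # 'en' is a hypothetical key
    print()  # terminate the \r progress line printed by the tokenizer
    print(processed)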