# File listing banner: 75 lines, 2.0 KiB, Python
from sklearn.preprocessing import normalize
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.decomposition import TruncatedSVD
|
|
import numpy as np
|
|
|
|
|
|
def _normalize(lX, l2=True):
    """
    Optionally normalize every per-language matrix in a dictionary.

    :param lX: dict mapping a language key to its data matrix
    :param l2: when falsy, ``lX`` itself is returned untouched
    :return: a new dict with the same keys whose values are the result of
        sklearn's ``normalize`` (called with its default arguments) applied
        to ``np.asarray(X)``; or ``lX`` unchanged when ``l2`` is falsy
    """
    if not l2:
        return lX

    normalized = {}
    for lang, X in lX.items():
        # np.asarray converts array-likes (e.g. np.matrix) to a plain ndarray
        # before handing them to sklearn.
        normalized[lang] = normalize(np.asarray(X))
    return normalized
|
|
|
|
|
|
def XdotM(X, M, sif):
    """
    Project ``X`` through ``M`` and optionally strip the first principal component.

    :param X: object exposing ``.dot`` (X[i,:] is a data point)
    :param M: right-hand operand passed to ``X.dot``
    :param sif: when truthy, the product is post-processed with
        ``remove_pc(..., npc=1)``
    :return: ``X.dot(M)``, with its projection on the first principal
        component removed when ``sif`` is truthy
    """
    projected = X.dot(M)
    return remove_pc(projected, npc=1) if sif else projected
|
|
|
|
|
|
def remove_pc(X, npc=1):
    """
    Subtract from each data point its projection on the leading components.

    :param X: X[i,:] is a data point
    :param npc: number of components (as returned by ``compute_pc``) to remove
    :return: array in which row i is X[i,:] minus its projection on the
        ``npc`` components
    """
    components = compute_pc(X, npc)
    # Coefficient of every data point along every component.
    coefficients = X.dot(components.transpose())

    if npc == 1:
        # Single component: the original code uses element-wise `*`
        # between the coefficients and the component instead of `.dot`.
        return X - coefficients * components
    return X - coefficients.dot(components)
|
|
|
|
|
|
class TfidfVectorizerMultilingual:
    """
    Hold one independently fitted ``TfidfVectorizer`` per language.

    Inputs and outputs are dictionaries keyed by language; every vectorizer
    is built with the same keyword arguments given to the constructor.
    """

    def __init__(self, **kwargs):
        """
        :param kwargs: keyword arguments forwarded verbatim to every
            ``TfidfVectorizer`` created in ``fit``
        """
        self.kwargs = kwargs

    def fit(self, lX, ly=None):
        """
        Fit one vectorizer per language found in ``lX``.

        :param lX: dict mapping a language key to its collection of documents
        :param ly: unused; kept for signature compatibility
        :return: self
        """
        self.langs = sorted(lX.keys())
        fitted = {}
        for lang in self.langs:
            fitted[lang] = TfidfVectorizer(**self.kwargs).fit(lX[lang])
        self.vectorizer = fitted
        return self

    def transform(self, lX):
        """
        Vectorize the documents of every language seen during ``fit``.

        :param lX: dict mapping a language key to its collection of documents;
            it is indexed with every fitted language
        :return: dict mapping each fitted language to its transformed documents
        """
        transformed = {}
        for lang in self.langs:
            transformed[lang] = self.vectorizer[lang].transform(lX[lang])
        return transformed

    def fit_transform(self, lX, ly=None):
        """
        Fit on ``lX`` and return its transformation.

        :param lX: dict mapping a language key to its collection of documents
        :param ly: forwarded to ``fit``, which ignores it
        :return: same as ``transform(lX)``
        """
        self.fit(lX, ly)
        return self.transform(lX)

    def vocabulary(self, l=None):
        """
        Return the learned vocabulary.

        :param l: language key, or None for all languages
        :return: the ``vocabulary_`` of language ``l``; when ``l`` is None, a
            dict mapping every fitted language to its ``vocabulary_``
        """
        if l is not None:
            return self.vectorizer[l].vocabulary_
        return {lang: self.vectorizer[lang].vocabulary_ for lang in self.langs}

    def get_analyzer(self, l=None):
        """
        Return the analyzer built by the fitted vectorizer(s).

        :param l: language key, or None for all languages
        :return: ``build_analyzer()`` of language ``l``; when ``l`` is None, a
            dict mapping every fitted language to its analyzer
        """
        if l is not None:
            return self.vectorizer[l].build_analyzer()
        return {lang: self.vectorizer[lang].build_analyzer() for lang in self.langs}
|
|
|
|
|
|
def compute_pc(X, npc=1):
    """
    Compute the leading components of ``X`` with a truncated SVD.

    :param X: X[i,:] is a data point
    :param npc: number of components to compute
    :return: component_[i,:] is the i-th component
    """
    # np.matrix inputs are converted to a plain ndarray before fitting.
    data = np.asarray(X) if isinstance(X, np.matrix) else X

    # Fixed random_state keeps the randomized solver reproducible.
    decomposer = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    decomposer.fit(data)
    return decomposer.components_
|