from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import numpy as np


def _normalize(lX, l2=True):
    """L2-normalize each language-specific matrix in lX (assumes dense arrays)."""
    return {lang: normalize(np.asarray(X)) for lang, X in lX.items()} if l2 else lX


def compute_pc(X, npc=1):
    """
    Compute the principal components.
    :param X: X[i,:] is a data point
    :param npc: number of principal components to compute
    :return: component_[i,:] is the i-th pc
    """
    if isinstance(X, np.matrix):
        X = np.asarray(X)
    svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    svd.fit(X)
    return svd.components_


def remove_pc(X, npc=1):
    """
    Remove the projection on the principal components.
    :param X: X[i,:] is a data point
    :param npc: number of principal components to remove
    :return: XX[i,:] is the data point after removing its projection
    """
    pc = compute_pc(X, npc)
    if npc == 1:
        # rank-1 case: subtract the projection onto the first principal component
        XX = X - X.dot(pc.transpose()) * pc
    else:
        XX = X - X.dot(pc.transpose()).dot(pc)
    return XX


def XdotM(X, M, sif):
    """Project X onto the embedding matrix M; if sif is True, remove the
    first principal component from the result (SIF-style post-processing)."""
    E = X.dot(M)
    if sif:
        E = remove_pc(E, npc=1)
    return E


class TfidfVectorizerMultilingual:
    """Fits one TfidfVectorizer per language on a {language: documents} dictionary."""

    def __init__(self, **kwargs):
        self.kwargs = kwargs

    def fit(self, lX, ly=None):
        self.langs = sorted(lX.keys())
        self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs}
        return self

    def transform(self, lX):
        return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs}

    def fit_transform(self, lX, ly=None):
        return self.fit(lX, ly).transform(lX)

    def vocabulary(self, l=None):
        if l is None:
            return {l: self.vectorizer[l].vocabulary_ for l in self.langs}
        return self.vectorizer[l].vocabulary_

    def get_analyzer(self, l=None):
        if l is None:
            return {l: self.vectorizer[l].build_analyzer() for l in self.langs}
        return self.vectorizer[l].build_analyzer()
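

# Usage sketch (illustrative, not part of the original module): shows how the
# pieces above fit together, building a per-language TF-IDF space, projecting
# it through an embedding matrix with XdotM, and L2-normalizing the result.
# The toy corpora and the random embedding matrices below are stand-ins for
# real pretrained word embeddings (one row per vocabulary term).
if __name__ == "__main__":
    lX = {
        "en": ["the cat sat on the mat", "dogs chase cats"],
        "it": ["il gatto dorme sul tappeto", "i cani inseguono i gatti"],
    }

    vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True)
    lT = vectorizer.fit_transform(lX)  # {lang: (n_docs, |V_lang|) sparse tf-idf matrix}

    # Random embedding matrices, shape (|V_lang|, dim); sif=True applies the
    # first-principal-component removal to the projected documents.
    rng = np.random.default_rng(0)
    dim = 50
    lE = {l: XdotM(lT[l], rng.standard_normal((lT[l].shape[1], dim)), sif=True)
          for l in vectorizer.langs}

    lE = _normalize(lE, l2=True)
    for l, E in lE.items():
        print(l, E.shape)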