import numpy as np
from joblib import Parallel, delayed

from vgfs.commons import XdotM, _normalize
from vgfs.viewGen import ViewGen


class WceGen(ViewGen):
    """View generating function based on Word-Class Embeddings (WCE).

    For every language a word-class matrix is learned from the vectorized
    documents and their labels (see ``wce_matrix``). Documents are then
    projected by combining their vectorized representation with that matrix
    through ``XdotM`` and normalizing the result with ``_normalize``.

    NOTE(review): ``self.vectorizer`` is read by ``fit`` and ``transform`` but
    is never assigned in this class -- presumably it is provided by
    ``ViewGen`` or attached by the caller before fitting; confirm.
    """

    def __init__(self, n_jobs=-1):
        """Create the generator.

        :param n_jobs: number of parallel jobs handed to ``joblib.Parallel``
            (``-1`` is joblib's "use all CPUs" setting).
        """
        print("- init Word-Class-Embeddings View Generating Function")
        # Honor the caller's choice; this used to be hard-coded to -1, which
        # silently ignored the constructor argument.
        self.n_jobs = n_jobs
        # Forwarded as the ``sif`` keyword of ``XdotM`` in ``transform``.
        self.sif = True

    def fit(self, lX, lY):
        """Learn one word-class embedding matrix per language.

        :param lX: mapping language -> raw documents; it is passed through
            ``self.vectorizer.transform`` before use.
        :param lY: mapping language -> label matrix aligned with ``lX``.
        :return: ``self``, to allow call chaining.
        """
        print("- fitting Word-Class-Embeddings View Generating Function")
        lX = self.vectorizer.transform(lX)
        # Sorted so the order of the parallel results is deterministic and
        # can be mapped back to languages by position.
        self.langs = sorted(lX.keys())
        wce = Parallel(n_jobs=self.n_jobs)(
            delayed(wce_matrix)(lX[lang], lY[lang]) for lang in self.langs
        )
        self.l_wce = {lang: wce[i] for i, lang in enumerate(self.langs)}
        return self

    def transform(self, lX):
        """Project documents of every fitted language through its WCE matrix.

        :param lX: mapping language -> raw documents; must contain every
            language seen during ``fit``.
        :return: mapping language -> projected documents, as returned by
            ``_normalize(..., l2=True)``.
        """
        lX = self.vectorizer.transform(lX)
        XdotWce = Parallel(n_jobs=self.n_jobs)(
            delayed(XdotM)(lX[lang], self.l_wce[lang], sif=self.sif)
            for lang in self.langs
        )
        lZ = {lang: XdotWce[i] for i, lang in enumerate(self.langs)}
        lZ = _normalize(lZ, l2=True)
        return lZ

    def fit_transform(self, lX, lY):
        """Fit on ``(lX, lY)`` and return the transformation of ``lX``."""
        return self.fit(lX, lY).transform(lX)

    def get_config(self):
        """Return the generator's settings as a plain dictionary."""
        return {
            "name": "Word-Class Embeddings VGF",
            "n_jobs": self.n_jobs,
            "sif": self.sif,
        }

    def __str__(self):
        _str = f"[WordClass VGF (w)]\n- sif: {self.sif}\n- n_jobs: {self.n_jobs}\n"
        return _str

    def save_vgf(self, model_id):
        """Pickle this object to ``models/vgfs/wordclass/wordClassGen_<model_id>.pkl``.

        The target directory is created if it does not exist.

        :param model_id: identifier embedded in the output file name.
        :return: ``self``.
        """
        import pickle
        from os.path import join
        from os import makedirs

        vgf_name = "wordClassGen"
        _basedir = join("models", "vgfs", "wordclass")
        makedirs(_basedir, exist_ok=True)
        _path = join(_basedir, f"{vgf_name}_{model_id}.pkl")
        with open(_path, "wb") as f:
            pickle.dump(self, f)
        return self


def wce_matrix(X, Y):
    """Compute the standardized word-class embedding matrix for one language.

    :param X: vectorized documents (see ``supervised_embeddings_tfidf``).
    :param Y: label matrix aligned with the rows of ``X``.
    :return: the supervised embeddings, z-scored along axis 0.
    """
    wce = supervised_embeddings_tfidf(X, Y)
    wce = zscores(wce, axis=0)
    return wce


def supervised_embeddings_tfidf(X, Y):
    """Correlate every feature of ``X`` with every column of ``Y``.

    Computes ``X.T.dot(Y)`` and divides each feature's row by that feature's
    total weight in ``X`` (column sums); zero totals are replaced by 1 to
    avoid a division by zero.

    NOTE(review): the in-place masking and the ``.T`` on the column sums rely
    on ``X.sum(axis=0)`` being 2-D, i.e. presumably ``X`` is a scipy sparse
    matrix of shape (n_docs, n_features) and ``Y`` is (n_docs, n_classes) --
    confirm against the vectorizer output.

    :param X: document-by-feature weight matrix.
    :param Y: document-by-class label matrix.
    :return: the result as a plain ``numpy.ndarray``.
    """
    tfidf_norm = X.sum(axis=0)
    tfidf_norm[tfidf_norm == 0] = 1
    F = (X.T).dot(Y) / tfidf_norm.T
    return np.asarray(F)


def zscores(X, axis=0):
    """Standardize ``X`` along ``axis`` without ever dividing by zero.

    ``scipy.stats.zscore`` does not guard against a zero standard deviation,
    which can indeed occur here; the standard deviation is therefore clipped
    from below at ``1e-5``.

    Note: the sample standard deviation (``ddof=1``) is used, so it is
    undefined (NaN) when ``X`` has a single entry along ``axis``.

    :param X: array to standardize.
    :param axis: axis along which mean and standard deviation are computed.
    :return: ``(X - mean) / std`` with the statistics broadcast along ``axis``.
    """
    # keepdims=True keeps the reduced axis so the statistics broadcast
    # correctly for any ``axis``, not only for axis=0.
    std = np.clip(np.std(X, ddof=1, axis=axis, keepdims=True), 1e-5, None)
    mean = np.mean(X, axis=axis, keepdims=True)
    return (X - mean) / std