diff --git a/src/data/embeddings.py b/src/data/embeddings.py
index 4b19b4a..91cb9ee 100644
--- a/src/data/embeddings.py
+++ b/src/data/embeddings.py
@@ -276,6 +276,7 @@ class StorageEmbeddings:
             self._add_emebeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs)
         return self
 
+
     def predict(self, config, docs):
         if config['supervised'] and config['unsupervised']:
             return self._concatenate_embeddings(docs)
@@ -288,3 +289,4 @@ class StorageEmbeddings:
         for lang in docs.keys():
             _r[lang] = docs[lang].dot(self.lang_U[lang])
         return _r
+
diff --git a/src/data/supervised.py b/src/data/supervised.py
index d2d7aab..b1faa2d 100755
--- a/src/data/supervised.py
+++ b/src/data/supervised.py
@@ -12,6 +12,7 @@ def zscores(x, axis=0): #scipy.stats.zscores does not avoid division by 0, which
 
 def supervised_embeddings_tfidf(X,Y):
     tfidf_norm = X.sum(axis=0)
+    tfidf_norm[tfidf_norm==0] = 1
     F = (X.T).dot(Y) / tfidf_norm.T
     return F
 
diff --git a/src/util/util.py b/src/util/util.py
new file mode 100644
index 0000000..e69de29