From 108f423d415c79e0e58b306f66ad191d85ec640d Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 26 Jan 2021 10:15:55 +0100 Subject: [PATCH] Implemented funnelling architecture --- refactor/util/SIF_embed.py | 5 ++++- refactor/util/common.py | 2 +- refactor/view_generators.py | 15 +++++++++++++-- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/refactor/util/SIF_embed.py b/refactor/util/SIF_embed.py index cfe096e..4a3d712 100644 --- a/refactor/util/SIF_embed.py +++ b/refactor/util/SIF_embed.py @@ -1,6 +1,7 @@ import numpy as np from sklearn.decomposition import TruncatedSVD + def get_weighted_average(We, x, w): """ Compute the weighted average vectors @@ -15,6 +16,7 @@ def get_weighted_average(We, x, w): emb[i,:] = w[i,:].dot(We[x[i,:],:]) / np.count_nonzero(w[i,:]) return emb + def compute_pc(X,npc=1): """ Compute the principal components. @@ -26,6 +28,7 @@ def compute_pc(X,npc=1): svd.fit(X) return svd.components_ + def remove_pc(X, npc=1): """ Remove the projection on the principal components @@ -34,7 +37,7 @@ def remove_pc(X, npc=1): :return: XX[i, :] is the data point after removing its projection """ pc = compute_pc(X, npc) - if npc==1: + if npc == 1: XX = X - X.dot(pc.transpose()) * pc else: XX = X - X.dot(pc.transpose()).dot(pc) diff --git a/refactor/util/common.py b/refactor/util/common.py index 3ffda78..a624528 100644 --- a/refactor/util/common.py +++ b/refactor/util/common.py @@ -368,4 +368,4 @@ def get_params(optimc=False): return None c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1] kernel = 'rbf' - return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}] \ No newline at end of file + return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}] diff --git a/refactor/view_generators.py b/refactor/view_generators.py index 579b8f1..2d82a20 100644 --- a/refactor/view_generators.py +++ b/refactor/view_generators.py @@ -41,17 +41,20 @@ class ViewGen(ABC): class VanillaFunGen(ViewGen): - def __init__(self, base_learner, n_jobs=-1): + def 
__init__(self, base_learner, first_tier_parameters=None, n_jobs=-1): """ Original funnelling architecture proposed by Moreo, Esuli and Sebastiani in DOI: https://doi.org/10.1145/3326065 :param base_learner: naive monolingual learners to be deployed as first-tier learners. Should be able to return posterior probabilities. + :param first_tier_parameters: parameters forwarded to the first-tier NaivePolylingualClassifier (default None) :param n_jobs: integer, number of concurrent workers """ super().__init__() self.learners = base_learner + self.first_tier_parameters = first_tier_parameters self.n_jobs = n_jobs - self.doc_projector = NaivePolylingualClassifier(self.learners) + self.doc_projector = NaivePolylingualClassifier(base_learner=self.learners, + parameters=self.first_tier_parameters, n_jobs=self.n_jobs) self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) def fit(self, lX, lY): @@ -61,8 +64,16 @@ class VanillaFunGen(ViewGen): return self def transform(self, lX): + """ + (1) Vectorize documents + (2) Project them according to the learners SVMs + (3) Apply L2 normalization to the projection + :param lX: + :return: + """ lX = self.vectorizer.transform(lX) lZ = self.doc_projector.predict_proba(lX) + lZ = _normalize(lZ, l2=True) return lZ def fit_transform(self, lX, ly):