# gFun/refactor/funnelling.py

from models.learners import MetaClassifier, SVC
from util.common import _normalize
from view_generators import VanillaFunGen


class DocEmbedderList:
    """
    Takes care of calling the fit and transform methods of every initialized embedder. Every ViewGenerator should be
    wrapped by this class so that the overall architecture can be trained seamlessly.
    """

    def __init__(self, embedder_list, probabilistic=True):
        """
        Init the DocEmbedderList.
        :param embedder_list: list of embedders (ViewGenerators) to be deployed
        :param probabilistic: whether to recast the view generators' outputs into vectors of posterior probabilities
        """
        assert len(embedder_list) != 0, 'Embedder list cannot be empty!'
        self.embedders = embedder_list
        self.probabilistic = probabilistic
        if probabilistic:
            # VanillaFunGen natively outputs posterior probabilities; every other
            # view generator is wrapped with FeatureSet2Posteriors.
            _tmp = []
            for embedder in self.embedders:
                if isinstance(embedder, VanillaFunGen):
                    _tmp.append(embedder)
                else:
                    _tmp.append(FeatureSet2Posteriors(embedder))
            self.embedders = _tmp

    def fit(self, lX, ly):
        """
        Fit all the ViewGenerators contained by the DocEmbedderList.
        :param lX: dict {lang: documents}
        :param ly: dict {lang: labels}
        :return: self
        """
        for embedder in self.embedders:
            embedder.fit(lX, ly)
        return self

    def transform(self, lX):
        """
        Project documents by means of every ViewGenerator. The projections are then averaged together and returned.
        :param lX: dict {lang: documents}
        :return: the common latent space, i.e., a dict {lang: averaged projection}
        """
        langs = sorted(lX.keys())
        lZparts = {lang: None for lang in langs}
        for embedder in self.embedders:
            lZ = embedder.transform(lX)
            for lang in langs:
                Z = lZ[lang]
                if lZparts[lang] is None:
                    lZparts[lang] = Z
                else:
                    lZparts[lang] += Z
        n_embedders = len(self.embedders)
        return {lang: lZparts[lang] / n_embedders for lang in langs}  # averaging the view-specific spaces

    def fit_transform(self, lX, ly):
        return self.fit(lX, ly).transform(lX)
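
# A sketch of the DocEmbedderList contract (viewgen_a and viewgen_b are
# hypothetical view generators): each maps {lang: X_lang} -> {lang: Z_lang},
# and transform() returns the element-wise average of the view-specific spaces:
#   embedders = DocEmbedderList([viewgen_a, viewgen_b])
#   lZ = embedders.fit_transform(lX, ly)  # lZ[lang] == (Z_a[lang] + Z_b[lang]) / 2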


class FeatureSet2Posteriors:
    """
    Takes care of recasting the features output by an embedder into vectors of posterior probabilities by means of
    a multiclass SVM.
    """

    def __init__(self, embedder, l2=True, n_jobs=-1):
        """
        Init the class.
        :param embedder: ViewGen, a view generator that does not natively output posterior probabilities.
        :param l2: bool, whether to apply L2 normalization to the projection
        :param n_jobs: int, number of concurrent workers.
        """
        self.embedder = embedder
        self.l2 = l2
        self.n_jobs = n_jobs
        self.prob_classifier = MetaClassifier(
            SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)

    def fit(self, lX, ly):
        lZ = self.embedder.fit_transform(lX, ly)
        self.prob_classifier.fit(lZ, ly)
        return self

    def transform(self, lX):
        # project with the wrapped embedder, then map the projections to
        # (optionally L2-normalized) posterior probabilities
        lP = self.predict_proba(lX)
        lP = _normalize(lP, self.l2)
        return lP

    def fit_transform(self, lX, ly):
        return self.fit(lX, ly).transform(lX)

    def predict(self, lX):
        lZ = self.embedder.transform(lX)
        return self.prob_classifier.predict(lZ)

    def predict_proba(self, lX):
        lZ = self.embedder.transform(lX)
        return self.prob_classifier.predict_proba(lZ)
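
# Using FeatureSet2Posteriors in isolation (a sketch; muse_gen stands for any
# hypothetical view generator exposing fit_transform/transform over {lang: X} dicts):
#   prob_gen = FeatureSet2Posteriors(muse_gen, l2=True)
#   lP = prob_gen.fit_transform(lX, ly)  # lP[lang]: (n_docs, n_classes) posteriors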


class Funnelling:
    """
    Funnelling architecture. It is composed of two tiers. The first tier is a set of heterogeneous document embedders.
    The second tier (i.e., the meta-classifier) classifies the documents in the common latent space computed by the
    first-tier learners.
    """

    def __init__(self, first_tier: DocEmbedderList, meta_classifier: MetaClassifier, n_jobs=-1):
        self.first_tier = first_tier
        self.meta = meta_classifier
        self.n_jobs = n_jobs

    def fit(self, lX, ly):
        print('## Fitting first-tier learners!')
        lZ = self.first_tier.fit_transform(lX, ly)
        print('## Fitting meta-learner!')
        self.meta.fit(lZ, ly)
        return self

    def predict(self, lX):
        lZ = self.first_tier.transform(lX)
        ly = self.meta.predict(lZ)
        return ly
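

if __name__ == '__main__':
    # Minimal wiring sketch (not part of the original module). The VanillaFunGen
    # constructor arguments and the data-loading step are assumptions: lX and ly
    # are dicts mapping language codes to documents and to label matrices.
    lX, ly = load_dataset()  # hypothetical loader returning {lang: docs}, {lang: labels}
    first_tier = DocEmbedderList([VanillaFunGen()], probabilistic=True)  # hypothetical no-arg init
    meta = MetaClassifier(SVC(kernel='rbf', probability=True, random_state=1), n_jobs=-1)
    gfun = Funnelling(first_tier=first_tier, meta_classifier=meta)
    gfun.fit(lX, ly)
    ly_pred = gfun.predict(lX)  # dict {lang: predicted labels}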