refactor

parent 73d1e70ae9
commit 1ba5e60031
@@ -251,10 +251,10 @@ class NaivePolylingualClassifier:
         assert self.model is not None, 'predict called before fit'
         assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in predict'
         if self.n_jobs == 1:
-            return {lang: self.model[lang].predict(lX[lang]) for lang in lX.keys()}
+            return {lang: self.model[lang].transform(lX[lang]) for lang in lX.keys()}
         else:
             langs = list(lX.keys())
-            scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].predict)(lX[lang]) for lang in langs)
+            scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].transform)(lX[lang]) for lang in langs)
             return {lang: scores[i] for i, lang in enumerate(langs)}

     def best_params(self):
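The hunk above fans the per-language calls out with joblib. A minimal sketch of that dispatch pattern, not part of the commit, with hypothetical names (`models`, `transform_multilingual`) and assuming each per-language model exposes the method being dispatched:

from joblib import Parallel, delayed

def transform_multilingual(models, lX, n_jobs=2):
    # lX maps a language code to its feature matrix; models maps it to a fitted estimator
    langs = list(lX.keys())
    if n_jobs == 1:
        return {lang: models[lang].transform(lX[lang]) for lang in langs}
    # one delayed call per language; results come back in the same order as langs
    outputs = Parallel(n_jobs=n_jobs)(delayed(models[lang].transform)(lX[lang]) for lang in langs)
    return {lang: outputs[i] for i, lang in enumerate(langs)}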
@@ -397,7 +397,7 @@ class FunnellingMultimodal(FunnellingPolylingualClassifier):

         if self.config['supervised'] or self.config['unsupervised']:
             self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly)
-            _embedding_space = self.embedding_space.predict(self.config, lX)
+            _embedding_space = self.embedding_space.transform(self.config, lX)
             if self.config['max_label_space'] == 0:
                 _cum_dimension = _embedding_space[list(_embedding_space.keys())[0]].shape[1]
                 if _cum_dimension - 300 > 0:
@@ -414,7 +414,7 @@ class FunnellingMultimodal(FunnellingPolylingualClassifier):
         _vertical_Zy = np.vstack([zy[lang] for lang in self.languages])

         self.standardizer = StandardizeTransformer()
-        _vertical_Z = self.standardizer.fit_predict(_vertical_Z)
+        _vertical_Z = self.standardizer.fit_transform(_vertical_Z)

         # todo testing ...
         # if self.config['post_pca']:
@@ -435,7 +435,7 @@ class FunnellingMultimodal(FunnellingPolylingualClassifier):
         lZ = self._projection(self.doc_projector, lX)

         if self.config['supervised'] or self.config['unsupervised']:
-            _embedding_space = self.embedding_space.predict(self.config, lX)
+            _embedding_space = self.embedding_space.transform(self.config, lX)

             for lang in lX.keys():
                 lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang]))
@@ -443,7 +443,7 @@ class FunnellingMultimodal(FunnellingPolylingualClassifier):
         for lang in lZ.keys():
             print(lZ[lang].shape)
             # todo testing
-            lZ[lang] = self.standardizer.predict(lZ[lang])
+            lZ[lang] = self.standardizer.transform(lZ[lang])
             # if self.config['post_pca']:
             #     print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
             #     lZ[lang] = self.pca_independent_space.transform(lZ[lang])
@@ -545,7 +545,7 @@ class PolylingualEmbeddingsClassifier:
         self.vectorize(lX)
         # config = {'unsupervised': False, 'supervised': True}
         self.embedding_space = StorageEmbeddings(self.wordembeddings_path).fit(self.config, lX, self.lang_word2idx, ly)
-        WEtr = self.embedding_space.predict(self.config, lX)
+        WEtr = self.embedding_space.transform(self.config, lX)
         # for lang in langs:
         #     WEtr.append(self.embed(lX[lang], lang))  # todo embed with other matrices
         #     Ytr.append(ly[lang])
@@ -567,9 +567,9 @@ class PolylingualEmbeddingsClassifier:
         assert self.model is not None, 'predict called before fit'
         self.vectorize(lX, prediction=True)
         langs = list(lX.keys())
-        lWEte = self.embedding_space.predict(self.config, lX)
+        lWEte = self.embedding_space.transform(self.config, lX)
         # lWEte = {lang:self.embed(lX[lang], lang) for lang in langs}  # parallelizing this may consume too much memory
-        return _joblib_transform_multiling(self.model.predict, lWEte, n_jobs=self.n_jobs)
+        return _joblib_transform_multiling(self.model.transform, lWEte, n_jobs=self.n_jobs)

     def predict_proba(self, lX):
         """
@@ -0,0 +1,215 @@
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+# from data.text_preprocessor import NLTKStemTokenizer
+from embeddings.supervised import supervised_embeddings_tfidf, zscores
+from learning.learners import NaivePolylingualClassifier, MonolingualClassifier, _joblib_transform_multiling
+import time
+from sklearn.decomposition import PCA
+from joblib import Parallel, delayed
+from scipy.sparse import issparse, vstack, hstack
+
+# ------------------------------------------------------------------
+# Data Processing
+# ------------------------------------------------------------------
+from transformers.StandardizeTransformer import StandardizeTransformer
+from util.SIF_embed import remove_pc
+
+
+class TfidfVectorizerMultilingual:
+
+    def __init__(self, **kwargs):
+        self.kwargs = kwargs
+
+    def fit(self, lX, ly=None):
+        self.langs = sorted(lX.keys())
+        self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs}
+        # tokenizer=NLTKStemTokenizer(l, verbose=True),
+        return self
+
+    def transform(self, lX):
+        return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs}
+
+    def fit_transform(self, lX, ly=None):
+        return self.fit(lX, ly).transform(lX)
+
+    def vocabulary(self, l=None):
+        if l is None:
+            return {l: self.vectorizer[l].vocabulary_ for l in self.langs}
+        else:
+            return self.vectorizer[l].vocabulary_
+
+# ------------------------------------------------------------------
+# Document Embeddings
+# ------------------------------------------------------------------
+class PosteriorProbabilitiesEmbedder:
+
+    def __init__(self, first_tier_learner, first_tier_parameters, n_jobs=-1):
+        self.first_tier_learner = first_tier_learner
+        self.first_tier_parameters = first_tier_parameters
+        self.n_jobs = n_jobs
+        self.doc_projector = NaivePolylingualClassifier(self.first_tier_learner,
+                                                        self.first_tier_parameters,
+                                                        n_jobs=n_jobs)
+
+    def fit(self, lX, lY):
+        print('fitting the projectors... {}'.format(lX.keys()))
+        self.doc_projector.fit(lX, lY)
+        return self
+
+    def transform(self, lX):
+        print('projecting the documents')
+        lZ = self.doc_projector.predict_proba(lX)
+        return lZ
+
+    def fit_transform(self, lX, ly=None):
+        return self.fit(lX, ly).transform(lX)
+
+    def best_params(self):
+        return self.doc_projector.best_params()
+
+
+class WordClassEmbedder:
+
+    def __init__(self, n_jobs=-1, max_label_space=300):
+        self.n_jobs = n_jobs
+        self.max_label_space = max_label_space
+
+    def fit(self, lX, ly):
+        self.langs = sorted(lX.keys())
+        WCE = Parallel(n_jobs=self.n_jobs)(
+            delayed(word_class_embedding_matrix)(lX[lang], ly[lang], self.max_label_space) for lang in self.langs
+        )
+        self.lWCE = {l: WCE[i] for i, l in enumerate(self.langs)}
+        return self
+
+    def transform(self, lX):
+        lWCE = self.lWCE
+        XdotWCE = Parallel(n_jobs=self.n_jobs)(
+            delayed(XdotM)(lX[lang], lWCE[lang]) for lang in self.langs
+        )
+        return {l: XdotWCE[i] for i, l in enumerate(self.langs)}
+
+    def fit_transform(self, lX, ly):
+        return self.fit(lX, ly).transform(lX)
+
+
+def word_class_embedding_matrix(X, Y, max_label_space=300):
+    print('computing supervised embeddings...')
+    WCE = supervised_embeddings_tfidf(X, Y)
+    WCE = zscores(WCE, axis=0)
+
+    nC = Y.shape[1]
+    if nC > max_label_space:
+        print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
+              f'Applying PCA(n_components={max_label_space})')
+        pca = PCA(n_components=max_label_space)
+        WCE = pca.fit(WCE).transform(WCE)
+
+    return WCE
+
+
+def XdotM(X, M):
+    # return X.dot(M)
+    E = X.dot(M)
+    E = remove_pc(E, npc=1)
+    return E
+
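For orientation only: `supervised_embeddings_tfidf` and `remove_pc` live in other modules of the repo and are not shown in this commit. A hedged, naive sketch of what a word-class embedding conceptually is, namely each term represented by its distribution over the labels; the function name and the dense-array assumption are mine, not the project's:

import numpy as np

def naive_word_class_embedding(X, Y):
    # X: (n_docs, n_terms) tf-idf matrix, Y: (n_docs, n_labels) binary label matrix, both dense
    counts = X.T.dot(Y)                                # (n_terms, n_labels) term-label mass
    totals = counts.sum(axis=1, keepdims=True) + 1e-9  # avoid division by zero for unseen terms
    return counts / totals                             # row i: distribution of term i over the labels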
+class DocEmbedderList:
+
+    def __init__(self, *embedder_list):
+        self.embedders = embedder_list
+
+    def fit(self, lX, ly):
+        for transformer in self.embedders:
+            transformer.fit(lX, ly)
+        return self
+
+    def transform(self, lX):
+        if len(self.embedders) == 1:
+            return self.embedders[0].transform(lX)
+
+        some_sparse = False
+        langs = sorted(lX.keys())
+
+        lZparts = {l: [] for l in langs}
+        for transformer in self.embedders:
+            lZ = transformer.transform(lX)
+            for l in langs:
+                Z = lZ[l]
+                some_sparse = some_sparse or issparse(Z)
+                lZparts[l].append(Z)
+
+        hstacker = hstack if some_sparse else np.hstack
+        return {l: hstacker(lZparts[l]) for l in langs}
+
+    def fit_transform(self, lX, ly):
+        return self.fit(lX, ly).transform(lX)
+
+    def best_params(self):
+        return {'todo'}
+
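A small illustration of the sparse-aware stacking choice in DocEmbedderList.transform above (toy matrices, not from the repo): scipy's hstack is used as soon as any per-language view is sparse, because np.hstack does not handle scipy sparse matrices correctly.

import numpy as np
from scipy.sparse import csr_matrix, issparse, hstack as sp_hstack

views = [csr_matrix(np.eye(3)), np.ones((3, 2))]                    # one sparse view, one dense view
stacker = sp_hstack if any(issparse(v) for v in views) else np.hstack
Z = stacker(views)
print(Z.shape)  # (3, 5)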
+# ------------------------------------------------------------------
+# Meta-Classifier
+# ------------------------------------------------------------------
+class MetaClassifier:
+
+    def __init__(self, meta_learner, meta_parameters, n_jobs=-1):
+        self.n_jobs = n_jobs
+        self.model = MonolingualClassifier(base_learner=meta_learner, parameters=meta_parameters, n_jobs=n_jobs)
+
+    def fit(self, lZ, ly):
+        tinit = time.time()
+        Z, y = self.stack(lZ, ly)
+        self.standardizer = StandardizeTransformer()
+        Z = self.standardizer.fit_transform(Z)
+        print('fitting the Z-space of shape={}'.format(Z.shape))
+        self.model.fit(Z, y)
+        self.time = time.time() - tinit
+
+    def stack(self, lZ, ly=None):
+        langs = list(lZ.keys())
+        Z = np.vstack([lZ[lang] for lang in langs])  # Z is the language-independent space
+        if ly is not None:
+            y = np.vstack([ly[lang] for lang in langs])
+            return Z, y
+        else:
+            return Z
+
+    def predict(self, lZ, ly=None):
+        lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs)
+        return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
+
+    def best_params(self):
+        return self.model.best_params()
+
+# ------------------------------------------------------------------
+# Ensembling
+# ------------------------------------------------------------------
+class Funnelling:
+
+    def __init__(self,
+                 vectorizer: TfidfVectorizerMultilingual,
+                 first_tier: DocEmbedderList,
+                 meta: MetaClassifier):
+        self.vectorizer = vectorizer
+        self.first_tier = first_tier
+        self.meta = meta
+        self.n_jobs = meta.n_jobs
+
+    def fit(self, lX, ly):
+        lX = self.vectorizer.fit_transform(lX, ly)
+        lZ = self.first_tier.fit_transform(lX, ly)
+        self.meta.fit(lZ, ly)
+
+    def predict(self, lX, ly=None):
+        lX = self.vectorizer.transform(lX)
+        lZ = self.first_tier.transform(lX)
+        ly_ = self.meta.predict(lZ)
+        return ly_
+
+    def best_params(self):
+        return {'1st-tier': self.first_tier.best_params(),
+                'meta': self.meta.best_params()}
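A hedged usage sketch, with toy data rather than the project's dataset, of the {language: documents} convention these classes share, using only the TfidfVectorizerMultilingual defined above; the full Funnelling pipeline is wired up in the main script further down.

lX = {'en': ['a first english document', 'another english document'],
      'it': ['un primo documento italiano', 'un altro documento italiano']}

vec = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
lX_tfidf = vec.fit_transform(lX)                  # {'en': sparse matrix, 'it': sparse matrix}
print({l: m.shape for l, m in lX_tfidf.items()})  # per-language (n_docs, n_features) shapes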
@@ -1,12 +1,16 @@
 import os
 from dataset_builder import MultilingualDataset
-from learning.learners import *
+# from learning.learners import *
+from learning.learners import FunnellingMultimodal
+from learning.transformers import Funnelling, PosteriorProbabilitiesEmbedder, MetaClassifier, \
+    TfidfVectorizerMultilingual, DocEmbedderList, WordClassEmbedder
 from util.evaluation import *
 from optparse import OptionParser
 from util.file import exists
 from util.results import PolylingualClassificationResults
 from sklearn.svm import SVC
 from util.util import get_learner, get_params
 from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

 parser = OptionParser()
@@ -53,7 +57,13 @@ parser.add_option("-l", dest="lang", type=str)


 def get_learner(calibrate=False, kernel='linear'):
-    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto')
+    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1,
+               # class_weight='balanced',
+               gamma='auto')


 def get_params(dense=False):
@@ -88,7 +98,7 @@ if __name__ == '__main__':
     if op.set_c != -1:
         meta_parameters = None
     else:
-        meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}]
+        meta_parameters = [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]

     # Embeddings and WCE config
     _available_mode = ['none', 'unsupervised', 'supervised', 'both']
@@ -126,13 +136,22 @@ if __name__ == '__main__':
     result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')

     print(f'### PolyEmbedd_andrea_{_config_id}\n')
-    classifier = FunnellingMultimodal(we_path=op.we_path,
-                                      config=config,
-                                      first_tier_learner=get_learner(calibrate=True),
-                                      meta_learner=get_learner(calibrate=False, kernel='rbf'),
-                                      first_tier_parameters=None,  # TODO get_params(dense=False) --> first_tier should not be optimized - or not?
-                                      meta_parameters=get_params(dense=True),
-                                      n_jobs=op.n_jobs)
+    # classifier = FunnellingMultimodal(we_path=op.we_path,
+    #                                   config=config,
+    #                                   first_tier_learner=get_learner(calibrate=True),
+    #                                   meta_learner=get_learner(calibrate=False, kernel='rbf'),
+    #                                   first_tier_parameters=None,  # TODO get_params(dense=False) --> first_tier should not be optimized - or not?
+    #                                   meta_parameters=get_params(dense=True),
+    #                                   n_jobs=op.n_jobs)
+
+    tfidfvectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
+    post_prob = PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None)
+    wce_proj = WordClassEmbedder()
+    doc_embedder = DocEmbedderList(post_prob, wce_proj)
+    # doc_embedder = DocEmbedderList(post_prob)
+    meta = MetaClassifier(meta_learner=SVC(), meta_parameters=get_params(dense=True))
+    classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)

     print('# Fitting ...')
     classifier.fit(lXtr, lytr)
@@ -145,8 +164,8 @@ if __name__ == '__main__':
         macrof1, microf1, macrok, microk = l_eval[lang]
         metrics.append([macrof1, microf1, macrok, microk])
         print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
-        results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
-                        (config['max_label_space'], classifier.best_components),
-                        config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
-                        lang, macrof1, microf1, macrok, microk, '')
+        # results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
+        #                 (config['max_label_space'], classifier.best_components),
+        #                 config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
+        #                 lang, macrof1, microf1, macrok, microk, '')
     print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
@@ -15,9 +15,9 @@ class StandardizeTransformer:
         print('done\n')
         return self

-    def predict(self, X):
+    def transform(self, X):
         if not self.yetfit: 'transform called before fit'
         return (X - self.mean) / self.std

-    def fit_predict(self, X):
-        return self.fit(X).predict(X)
+    def fit_transform(self, X):
+        return self.fit(X).transform(X)
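This last hunk renames the custom scaler's methods to the scikit-learn fit/transform convention. A hedged, self-contained sketch of the same contract, assuming column-wise z-scoring over dense numpy input; the class name and the epsilon guard are mine. Note that the `if not self.yetfit: 'transform called before fit'` line above is a bare string expression, presumably intended as an assert, which the sketch uses instead.

import numpy as np

class ZScorer:
    def fit(self, X):
        self.mean = X.mean(axis=0)
        self.std = X.std(axis=0) + 1e-9   # avoid division by zero on constant columns
        return self

    def transform(self, X):
        assert hasattr(self, 'mean'), 'transform called before fit'
        return (X - self.mean) / self.std

    def fit_transform(self, X):
        return self.fit(X).transform(X)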