Alejandro Moreo Fernandez 2020-01-17 18:16:29 +01:00
parent 73d1e70ae9
commit 1ba5e60031
4 changed files with 260 additions and 26 deletions

learning/learners.py

@@ -251,10 +251,10 @@ class NaivePolylingualClassifier:
         assert self.model is not None, 'predict called before fit'
         assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in predict'
         if self.n_jobs == 1:
-            return {lang:self.model[lang].predict(lX[lang]) for lang in lX.keys()}
+            return {lang:self.model[lang].transform(lX[lang]) for lang in lX.keys()}
         else:
             langs = list(lX.keys())
-            scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].predict)(lX[lang]) for lang in langs)
+            scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].transform)(lX[lang]) for lang in langs)
             return {lang: scores[i] for i, lang in enumerate(langs)}

     def best_params(self):
@@ -397,7 +397,7 @@ class FunnellingMultimodal(FunnellingPolylingualClassifier):
         if self.config['supervised'] or self.config['unsupervised']:
             self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly)
-            _embedding_space = self.embedding_space.predict(self.config, lX)
+            _embedding_space = self.embedding_space.transform(self.config, lX)
             if self.config['max_label_space'] == 0:
                 _cum_dimension = _embedding_space[list(_embedding_space.keys())[0]].shape[1]
                 if _cum_dimension - 300 > 0:
@@ -414,7 +414,7 @@ class FunnellingMultimodal(FunnellingPolylingualClassifier):
         _vertical_Zy = np.vstack([zy[lang] for lang in self.languages])

         self.standardizer = StandardizeTransformer()
-        _vertical_Z = self.standardizer.fit_predict(_vertical_Z)
+        _vertical_Z = self.standardizer.fit_transform(_vertical_Z)

         # todo testing ...
         # if self.config['post_pca']:
@@ -435,7 +435,7 @@ class FunnellingMultimodal(FunnellingPolylingualClassifier):
         lZ = self._projection(self.doc_projector, lX)
         if self.config['supervised'] or self.config['unsupervised']:
-            _embedding_space = self.embedding_space.predict(self.config, lX)
+            _embedding_space = self.embedding_space.transform(self.config, lX)
             for lang in lX.keys():
                 lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang]))
@@ -443,7 +443,7 @@ class FunnellingMultimodal(FunnellingPolylingualClassifier):
         for lang in lZ.keys():
             print(lZ[lang].shape)
             # todo testing
-            lZ[lang] = self.standardizer.predict(lZ[lang])
+            lZ[lang] = self.standardizer.transform(lZ[lang])
             # if self.config['post_pca']:
             #     print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
             #     lZ[lang] = self.pca_independent_space.transform(lZ[lang])
@@ -545,7 +545,7 @@ class PolylingualEmbeddingsClassifier:
         self.vectorize(lX)
         # config = {'unsupervised' : False, 'supervised': True}
         self.embedding_space = StorageEmbeddings(self.wordembeddings_path).fit(self.config, lX, self.lang_word2idx, ly)
-        WEtr = self.embedding_space.predict(self.config, lX)
+        WEtr = self.embedding_space.transform(self.config, lX)
         # for lang in langs:
         #     WEtr.append(self.embed(lX[lang], lang))  # todo embed with other matrices
         #     Ytr.append(ly[lang])
@@ -567,9 +567,9 @@ class PolylingualEmbeddingsClassifier:
         assert self.model is not None, 'predict called before fit'
         self.vectorize(lX, prediction=True)
         langs = list(lX.keys())
-        lWEte = self.embedding_space.predict(self.config, lX)
+        lWEte = self.embedding_space.transform(self.config, lX)
         # lWEte = {lang:self.embed(lX[lang], lang) for lang in langs}  # parallelizing this may consume too much memory
-        return _joblib_transform_multiling(self.model.predict, lWEte, n_jobs=self.n_jobs)
+        return _joblib_transform_multiling(self.model.transform, lWEte, n_jobs=self.n_jobs)

     def predict_proba(self, lX):
         """

learning/transformers.py (new file)

@@ -0,0 +1,215 @@
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# from data.text_preprocessor import NLTKStemTokenizer
from embeddings.supervised import supervised_embeddings_tfidf, zscores
from learning.learners import NaivePolylingualClassifier, MonolingualClassifier, _joblib_transform_multiling
import time
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
from scipy.sparse import issparse, vstack, hstack
from transformers.StandardizeTransformer import StandardizeTransformer
from util.SIF_embed import remove_pc


# ------------------------------------------------------------------
# Data Processing
# ------------------------------------------------------------------

class TfidfVectorizerMultilingual:

    def __init__(self, **kwargs):
        self.kwargs = kwargs

    def fit(self, lX, ly=None):
        self.langs = sorted(lX.keys())
        self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs}
        # tokenizer=NLTKStemTokenizer(l, verbose=True),
        return self

    def transform(self, lX):
        return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs}

    def fit_transform(self, lX, ly=None):
        return self.fit(lX, ly).transform(lX)

    def vocabulary(self, l=None):
        if l is None:
            return {l: self.vectorizer[l].vocabulary_ for l in self.langs}
        else:
            return self.vectorizer[l].vocabulary_
# ------------------------------------------------------------------
# Document Embeddings
# ------------------------------------------------------------------

class PosteriorProbabilitiesEmbedder:

    def __init__(self, first_tier_learner, first_tier_parameters, n_jobs=-1):
        self.first_tier_learner = first_tier_learner
        self.first_tier_parameters = first_tier_parameters
        self.n_jobs = n_jobs
        self.doc_projector = NaivePolylingualClassifier(self.first_tier_learner,
                                                        self.first_tier_parameters,
                                                        n_jobs=n_jobs)

    def fit(self, lX, lY):
        print('fitting the projectors... {}'.format(lX.keys()))
        self.doc_projector.fit(lX, lY)
        return self

    def transform(self, lX):
        print('projecting the documents')
        lZ = self.doc_projector.predict_proba(lX)
        return lZ

    def fit_transform(self, lX, ly=None):
        return self.fit(lX, ly).transform(lX)

    def best_params(self):
        return self.doc_projector.best_params()
class WordClassEmbedder:

    def __init__(self, n_jobs=-1, max_label_space=300):
        self.n_jobs = n_jobs
        self.max_label_space = max_label_space

    def fit(self, lX, ly):
        self.langs = sorted(lX.keys())
        WCE = Parallel(n_jobs=self.n_jobs)(
            delayed(word_class_embedding_matrix)(lX[lang], ly[lang], self.max_label_space) for lang in self.langs
        )
        self.lWCE = {l: WCE[i] for i, l in enumerate(self.langs)}
        return self

    def transform(self, lX):
        # project the documents onto the word-class embedding matrices learned at fit time
        lWCE = self.lWCE
        XdotWCE = Parallel(n_jobs=self.n_jobs)(
            delayed(XdotM)(lX[lang], lWCE[lang]) for lang in self.langs
        )
        return {l: XdotWCE[i] for i, l in enumerate(self.langs)}

    def fit_transform(self, lX, ly):
        return self.fit(lX, ly).transform(lX)
def word_class_embedding_matrix(X, Y, max_label_space=300):
    print('computing supervised embeddings...')
    WCE = supervised_embeddings_tfidf(X, Y)
    WCE = zscores(WCE, axis=0)

    nC = Y.shape[1]
    if nC > max_label_space:
        print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
              f'Applying PCA(n_components={max_label_space})')
        pca = PCA(n_components=max_label_space)
        WCE = pca.fit_transform(WCE)

    return WCE


def XdotM(X, M):
    # project documents X onto the embedding matrix M, then remove the first
    # principal component (SIF-style common-component removal)
    E = X.dot(M)
    E = remove_pc(E, npc=1)
    return E
class DocEmbedderList:

    def __init__(self, *embedder_list):
        self.embedders = embedder_list

    def fit(self, lX, ly):
        for transformer in self.embedders:
            transformer.fit(lX, ly)
        return self

    def transform(self, lX):
        if len(self.embedders) == 1:
            return self.embedders[0].transform(lX)

        some_sparse = False
        langs = sorted(lX.keys())
        lZparts = {l: [] for l in langs}

        for transformer in self.embedders:
            lZ = transformer.transform(lX)
            for l in langs:
                Z = lZ[l]
                some_sparse = some_sparse or issparse(Z)
                lZparts[l].append(Z)

        hstacker = hstack if some_sparse else np.hstack
        return {l: hstacker(lZparts[l]) for l in langs}

    def fit_transform(self, lX, ly):
        return self.fit(lX, ly).transform(lX)

    def best_params(self):
        return {'todo'}
# ------------------------------------------------------------------
# Meta-Classifier
# ------------------------------------------------------------------

class MetaClassifier:

    def __init__(self, meta_learner, meta_parameters, n_jobs=-1):
        self.n_jobs = n_jobs
        self.model = MonolingualClassifier(base_learner=meta_learner, parameters=meta_parameters, n_jobs=n_jobs)

    def fit(self, lZ, ly):
        tinit = time.time()
        Z, y = self.stack(lZ, ly)

        self.standardizer = StandardizeTransformer()
        Z = self.standardizer.fit_transform(Z)

        print('fitting the Z-space of shape={}'.format(Z.shape))
        self.model.fit(Z, y)
        self.time = time.time() - tinit

    def stack(self, lZ, ly=None):
        langs = list(lZ.keys())
        Z = np.vstack([lZ[lang] for lang in langs])  # Z is the language-independent space
        if ly is not None:
            y = np.vstack([ly[lang] for lang in langs])
            return Z, y
        else:
            return Z

    def predict(self, lZ, ly=None):
        lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs)
        return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)

    def best_params(self):
        return self.model.best_params()
# ------------------------------------------------------------------
# Ensembling
# ------------------------------------------------------------------

class Funnelling:

    def __init__(self,
                 vectorizer: TfidfVectorizerMultilingual,
                 first_tier: DocEmbedderList,
                 meta: MetaClassifier):
        self.vectorizer = vectorizer
        self.first_tier = first_tier
        self.meta = meta
        self.n_jobs = meta.n_jobs

    def fit(self, lX, ly):
        lX = self.vectorizer.fit_transform(lX, ly)
        lZ = self.first_tier.fit_transform(lX, ly)
        self.meta.fit(lZ, ly)

    def predict(self, lX, ly=None):
        lX = self.vectorizer.transform(lX)
        lZ = self.first_tier.transform(lX)
        ly_ = self.meta.predict(lZ)
        return ly_

    def best_params(self):
        return {'1st-tier': self.first_tier.best_params(),
                'meta': self.meta.best_params()}

View File

@@ -1,12 +1,16 @@
 import os
 from dataset_builder import MultilingualDataset
-from learning.learners import *
+# from learning.learners import *
+from learning.learners import FunnellingMultimodal
+from learning.transformers import Funnelling, PosteriorProbabilitiesEmbedder, MetaClassifier, \
+    TfidfVectorizerMultilingual, DocEmbedderList, WordClassEmbedder
 from util.evaluation import *
 from optparse import OptionParser
 from util.file import exists
 from util.results import PolylingualClassificationResults
 from sklearn.svm import SVC
 from util.util import get_learner, get_params
+from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

 parser = OptionParser()
@ -53,7 +57,13 @@ parser.add_option("-l", dest="lang", type=str)
def get_learner(calibrate=False, kernel='linear'): def get_learner(calibrate=False, kernel='linear'):
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto') return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1,
# class_weight='balanced',
gamma='auto')
def get_params(dense=False): def get_params(dense=False):
@@ -88,7 +98,7 @@ if __name__ == '__main__':
     if op.set_c != -1:
         meta_parameters = None
     else:
-        meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}]
+        meta_parameters = [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]

     # Embeddings and WCE config
     _available_mode = ['none', 'unsupervised', 'supervised', 'both']
@@ -126,13 +136,22 @@ if __name__ == '__main__':
     result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
     print(f'### PolyEmbedd_andrea_{_config_id}\n')

-    classifier = FunnellingMultimodal(we_path=op.we_path,
-                                      config=config,
-                                      first_tier_learner=get_learner(calibrate=True),
-                                      meta_learner=get_learner(calibrate=False, kernel='rbf'),
-                                      first_tier_parameters=None,  # TODO get_params(dense=False),--> first_tier should not be optimized - or not?
-                                      meta_parameters=get_params(dense=True),
-                                      n_jobs=op.n_jobs)
+    # classifier = FunnellingMultimodal(we_path=op.we_path,
+    #                                   config=config,
+    #                                   first_tier_learner=get_learner(calibrate=True),
+    #                                   meta_learner=get_learner(calibrate=False, kernel='rbf'),
+    #                                   first_tier_parameters=None,  # TODO get_params(dense=False),--> first_tier should not be optimized - or not?
+    #                                   meta_parameters=get_params(dense=True),
+    #                                   n_jobs=op.n_jobs)
+
+    tfidfvectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
+    post_prob = PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None)
+    wce_proj = WordClassEmbedder()
+    doc_embedder = DocEmbedderList(post_prob, wce_proj)
+    # doc_embedder = DocEmbedderList(post_prob)
+    meta = MetaClassifier(meta_learner=SVC(), meta_parameters=get_params(dense=True))
+    classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)

     print('# Fitting ...')
     classifier.fit(lXtr, lytr)
@@ -145,8 +164,8 @@ if __name__ == '__main__':
         macrof1, microf1, macrok, microk = l_eval[lang]
         metrics.append([macrof1, microf1, macrok, microk])
         print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
-        results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
-                        (config['max_label_space'], classifier.best_components),
-                        config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
-                        lang, macrof1, microf1, macrok, microk, '')
+        # results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
+        #                 (config['max_label_space'], classifier.best_components),
+        #                 config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
+        #                 lang, macrof1, microf1, macrok, microk, '')
     print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))

transformers/StandardizeTransformer.py

@@ -15,9 +15,9 @@ class StandardizeTransformer:
         print('done\n')
         return self

-    def predict(self, X):
+    def transform(self, X):
         if not self.yetfit: 'transform called before fit'
         return (X - self.mean) / self.std

-    def fit_predict(self, X):
-        return self.fit(X).predict(X)
+    def fit_transform(self, X):
+        return self.fit(X).transform(X)
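The renames in this last file align StandardizeTransformer with the fit/transform/fit_transform naming contract that the new transformers module (and scikit-learn) rely on. A minimal usage sketch, with toy NumPy arrays standing in for the stacked Z-space:

    import numpy as np
    from transformers.StandardizeTransformer import StandardizeTransformer

    Ztr = np.random.rand(100, 10)            # toy training Z-space
    Zte = np.random.rand(20, 10)             # toy test Z-space

    standardizer = StandardizeTransformer()
    Ztr = standardizer.fit_transform(Ztr)    # learn mean/std on training data, then scale it
    Zte = standardizer.transform(Zte)        # reuse the fitted statistics at prediction time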