diff --git a/src/learning/learners.py b/src/learning/learners.py
index 2a42666..ee87f2c 100644
--- a/src/learning/learners.py
+++ b/src/learning/learners.py
@@ -251,10 +251,10 @@ class NaivePolylingualClassifier:
         assert self.model is not None, 'predict called before fit'
         assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in predict'
         if self.n_jobs == 1:
-            return {lang:self.model[lang].predict(lX[lang]) for lang in lX.keys()}
+            return {lang:self.model[lang].transform(lX[lang]) for lang in lX.keys()}
         else:
             langs = list(lX.keys())
-            scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].predict)(lX[lang]) for lang in langs)
+            scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].transform)(lX[lang]) for lang in langs)
             return {lang: scores[i] for i, lang in enumerate(langs)}
 
     def best_params(self):
@@ -397,7 +397,7 @@ class FunnellingMultimodal(FunnellingPolylingualClassifier):
 
         if self.config['supervised'] or self.config['unsupervised']:
             self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly)
-            _embedding_space = self.embedding_space.predict(self.config, lX)
+            _embedding_space = self.embedding_space.transform(self.config, lX)
             if self.config['max_label_space'] == 0:
                 _cum_dimension = _embedding_space[list(_embedding_space.keys())[0]].shape[1]
                 if _cum_dimension - 300 > 0:
@@ -414,7 +414,7 @@ class FunnellingMultimodal(FunnellingPolylingualClassifier):
         _vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
 
         self.standardizer = StandardizeTransformer()
-        _vertical_Z = self.standardizer.fit_predict(_vertical_Z)
+        _vertical_Z = self.standardizer.fit_transform(_vertical_Z)
 
         # todo testing ...
         # if self.config['post_pca']:
@@ -435,7 +435,7 @@ class FunnellingMultimodal(FunnellingPolylingualClassifier):
         lZ = self._projection(self.doc_projector, lX)
 
         if self.config['supervised'] or self.config['unsupervised']:
-            _embedding_space = self.embedding_space.predict(self.config, lX)
+            _embedding_space = self.embedding_space.transform(self.config, lX)
             for lang in lX.keys():
                 lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang]))
 
@@ -443,7 +443,7 @@ class FunnellingMultimodal(FunnellingPolylingualClassifier):
         for lang in lZ.keys():
             print(lZ[lang].shape)
             # todo testing
-            lZ[lang] = self.standardizer.predict(lZ[lang])
+            lZ[lang] = self.standardizer.transform(lZ[lang])
             # if self.config['post_pca']:
             #     print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
             #     lZ[lang] = self.pca_independent_space.transform(lZ[lang])
@@ -545,7 +545,7 @@ class PolylingualEmbeddingsClassifier:
         self.vectorize(lX)
         # config = {'unsupervised' : False, 'supervised': True}
         self.embedding_space = StorageEmbeddings(self.wordembeddings_path).fit(self.config, lX, self.lang_word2idx, ly)
-        WEtr = self.embedding_space.predict(self.config, lX)
+        WEtr = self.embedding_space.transform(self.config, lX)
         # for lang in langs:
         #     WEtr.append(self.embed(lX[lang], lang))  # todo embed with other matrices
         #     Ytr.append(ly[lang])
@@ -567,9 +567,9 @@
         assert self.model is not None, 'predict called before fit'
         self.vectorize(lX, prediction=True)
         langs = list(lX.keys())
-        lWEte = self.embedding_space.predict(self.config, lX)
+        lWEte = self.embedding_space.transform(self.config, lX)
         # lWEte = {lang:self.embed(lX[lang], lang) for lang in langs}  # parallelizing this may consume too much memory
-        return _joblib_transform_multiling(self.model.predict, lWEte, n_jobs=self.n_jobs)
+        return _joblib_transform_multiling(self.model.transform, lWEte, n_jobs=self.n_jobs)
 
     def predict_proba(self, lX):
         """
diff --git a/src/learning/transformers.py b/src/learning/transformers.py
new file mode 100644
index 0000000..9aa861a
--- /dev/null
+++ b/src/learning/transformers.py
@@ -0,0 +1,215 @@
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+# from data.text_preprocessor import NLTKStemTokenizer
+from embeddings.supervised import supervised_embeddings_tfidf, zscores
+from learning.learners import NaivePolylingualClassifier, MonolingualClassifier, _joblib_transform_multiling
+import time
+from sklearn.decomposition import PCA
+from joblib import Parallel, delayed
+from scipy.sparse import issparse, vstack, hstack
+
+# ------------------------------------------------------------------
+# Data Processing
+# ------------------------------------------------------------------
+from transformers.StandardizeTransformer import StandardizeTransformer
+from util.SIF_embed import remove_pc
+
+
+class TfidfVectorizerMultilingual:
+
+    def __init__(self, **kwargs):
+        self.kwargs = kwargs
+
+    def fit(self, lX, ly=None):
+        self.langs = sorted(lX.keys())
+        self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs}
+        # tokenizer=NLTKStemTokenizer(l, verbose=True),
+        return self
+
+    def transform(self, lX):
+        return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs}
+
+    def fit_transform(self, lX, ly=None):
+        return self.fit(lX, ly).transform(lX)
+
+    def vocabulary(self, l=None):
+        if l is None:
+            return {l: self.vectorizer[l].vocabulary_ for l in self.langs}
+        else:
+            return self.vectorizer[l].vocabulary_
+
+
+# ------------------------------------------------------------------
+# Document Embeddings
+# ------------------------------------------------------------------
+class PosteriorProbabilitiesEmbedder:
+
+    def __init__(self, first_tier_learner, first_tier_parameters,
+                 n_jobs=-1):
+        self.first_tier_learner = first_tier_learner
+        self.first_tier_parameters = first_tier_parameters
+        self.n_jobs = n_jobs
+        self.doc_projector = NaivePolylingualClassifier(self.first_tier_learner,
+                                                        self.first_tier_parameters,
+                                                        n_jobs=n_jobs)
+
+    def fit(self, lX, lY):
+        print('fitting the projectors... {}'.format(lX.keys()))
+        self.doc_projector.fit(lX, lY)
+        return self
+
+    def transform(self, lX):
+        print('projecting the documents')
+        lZ = self.doc_projector.predict_proba(lX)
+        return lZ
+
+    def fit_transform(self, lX, ly=None):
+        return self.fit(lX, ly).transform(lX)
+
+    def best_params(self):
+        return self.doc_projector.best_params()
+
+
+class WordClassEmbedder:
+
+    def __init__(self, n_jobs=-1, max_label_space=300):
+        self.n_jobs = n_jobs
+        self.max_label_space = max_label_space
+
+    def fit(self, lX, ly):
+        self.langs = sorted(lX.keys())
+        WCE = Parallel(n_jobs=self.n_jobs)(
+            delayed(word_class_embedding_matrix)(lX[lang], ly[lang], self.max_label_space) for lang in self.langs
+        )
+        self.lWCE = {l: WCE[i] for i, l in enumerate(self.langs)}
+        return self
+
+    def transform(self, lX):
+        lWCE = self.lWCE
+        XdotWCE = Parallel(n_jobs=self.n_jobs)(
+            delayed(XdotM)(lX[lang], lWCE[lang]) for lang in self.langs
+        )
+        return {l: XdotWCE[i] for i, l in enumerate(self.langs)}
+
+    def fit_transform(self, lX, ly):
+        return self.fit(lX, ly).transform(lX)
+
+
+def word_class_embedding_matrix(X, Y, max_label_space=300):
+    print('computing supervised embeddings...')
+    WCE = supervised_embeddings_tfidf(X, Y)
+    WCE = zscores(WCE, axis=0)
+
+    nC = Y.shape[1]
+    if nC > max_label_space:
+        print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
+              f'Applying PCA(n_components={max_label_space})')
+        pca = PCA(n_components=max_label_space)
+        WCE = pca.fit(WCE).transform(WCE)
+
+    return WCE
+
+
+def XdotM(X, M):
+    # return X.dot(M)
+    E = X.dot(M)
+    E = remove_pc(E, npc=1)
+    return E
+
+
+class DocEmbedderList:
+
+    def __init__(self, *embedder_list):
+        self.embedders = embedder_list
+
+    def fit(self, lX, ly):
+        for transformer in self.embedders:
+            transformer.fit(lX, ly)
+        return self
+
+    def transform(self, lX):
+        if len(self.embedders) == 1:
+            return self.embedders[0].transform(lX)
+
+        some_sparse = False
+        langs = sorted(lX.keys())
+
+        lZparts = {l: [] for l in langs}
+        for transformer in self.embedders:
+            lZ = transformer.transform(lX)
+            for l in langs:
+                Z = lZ[l]
+                some_sparse = some_sparse or issparse(Z)
+                lZparts[l].append(Z)
+
+        hstacker = hstack if some_sparse else np.hstack
+        return {l: hstacker(lZparts[l]) for l in langs}
+
+    def fit_transform(self, lX, ly):
+        return self.fit(lX, ly).transform(lX)
+
+    def best_params(self):
+        return {'todo'}
+
+
+# ------------------------------------------------------------------
+# Meta-Classifier
+# ------------------------------------------------------------------
+class MetaClassifier:
+
+    def __init__(self, meta_learner, meta_parameters, n_jobs=-1):
+        self.n_jobs = n_jobs
+        self.model = MonolingualClassifier(base_learner=meta_learner, parameters=meta_parameters, n_jobs=n_jobs)
+
+    def fit(self, lZ, ly):
+        tinit = time.time()
+        Z, y = self.stack(lZ, ly)
+        self.standardizer = StandardizeTransformer()
+        Z = self.standardizer.fit_transform(Z)
+        print('fitting the Z-space of shape={}'.format(Z.shape))
+        self.model.fit(Z, y)
+        self.time = time.time() - tinit
+
+    def stack(self, lZ, ly=None):
+        langs = list(lZ.keys())
+        Z = np.vstack([lZ[lang] for lang in langs])  # Z is the language-independent space
+        if ly is not None:
+            y = np.vstack([ly[lang] for lang in langs])
+            return Z, y
+        else:
+            return Z
+
+    def predict(self, lZ, ly=None):
+        lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs)
+        return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
+
+    def best_params(self):
+        return self.model.best_params()
+
+
+# ------------------------------------------------------------------
+# Ensembling
+# ------------------------------------------------------------------
+class Funnelling:
+    def __init__(self,
+                 vectorizer: TfidfVectorizerMultilingual,
+                 first_tier: DocEmbedderList,
+                 meta: MetaClassifier):
+        self.vectorizer = vectorizer
+        self.first_tier = first_tier
+        self.meta = meta
+        self.n_jobs = meta.n_jobs
+
+    def fit(self, lX, ly):
+        lX = self.vectorizer.fit_transform(lX, ly)
+        lZ = self.first_tier.fit_transform(lX, ly)
+        self.meta.fit(lZ, ly)
+
+    def predict(self, lX, ly=None):
+        lX = self.vectorizer.transform(lX)
+        lZ = self.first_tier.transform(lX)
+        ly_ = self.meta.predict(lZ)
+        return ly_
+
+    def best_params(self):
+        return {'1st-tier': self.first_tier.best_params(),
+                'meta': self.meta.best_params()}
+
diff --git a/src/main_multimodal_cls.py b/src/main_multimodal_cls.py
index ee3a5f6..71de089 100644
--- a/src/main_multimodal_cls.py
+++ b/src/main_multimodal_cls.py
@@ -1,12 +1,16 @@
 import os
 from dataset_builder import MultilingualDataset
-from learning.learners import *
+# from learning.learners import *
+from learning.learners import FunnellingMultimodal
+from learning.transformers import Funnelling, PosteriorProbabilitiesEmbedder, MetaClassifier, \
+    TfidfVectorizerMultilingual, DocEmbedderList, WordClassEmbedder
 from util.evaluation import *
 from optparse import OptionParser
 from util.file import exists
 from util.results import PolylingualClassificationResults
 from sklearn.svm import SVC
 from util.util import get_learner, get_params
+from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
 
 
 parser = OptionParser()
@@ -53,7 +57,13 @@ parser.add_option("-l", dest="lang", type=str)
 
 
 def get_learner(calibrate=False, kernel='linear'):
-    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto')
+    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1,
+
+
+               # class_weight='balanced',
+
+
+               gamma='auto')
 
 
 def get_params(dense=False):
@@ -88,7 +98,7 @@ if __name__ == '__main__':
     if op.set_c != -1:
         meta_parameters = None
     else:
-        meta_parameters = [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
+        meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}]
 
     # Embeddings and WCE config
     _available_mode = ['none', 'unsupervised', 'supervised', 'both']
@@ -126,13 +136,22 @@ if __name__ == '__main__':
     result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
 
     print(f'### PolyEmbedd_andrea_{_config_id}\n')
-    classifier = FunnellingMultimodal(we_path=op.we_path,
-                                      config=config,
-                                      first_tier_learner=get_learner(calibrate=True),
-                                      meta_learner=get_learner(calibrate=False, kernel='rbf'),
-                                      first_tier_parameters=None,  # TODO get_params(dense=False),--> first_tier should not be optimized - or not?
-                                      meta_parameters=get_params(dense=True),
-                                      n_jobs=op.n_jobs)
+    # classifier = FunnellingMultimodal(we_path=op.we_path,
+    #                                   config=config,
+    #                                   first_tier_learner=get_learner(calibrate=True),
+    #                                   meta_learner=get_learner(calibrate=False, kernel='rbf'),
+    #                                   first_tier_parameters=None,  # TODO get_params(dense=False),--> first_tier should not be optimized - or not?
+    #                                   meta_parameters=get_params(dense=True),
+    #                                   n_jobs=op.n_jobs)
+
+    tfidfvectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
+    post_prob = PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None)
+    wce_proj = WordClassEmbedder()
+    doc_embedder = DocEmbedderList(post_prob, wce_proj)
+    # doc_embedder = DocEmbedderList(post_prob)
+    meta = MetaClassifier(meta_learner=SVC(), meta_parameters=get_params(dense=True))
+    classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)
+
     print('# Fitting ...')
     classifier.fit(lXtr, lytr)
 
@@ -145,8 +164,8 @@ if __name__ == '__main__':
         macrof1, microf1, macrok, microk = l_eval[lang]
         metrics.append([macrof1, microf1, macrok, microk])
        print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
-        results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
-                        (config['max_label_space'], classifier.best_components),
-                        config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
-                        lang, macrof1, microf1, macrok, microk, '')
+        # results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
+        #                 (config['max_label_space'], classifier.best_components),
+        #                 config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
+        #                 lang, macrof1, microf1, macrok, microk, '')
     print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
diff --git a/src/transformers/StandardizeTransformer.py b/src/transformers/StandardizeTransformer.py
index e776db7..d0902b5 100644
--- a/src/transformers/StandardizeTransformer.py
+++ b/src/transformers/StandardizeTransformer.py
@@ -15,9 +15,9 @@ class StandardizeTransformer:
         print('done\n')
         return self
 
-    def predict(self, X):
-        if not self.yetfit: 'transform called before fit'
+    def transform(self, X):
+        assert self.yetfit, 'transform called before fit'
         return (X - self.mean) / self.std
 
-    def fit_predict(self, X):
-        return self.fit(X).predict(X)
+    def fit_transform(self, X):
+        return self.fit(X).transform(X)
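
For reference, a minimal usage sketch of the funnelling pipeline that this patch assembles in main_multimodal_cls.py from the new learning/transformers.py components. It assumes the repository's src/ directory is importable; lXtr, lytr and lXte are placeholders for the language-keyed dictionaries the script works with ({lang: list of raw documents} for X, {lang: binary document-by-label matrix} for y), and the SVC settings and C grid simply echo those appearing in the script. Treat this as an illustration, not part of the patch.

from sklearn.svm import SVC
from learning.transformers import (Funnelling, TfidfVectorizerMultilingual,
                                   PosteriorProbabilitiesEmbedder, WordClassEmbedder,
                                   DocEmbedderList, MetaClassifier)

# lXtr, lytr, lXte are assumed placeholders: dicts keyed by language code,
# e.g. lXtr = {'en': [...], 'it': [...]}, lytr = {'en': ndarray(n_docs x n_classes), ...}

# per-language tf-idf vectorization of the raw documents
vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)

# first tier: calibrated per-language SVMs yield posterior probabilities over the labels,
# horizontally stacked with the word-class-embedding (WCE) document representation
post_prob = PosteriorProbabilitiesEmbedder(
    first_tier_learner=SVC(kernel='linear', probability=True, cache_size=1000, gamma='auto'),
    first_tier_parameters=None)
wce_proj = WordClassEmbedder(max_label_space=300)
doc_embedder = DocEmbedderList(post_prob, wce_proj)

# meta-classifier: a single model fit on the vertically stacked, standardized Z-space
meta = MetaClassifier(meta_learner=SVC(kernel='rbf', gamma='auto'),
                      meta_parameters=[{'C': [1e3, 1e2, 1e1, 1, 1e-1]}])

classifier = Funnelling(vectorizer=vectorizer, first_tier=doc_embedder, meta=meta)
classifier.fit(lXtr, lytr)          # lXtr/lytr: training documents/labels per language
ly_pred = classifier.predict(lXte)  # returns {lang: predicted label matrix}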