From c14e8226b101cad62853879cabe96b0e0138bb35 Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Mon, 20 Jan 2020 12:32:47 +0100 Subject: [PATCH] refactor: added MUSE to learning/transformers.py --- src/embeddings/embeddings.py | 6 +- src/embeddings/pretrained.py | 103 ++++++++++++++++++++++++++++++++ src/learning/transformers.py | 50 +++++++++++++--- src/main_multimodal_cls.py | 112 +++++++++++------------------------ 4 files changed, 182 insertions(+), 89 deletions(-) create mode 100644 src/embeddings/pretrained.py diff --git a/src/embeddings/embeddings.py b/src/embeddings/embeddings.py index 0ca51fc..49ea7a0 100644 --- a/src/embeddings/embeddings.py +++ b/src/embeddings/embeddings.py @@ -148,7 +148,7 @@ class FastTextWikiNews(Vectors): url_base = 'Cant auto-download MUSE embeddings' path = '/storage/andrea/FUNNELING/embeddings/wiki.multi.{}.vec' - _name = '/embeddings/wiki.multi.{}.vec' + _name = '/wiki.multi.{}.vec' def __init__(self, cache, language="en", **kwargs): url = self.url_base.format(language) @@ -156,6 +156,7 @@ class FastTextWikiNews(Vectors): super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs) + class EmbeddingsAligned(Vectors): def __init__(self, type, path, lang, voc): @@ -186,10 +187,11 @@ class FastTextMUSE(PretrainedEmbeddings): def __init__(self, path, lang, limit=None): super().__init__() - print(f'Loading fastText pretrained vectors from {path}') + print(f'Loading fastText pretrained vectors for language {lang} from {path}') assert os.path.exists(path), print(f'pre-trained vectors not found in {path}') self.embed = FastTextWikiNews(path, lang, max_vectors=limit) + def vocabulary(self): return set(self.embed.stoi.keys()) diff --git a/src/embeddings/pretrained.py b/src/embeddings/pretrained.py new file mode 100644 index 0000000..def5be0 --- /dev/null +++ b/src/embeddings/pretrained.py @@ -0,0 +1,103 @@ +from abc import ABC, abstractmethod +import torch, torchtext +import gensim +import os +import numpy as np + + +class KeyedVectors: + + def __init__(self, word2index, weights): + assert len(word2index)==weights.shape[0], 'wrong number of dimensions' + index2word = {i:w for w,i in word2index.items()} + assert len([i for i in range(len(index2word)) if i not in index2word])==0, 'gaps in indexing not allowed' + self.word2index = word2index + self.index2word = index2word + self.weights = weights + + def extract(self, words): + dim = self.weights.shape[1] + v_size = len(words) + + source_idx, target_idx = [], [] + for i,word in enumerate(words): + if word not in self.word2index: continue + j = self.word2index[word] + source_idx.append(i) + target_idx.append(j) + + extraction = np.zeros((v_size, dim)) + extraction[np.asarray(source_idx)] = self.weights[np.asarray(target_idx)] + + return extraction + + + +class PretrainedEmbeddings(ABC): + + def __init__(self): + super().__init__() + + @abstractmethod + def vocabulary(self): pass + + @abstractmethod + def dim(self): pass + + @classmethod + def reindex(cls, words, word2index): + source_idx, target_idx = [], [] + for i, word in enumerate(words): + if word not in word2index: continue + j = word2index[word] + source_idx.append(i) + target_idx.append(j) + source_idx = np.asarray(source_idx) + target_idx = np.asarray(target_idx) + return source_idx, target_idx + + +class GloVe(PretrainedEmbeddings): + + def __init__(self, setname='840B', path='./vectors_cache', max_vectors=None): + super().__init__() + print(f'Loading GloVe pretrained vectors from torchtext') + self.embed = torchtext.vocab.GloVe(setname, 
cache=path, max_vectors=max_vectors) + print('Done') + + def vocabulary(self): + return set(self.embed.stoi.keys()) + + def dim(self): + return self.embed.dim + + def extract(self, words): + source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi) + extraction = torch.zeros((len(words), self.dim())) + extraction[source_idx] = self.embed.vectors[target_idx] + return extraction + + +class Word2Vec(PretrainedEmbeddings): + + def __init__(self, path, limit=None): + super().__init__() + print(f'Loading word2vec pretrained vectors from {path}') + assert os.path.exists(path), print(f'pre-trained keyed vectors not found in {path}') + self.embed = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True, limit=limit) + self.word2index={w:i for i,w in enumerate(self.embed.index2word)} + print('Done') + + def vocabulary(self): + return set(self.word2index.keys()) + + def dim(self): + return self.embed.vector_size + + def extract(self, words): + source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.word2index) + extraction = np.zeros((len(words), self.dim())) + extraction[source_idx] = self.embed.vectors[target_idx] + extraction = torch.from_numpy(extraction).float() + return extraction + diff --git a/src/learning/transformers.py b/src/learning/transformers.py index cf21585..72f19f0 100644 --- a/src/learning/transformers.py +++ b/src/learning/transformers.py @@ -1,6 +1,7 @@ import numpy as np from sklearn.feature_extraction.text import TfidfVectorizer #from data.text_preprocessor import NLTKStemTokenizer +from embeddings.embeddings import FastTextMUSE from embeddings.supervised import supervised_embeddings_tfidf, zscores from learning.learners import NaivePolylingualClassifier, MonolingualClassifier, _joblib_transform_multiling import time @@ -53,7 +54,7 @@ class PosteriorProbabilitiesEmbedder: self.fist_tier_parameters, n_jobs=n_jobs) - def fit(self, lX, lY): + def fit(self, lX, lY, lV=None): print('fitting the projectors... 
{}'.format(lX.keys())) self.doc_projector.fit(lX, lY) return self @@ -63,20 +64,45 @@ class PosteriorProbabilitiesEmbedder: lZ = self.doc_projector.predict_proba(lX) return lZ - def fit_transform(self, lX, ly=None): + def fit_transform(self, lX, ly=None, lV=None): return self.fit(lX, ly).transform(lX) def best_params(self): return self.doc_projector.best_params() +class MuseEmbedder: + + def __init__(self, path, n_jobs=-1): + self.path=path + self.n_jobs = n_jobs + + def fit(self, lX, ly, lV): + self.langs = sorted(lX.keys()) + MUSE = Parallel(n_jobs=self.n_jobs)( + delayed(FastTextMUSE)(self.path, lang) for lang in self.langs + ) + self.MUSE = {l:MUSE[i].extract(lV[l]).numpy() for i,l in enumerate(self.langs)} + return self + + def transform(self, lX): + MUSE = self.MUSE + XdotMUSE = Parallel(n_jobs=self.n_jobs)( + delayed(XdotM)(lX[lang], MUSE[lang]) for lang in self.langs + ) + return {l: XdotMUSE[i] for i, l in enumerate(self.langs)} + + def fit_transform(self, lX, ly, lV): + return self.fit(lX, ly, lV).transform(lX) + + class WordClassEmbedder: def __init__(self, n_jobs=-1, max_label_space=300): self.n_jobs = n_jobs self.max_label_space=max_label_space - def fit(self, lX, ly): + def fit(self, lX, ly, lV=None): self.langs = sorted(lX.keys()) WCE = Parallel(n_jobs=self.n_jobs)( delayed(word_class_embedding_matrix)(lX[lang], ly[lang], self.max_label_space) for lang in self.langs @@ -91,7 +117,7 @@ class WordClassEmbedder: ) return {l: XdotWCE[i] for i, l in enumerate(self.langs)} - def fit_transform(self, lX, ly): + def fit_transform(self, lX, ly, lV=None): return self.fit(lX, ly).transform(lX) @@ -119,11 +145,13 @@ def XdotM(X,M): class DocEmbedderList: def __init__(self, *embedder_list): + if len(embedder_list)==0: embedder_list=[] self.embedders = embedder_list - def fit(self, lX, ly): + + def fit(self, lX, ly, lV): for transformer in self.embedders: - transformer.fit(lX,ly) + transformer.fit(lX,ly,lV) return self def transform(self, lX): @@ -145,12 +173,15 @@ class DocEmbedderList: return {l:hstacker(lZparts[l]) for l in langs} - def fit_transform(self, lX, ly): - return self.fit(lX, ly).transform(lX) + def fit_transform(self, lX, ly, lV): + return self.fit(lX, ly, lV).transform(lX) def best_params(self): return {'todo'} + def append(self, embedder): + self.embedders.append(embedder) + # ------------------------------------------------------------------ # Meta-Classifier # ------------------------------------------------------------------ @@ -200,7 +231,8 @@ class Funnelling: def fit(self, lX, ly): lX = self.vectorizer.fit_transform(lX, ly) - lZ = self.first_tier.fit_transform(lX, ly) + lV = self.vectorizer.vocabulary() + lZ = self.first_tier.fit_transform(lX, ly, lV) self.meta.fit(lZ, ly) def predict(self, lX, ly=None): diff --git a/src/main_multimodal_cls.py b/src/main_multimodal_cls.py index 71de089..f65a442 100644 --- a/src/main_multimodal_cls.py +++ b/src/main_multimodal_cls.py @@ -3,7 +3,7 @@ from dataset_builder import MultilingualDataset # from learning.learners import * from learning.learners import FunnellingMultimodal from learning.transformers import Funnelling, PosteriorProbabilitiesEmbedder, MetaClassifier, \ - TfidfVectorizerMultilingual, DocEmbedderList, WordClassEmbedder + TfidfVectorizerMultilingual, DocEmbedderList, WordClassEmbedder, MuseEmbedder from util.evaluation import * from optparse import OptionParser from util.file import exists @@ -21,14 +21,17 @@ parser.add_option("-d", "--dataset", dest="dataset", parser.add_option("-o", "--output", dest="output", 
help="Result file", type=str, default='./results/results.csv') -parser.add_option("-e", "--mode-embed", dest="mode_embed", - help="Set the embedding to be used [none, unsupervised, supervised, both]", type=str, default='none') +parser.add_option("-P", "--probs", dest="probs", action='store_true', + help="Add posterior probabilities to the document embedding representation", default=False) + +parser.add_option("-S", "--supervised", dest="supervised", action='store_true', + help="Add supervised (Word-Class Embeddings) to the document embedding representation", default=False) + +parser.add_option("-U", "--pretrained", dest="pretrained", action='store_true', + help="Add pretrained MUSE embeddings to the document embedding representation", default=False) parser.add_option("-w", "--we-path", dest="we_path", - help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/') - -parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str, - default='MUSE') + help="Path to the MUSE polylingual word embeddings", default='../embeddings') parser.add_option("-s", "--set_c", dest="set_c",type=float, help="Set the C parameter", default=1) @@ -40,16 +43,12 @@ parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int, help="Number of parallel jobs (default is -1, all)", default=-1) parser.add_option("-p", "--pca", dest="max_labels_S", type=int, - help="If smaller than number of target classes, PCA will be applied to supervised matrix. " - "If set to 0 it will automatically search for the best number of components. " - "If set to -1 it will apply PCA to the vstacked supervised matrix (PCA dim set to 50 atm)", + help="If smaller than number of target classes, PCA will be applied to supervised matrix. ", default=300) -parser.add_option("-u", "--upca", dest="max_labels_U", type=int, - help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix." - " If set to 0 it will automatically search for the best number of components", default=300) - -parser.add_option("-l", dest="lang", type=str) +# parser.add_option("-u", "--upca", dest="max_labels_U", type=int, +# help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix." 
+# " If set to 0 it will automatically search for the best number of components", default=300) # parser.add_option("-a", dest="post_pca", # help="If set to True, will apply PCA to the z-space (posterior probabilities stacked along with " @@ -57,13 +56,7 @@ parser.add_option("-l", dest="lang", type=str) def get_learner(calibrate=False, kernel='linear'): - return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, - - - # class_weight='balanced', - - - gamma='auto') + return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, gamma='auto') def get_params(dense=False): @@ -89,69 +82,32 @@ if __name__ == '__main__': data = MultilingualDataset.load(op.dataset) data.show_dimensions() - # data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10))) - # data.set_view(languages=[op.lang]) - # data.set_view(categories=list(range(10))) lXtr, lytr = data.training() lXte, lyte = data.test() - if op.set_c != -1: - meta_parameters = None - else: - meta_parameters = [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}] + meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}] - # Embeddings and WCE config - _available_mode = ['none', 'unsupervised', 'supervised', 'both'] - _available_type = ['MUSE', 'FastText'] - assert op.mode_embed in _available_mode, f'{op.mode_embed} not in {_available_mode}' - assert op.we_type in _available_type, f'{op.we_type} not in {_available_type}' + result_id = f'{dataset_file}_Prob{op.probs}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}' - if op.mode_embed == 'none': - config = {'unsupervised': False, - 'supervised': False, - 'we_type': None} - _config_id = 'None' - elif op.mode_embed == 'unsupervised': - config = {'unsupervised': True, - 'supervised': False, - 'we_type': op.we_type} - _config_id = 'M' - elif op.mode_embed == 'supervised': - config = {'unsupervised': False, - 'supervised': True, - 'we_type': None} - _config_id = 'F' - elif op.mode_embed == 'both': - config = {'unsupervised': True, - 'supervised': True, - 'we_type': op.we_type} - _config_id = 'M+F' - - config['reduction'] = 'PCA' - config['max_label_space'] = op.max_labels_S - config['dim_reduction_unsupervised'] = op.max_labels_U - # config['post_pca'] = op.post_pca - # config['plot_covariance_matrices'] = True - - result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '') - - print(f'### PolyEmbedd_andrea_{_config_id}\n') - # classifier = FunnellingMultimodal(we_path=op.we_path, - # config=config, - # first_tier_learner=get_learner(calibrate=True), - # meta_learner=get_learner(calibrate=False, kernel='rbf'), - # first_tier_parameters=None, # TODO get_params(dense=False),--> first_tier should not be optimized - or not? 
- # meta_parameters=get_params(dense=True), - # n_jobs=op.n_jobs) + print(f'{result_id}') + # text preprocessing tfidfvectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) - post_prob = PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None) - wce_proj = WordClassEmbedder() - doc_embedder = DocEmbedderList(post_prob, wce_proj) - # doc_embedder = DocEmbedderList(post_prob) - meta = MetaClassifier(meta_learner=SVC(), meta_parameters=get_params(dense=True)) - classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta) + # document embedding modules + doc_embedder = DocEmbedderList() + if op.probs: + doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None)) + if op.supervised: + doc_embedder.append(WordClassEmbedder(max_label_space=op.max_labels_S)) + if op.pretrained: + doc_embedder.append(MuseEmbedder(op.we_path)) + + # metaclassifier + meta = MetaClassifier(meta_learner=SVC(), meta_parameters=get_params(dense=True)) + + # ensembling the modules + classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta) print('# Fitting ...') classifier.fit(lXtr, lytr) @@ -163,7 +119,7 @@ if __name__ == '__main__': for lang in lXte.keys(): macrof1, microf1, macrok, microk = l_eval[lang] metrics.append([macrof1, microf1, macrok, microk]) - print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1)) + print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') # results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'], # (config['max_label_space'], classifier.best_components), # config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
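
For reference, a minimal self-contained sketch (Python, not part of the patch) of the mechanism MuseEmbedder introduces: Funnelling.fit now passes the per-language tf-idf vocabularies lV to the first tier; for each language, FastTextMUSE extracts the rows of the aligned MUSE matrix that match that vocabulary (the reindex pattern defined in embeddings/pretrained.py), and transform projects the document-term matrix onto them via XdotM. Toy random vectors stand in for the real wiki.multi.<lang>.vec file, and XdotM is assumed here to reduce to a sparse-dense dot product (its body is not shown in this diff).

import numpy as np
from scipy.sparse import csr_matrix

def reindex(words, word2index):
    # Same logic as PretrainedEmbeddings.reindex in the patch: map positions of the
    # tf-idf vocabulary (source) onto positions of the pretrained vocabulary (target),
    # silently skipping out-of-vocabulary words.
    source_idx, target_idx = [], []
    for i, word in enumerate(words):
        if word not in word2index:
            continue
        source_idx.append(i)
        target_idx.append(word2index[word])
    return np.asarray(source_idx, dtype=int), np.asarray(target_idx, dtype=int)

# Toy "pretrained" space standing in for the aligned MUSE vectors of one language.
dim = 300
pretrained_vocab = ['the', 'dog', 'cat', 'house']
word2index = {w: i for i, w in enumerate(pretrained_vocab)}
weights = np.random.randn(len(pretrained_vocab), dim)

# Toy tf-idf vocabulary for that language (lV[lang] in the patch); 'zebra' is OOV.
V = ['cat', 'zebra', 'dog']
src, tgt = reindex(V, word2index)
E = np.zeros((len(V), dim))   # |V| x dim embedding matrix, what MuseEmbedder.fit stores
E[src] = weights[tgt]         # OOV rows stay at zero

# Toy document-term matrix for the same language (lX[lang]): 2 docs x |V|.
X = csr_matrix(np.array([[0.7, 0.0, 0.3],
                         [0.0, 0.9, 0.1]]))

# Assumed equivalent of XdotM(lX[lang], MUSE[lang]): project documents into MUSE space.
Z = X @ E
print(Z.shape)                # (2, 300): one dense MUSE-space vector per document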