import os
import pickle
from abc import ABC, abstractmethod

import numpy as np
import torch
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from torchtext.vocab import Vectors

from data.supervised import get_supervised_embeddings
from util.decompositions import *


class PretrainedEmbeddings(ABC):

    def __init__(self):
        super().__init__()

    @abstractmethod
    def vocabulary(self): pass

    @abstractmethod
    def dim(self): pass

    @classmethod
    def reindex(cls, words, word2index):
        # map each requested word to (row in the request list, row in the pretrained matrix)
        source_idx, target_idx = [], []
        for i, word in enumerate(words):
            if word not in word2index:
                continue
            j = word2index[word]
            source_idx.append(i)
            target_idx.append(j)
        source_idx = np.asarray(source_idx)
        target_idx = np.asarray(target_idx)
        return source_idx, target_idx


class WordEmbeddings:

    def __init__(self, lang, we, worddim):
        self.lang = lang
        self.we = we
        self.worddim = worddim
        self.dimword = {v: k for k, v in self.worddim.items()}

    @classmethod
    def load(cls, basedir, lang, word_preprocessor=None, dopickle=True):
        filename = 'wiki.multi.{}.vec'.format(lang)
        we_path = os.path.join(basedir, filename)
        if dopickle and os.path.exists(we_path + '.pkl'):
            print('loading pkl in {}'.format(we_path + '.pkl'))
            (worddim, we) = pickle.load(open(we_path + '.pkl', 'rb'))
        else:
            word_registry = set()
            lines = open(we_path).readlines()
            nwords, dims = [int(x) for x in lines[0].split()]
            print('reading we of {} dimensions'.format(dims))
            we = np.zeros((nwords, dims), dtype=float)
            worddim = {}
            index = 0
            for i, line in enumerate(lines[1:]):
                if (i + 1) % 100 == 0:
                    print('\r{}/{}'.format(i + 1, len(lines)), end='')
                word, *vals = line.split()
                # word_preprocessor is expected to return a (possibly empty) list of processed forms
                wordp = word_preprocessor(word) if word_preprocessor is not None else word
                if wordp:
                    wordp = wordp[0]
                    if wordp in word_registry:
                        print('warning: word <{}> generates a duplicate <{}> after preprocessing'.format(word, wordp))
                    elif len(vals) == dims:
                        word_registry.add(wordp)  # register the processed form so later duplicates are detected
                        worddim[wordp] = index
                        we[index, :] = np.array(vals).astype(float)
                        index += 1
                # else:
                #     print('warning: word <{}> generates an empty string after preprocessing'.format(word))
            we = we[:index]
            print('load {} words'.format(index))
            if dopickle:
                print('saving...')
                pickle.dump((worddim, we), open(we_path + '.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
        return WordEmbeddings(lang, we, worddim)

    def vocabulary(self):
        return set(self.worddim.keys())

    def __getitem__(self, key):
        return self.we[self.worddim[key]]

    def dim(self):
        return self.we.shape[1]

    def __contains__(self, key):
        return key in self.worddim

    def most_similar(self, word_vect, k):
        if word_vect.ndim == 1:
            word_vect = word_vect.reshape(1, -1)
        assert word_vect.shape[1] == self.dim(), 'inconsistent dimensions'
        sim = np.dot(word_vect, self.we.T)
        order = np.argsort(-1 * sim, axis=1)[:, :k]
        similar_words = [[self.dimword[order[vi, ki]] for ki in range(k)] for vi in range(word_vect.shape[0])]
        sim_scores = np.take_along_axis(sim, order, axis=1)  # scores aligned with the top-k words
        return similar_words, sim_scores

    def get_vectors(self, wordlist):
        indexes = np.array([self.worddim[w] for w in wordlist])
        return self.we[indexes]

    def restrict(self, vocabulary):
        # vocabulary is a set of terms to be kept
        active_vocabulary = sorted([w for w in vocabulary if w in self.worddim])
        lost = len(vocabulary) - len(active_vocabulary)
        if lost > 0:  # some terms are missing and will be treated as UNK
            print('warning: missing {} terms for lang {}'.format(lost, self.lang))
        self.we = self.get_vectors(active_vocabulary)
        assert self.we.shape[0] == len(active_vocabulary)
        self.dimword = {i: w for i, w in enumerate(active_vocabulary)}
        self.worddim = {w: i for i, w in enumerate(active_vocabulary)}
        return self
    @classmethod
    def load_poly(cls, basedir, langs, lang_vocabularies=None, word_preprocessor=None):
        if lang_vocabularies is None:
            return cls.merge([cls.load(basedir, lang, word_preprocessor) for lang in langs])
        else:
            # assert all([l in lang_vocabularies for l in langs]), 'missing vocabulary for some languages'
            return cls.merge([cls.load(basedir, lang, word_preprocessor).restrict(lang_vocabularies[lang])
                              for lang in langs])

    @classmethod
    def merge(cls, we_list):
        assert all([isinstance(we, WordEmbeddings) for we in we_list]), \
            'instances of {} expected'.format(WordEmbeddings.__name__)
        polywe = []
        worddim = {}
        offset = 0
        for we in we_list:
            polywe.append(we.we)
            worddim.update({'{}::{}'.format(we.lang, w): d + offset for w, d in we.worddim.items()})
            offset = len(worddim)
        polywe = np.vstack(polywe)
        return WordEmbeddings(lang='poly', we=polywe, worddim=worddim)


class FastTextWikiNews(Vectors):

    # MUSE vectors cannot be auto-downloaded; the url is only consulted by torchtext if the local file is missing
    url_base = 'Cant auto-download MUSE embeddings'
    path = '/storage/andrea/FUNNELING/embeddings/wiki.multi.{}.vec'
    _name = '/embeddings/wiki.multi.{}.vec'

    def __init__(self, cache, language="en", **kwargs):
        url = self.url_base.format(language)
        name = cache + self._name.format(language)
        super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)


class EmbeddingsAligned(Vectors):

    def __init__(self, type, path, lang, voc):
        # todo - rewrite as relative path
        self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec'
        self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT'
        self.path = path + self.name.format(lang)
        assert os.path.exists(self.path), f'pre-trained vectors not found in {self.path}'
        super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path)
        self.vectors = self.extract(voc)

    def vocabulary(self):
        return set(self.stoi.keys())

    def extract(self, words):
        # build a (len(words) x dim) matrix, leaving all-zero rows for out-of-vocabulary words
        source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi)
        extraction = torch.zeros((len(words), self.dim))
        extraction[source_idx] = self.vectors[target_idx]
        return extraction

    def reduce(self, dim):
        pca = PCA(n_components=dim)
        self.vectors = pca.fit_transform(self.vectors)
        return


class FastTextMUSE(PretrainedEmbeddings):

    def __init__(self, path, lang, limit=None):
        super().__init__()
        print(f'Loading fastText pretrained vectors from {path}')
        assert os.path.exists(path), f'pre-trained vectors not found in {path}'
        self.embed = FastTextWikiNews(path, lang, max_vectors=limit)

    def vocabulary(self):
        return set(self.embed.stoi.keys())

    def dim(self):
        return self.embed.dim

    def extract(self, words):
        source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
        extraction = torch.zeros((len(words), self.dim()))
        extraction[source_idx] = self.embed.vectors[target_idx]
        return extraction


class StorageEmbeddings:

    def __init__(self, path):
        self.path = path
        self.lang_U = dict()
        self.lang_S = dict()

    def _add_embeddings_unsupervised(self, type, docs, vocs, max_label_space=300):
        for lang in docs.keys():
            print(f'# [unsupervised-matrix {type}] for {lang}')
            # sort the vocabulary by index so that row i of U corresponds to term i
            voc = np.asarray(list(zip(*sorted(vocs[lang].items(), key=lambda x: x[1])))[0])
            self.lang_U[lang] = EmbeddingsAligned(type, self.path, lang, voc).vectors
            print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n')

        nC = self.lang_U[lang].shape[1]
        if max_label_space == 0:
            print(f'Computing optimal number of PCA components along matrices U')
            optimal_n = get_optimal_dim(self.lang_U, 'U')
            self.lang_U = run_pca(optimal_n, self.lang_U)
        elif max_label_space < nC:
            self.lang_U = run_pca(max_label_space, self.lang_U)
        return
    def _add_embeddings_supervised(self, docs, labels, reduction, max_label_space, voc):
        for lang in docs.keys():
            # compute supervised matrices S - then apply PCA
            print(f'# [supervised-matrix] for {lang}')
            self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space,
                                                          voc[lang], lang)
            nC = self.lang_S[lang].shape[1]
            print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')

        if max_label_space == 0:
            print(f'Computing optimal number of PCA components along matrices S')
            optimal_n = get_optimal_dim(self.lang_S, 'S')
            print(f'Applying PCA(n_components={optimal_n})')
            self.lang_S = run_pca(optimal_n, self.lang_S)
        elif max_label_space == -1:
            print(f'Computing PCA on vertically stacked WCE embeddings')
            languages = self.lang_S.keys()
            _temp_stack = np.vstack([self.lang_S[lang] for lang in languages])
            stacked_pca = PCA(n_components=_temp_stack.shape[1])
            stacked_pca.fit(_temp_stack)
            best_n = None
            _r = stacked_pca.explained_variance_ratio_
            _r = np.cumsum(_r)
            plt.plot(_r, label='Stacked Supervised')
            # keep the largest number of components that still adds explained variance
            for i in range(len(_r) - 1, 1, -1):
                delta = _r[i] - _r[i - 1]
                if delta > 0:
                    best_n = i
                    break
            plt.show()
            stacked_pca = PCA(n_components=best_n)
            stacked_pca.fit(_temp_stack)
            print(f'Applying PCA(n_components={best_n})')
            for lang in languages:
                self.lang_S[lang] = stacked_pca.transform(self.lang_S[lang])
        elif max_label_space < nC:
            self.lang_S = run_pca(max_label_space, self.lang_S)
        return

    def _concatenate_embeddings(self, docs):
        _r = dict()
        for lang in self.lang_U.keys():
            _r[lang] = np.hstack((docs[lang].dot(self.lang_U[lang]), docs[lang].dot(self.lang_S[lang])))
        return _r

    def fit(self, config, docs, vocs, labels):
        if config['unsupervised']:
            self._add_embeddings_unsupervised(config['we_type'], docs, vocs, config['dim_reduction_unsupervised'])
        if config['supervised']:
            self._add_embeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs)
        return self

    def predict(self, config, docs):
        if config['supervised'] and config['unsupervised']:
            return self._concatenate_embeddings(docs)
        elif config['supervised']:
            _r = dict()
            for lang in docs.keys():
                _r[lang] = docs[lang].dot(self.lang_S[lang])
        else:
            _r = dict()
            for lang in docs.keys():
                _r[lang] = docs[lang].dot(self.lang_U[lang])
        return _r
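

# ---------------------------------------------------------------------------
# Hedged usage sketch (illustration only, not part of the original module).
# It assumes hypothetical per-language dictionaries `docs` (document-term
# matrices), `vocs` (term->index vocabularies) and `labels` (label matrices),
# an embeddings folder at the placeholder path '/path/to/embeddings', and
# illustrative config values; only the config keys are taken from fit/predict
# above.
#
# config = {'unsupervised': True, 'supervised': True, 'we_type': 'MUSE',
#           'dim_reduction_unsupervised': 300, 'reduction': 'PCA',
#           'max_label_space': 300}
# storage = StorageEmbeddings('/path/to/embeddings').fit(config, docs, vocs, labels)
# projections = storage.predict(config, docs)   # {lang: hstack(X.dot(U), X.dot(S))}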