import os
import pickle
from abc import ABC, abstractmethod

import numpy as np
import torch
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from torchtext.vocab import Vectors

from data.supervised import get_supervised_embeddings
from util.decompositions import *  # provides get_optimal_dim and run_pca (used below)

class PretrainedEmbeddings(ABC):
    """Common interface for pretrained word-embedding wrappers."""

    def __init__(self):
        super().__init__()

    @abstractmethod
    def vocabulary(self): pass

    @abstractmethod
    def dim(self): pass

    @classmethod
    def reindex(cls, words, word2index):
        """Aligns a word list with a pretrained vocabulary.

        Returns two index arrays: source_idx holds the positions (in `words`) of the
        words covered by `word2index`, and target_idx holds the corresponding rows in
        the pretrained embedding matrix. Words not in `word2index` are skipped.
        """
        source_idx, target_idx = [], []
        for i, word in enumerate(words):
            if word not in word2index:
                continue
            j = word2index[word]
            source_idx.append(i)
            target_idx.append(j)
        source_idx = np.asarray(source_idx)
        target_idx = np.asarray(target_idx)
        return source_idx, target_idx

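# Illustrative sketch (not part of the module): how reindex is meant to be used to copy
# pretrained rows into a dataset-aligned matrix. `dataset_words`, `stoi` and `pretrained`
# are hypothetical names standing for a dataset vocabulary, a pretrained word->row mapping
# and the pretrained matrix.
#
#   source_idx, target_idx = PretrainedEmbeddings.reindex(dataset_words, stoi)
#   aligned = np.zeros((len(dataset_words), pretrained.shape[1]))
#   aligned[source_idx] = pretrained[target_idx]   # out-of-vocabulary words keep zero vectors
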
class WordEmbeddings:
    """In-memory monolingual (or merged multilingual) word-embedding matrix."""

    def __init__(self, lang, we, worddim):
        self.lang = lang
        self.we = we            # embedding matrix, one row per word
        self.worddim = worddim  # word -> row index
        self.dimword = {v: k for k, v in self.worddim.items()}  # row index -> word

    @classmethod
    def load(cls, basedir, lang, word_preprocessor=None, dopickle=True):
        filename = 'wiki.multi.{}.vec'.format(lang)
        we_path = os.path.join(basedir, filename)

        if dopickle and os.path.exists(we_path + '.pkl'):
            print('loading pkl in {}'.format(we_path + '.pkl'))
            (worddim, we) = pickle.load(open(we_path + '.pkl', 'rb'))
        else:
            word_registry = set()
            lines = open(we_path).readlines()
            nwords, dims = [int(x) for x in lines[0].split()]
            print('reading we of {} dimensions'.format(dims))
            we = np.zeros((nwords, dims), dtype=float)
            worddim = {}
            index = 0
            for i, line in enumerate(lines[1:]):
                if (i + 1) % 100 == 0:
                    print('\r{}/{}'.format(i + 1, len(lines)), end='')
                word, *vals = line.split()
                wordp = word_preprocessor(word) if word_preprocessor is not None else word
                if wordp:
                    wordp = wordp[0]
                    if wordp in word_registry:
                        print('warning: word <{}> generates a duplicate <{}> after preprocessing'.format(word, wordp))
                    elif len(vals) == dims:
                        worddim[wordp] = index
                        we[index, :] = np.array(vals).astype(float)
                        index += 1
                # else:
                #     print('warning: word <{}> generates an empty string after preprocessing'.format(word))
            we = we[:index]
            print('load {} words'.format(index))
            if dopickle:
                print('saving...')
                pickle.dump((worddim, we), open(we_path + '.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)

        return WordEmbeddings(lang, we, worddim)

    def vocabulary(self):
        return set(self.worddim.keys())

    def __getitem__(self, key):
        return self.we[self.worddim[key]]

    def dim(self):
        return self.we.shape[1]

    def __contains__(self, key):
        return key in self.worddim

    def most_similar(self, word_vect, k):
        """Returns, for each query vector, the k words with the highest dot-product similarity."""
        if word_vect.ndim == 1:
            word_vect = word_vect.reshape(1, -1)
        assert word_vect.shape[1] == self.dim(), 'inconsistent dimensions'

        sim = np.dot(word_vect, self.we.T)
        order = np.argsort(-1 * sim, axis=1)[:, :k]

        similar_words = [[self.dimword[order[vi, ki]] for ki in range(k)] for vi in range(word_vect.shape[0])]
        sim_scores = np.take_along_axis(sim, order, axis=1)
        return similar_words, sim_scores

    def get_vectors(self, wordlist):
        indexes = np.array([self.worddim[w] for w in wordlist])
        return self.we[indexes]

    def restrict(self, vocabulary):
        # vocabulary is the set of terms to be kept; terms without a pretrained vector are dropped
        active_vocabulary = sorted([w for w in vocabulary if w in self.worddim])
        lost = len(vocabulary) - len(active_vocabulary)
        if lost > 0:
            print('warning: missing {} terms for lang {}'.format(lost, self.lang))
        self.we = self.get_vectors(active_vocabulary)
        assert self.we.shape[0] == len(active_vocabulary)
        self.dimword = {i: w for i, w in enumerate(active_vocabulary)}
        self.worddim = {w: i for i, w in enumerate(active_vocabulary)}
        return self

    @classmethod
    def load_poly(cls, basedir, langs, lang_vocabularies=None, word_preprocessor=None):
        if lang_vocabularies is None:
            return cls.merge([cls.load(basedir, lang, word_preprocessor) for lang in langs])
        else:
            # assert all([l in lang_vocabularies for l in langs]), 'missing vocabulary for some languages'
            return cls.merge([cls.load(basedir, lang, word_preprocessor).restrict(lang_vocabularies[lang]) for lang in langs])

    @classmethod
    def merge(cls, we_list):
        """Stacks the embedding matrices of several languages into a single 'poly' space;
        words are re-keyed as '<lang>::<word>' to avoid collisions across languages."""
        assert all([isinstance(we, WordEmbeddings) for we in we_list]), \
            'instances of {} expected'.format(WordEmbeddings.__name__)

        polywe = []
        worddim = {}
        offset = 0
        for we in we_list:
            polywe.append(we.we)
            worddim.update({'{}::{}'.format(we.lang, w): d + offset for w, d in we.worddim.items()})
            offset = len(worddim)
        polywe = np.vstack(polywe)

        return WordEmbeddings(lang='poly', we=polywe, worddim=worddim)

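# Illustrative sketch (not prescriptive): loading MUSE-aligned vectors for two languages,
# restricting each to its task vocabulary and merging them into a single 'poly' space.
# `basedir`, `vocab_en` and `vocab_it` are hypothetical.
#
#   en = WordEmbeddings.load(basedir, 'en').restrict(vocab_en)
#   it = WordEmbeddings.load(basedir, 'it').restrict(vocab_it)
#   poly = WordEmbeddings.merge([en, it])
#   vec = poly['en::dog']                     # rows of the merged space are keyed '<lang>::<word>'
#   words, scores = poly.most_similar(vec, k=5)
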
class FastTextWikiNews(Vectors):
    """torchtext Vectors wrapper around locally stored, MUSE-aligned fastText vectors."""

    # MUSE embeddings cannot be downloaded automatically: the 'url' is a deliberate
    # placeholder so that a missing local file fails loudly instead of downloading.
    url_base = 'Cant auto-download MUSE embeddings'
    path = '/storage/andrea/FUNNELING/embeddings/wiki.multi.{}.vec'
    _name = '/embeddings/wiki.multi.{}.vec'

    def __init__(self, cache, language="en", **kwargs):
        url = self.url_base.format(language)
        name = cache + self._name.format(language)
        super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)

class EmbeddingsAligned(Vectors):
    """Loads aligned embeddings (MUSE or fastText-aligned) and extracts the rows for a given vocabulary."""

    def __init__(self, type, path, lang, voc):
        # todo - rewrite as relative path (cache locations are currently hardcoded)
        self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec'
        self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT'
        self.path = path + self.name.format(lang)
        assert os.path.exists(path), f'pre-trained vectors not found in {path}'
        super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path)
        self.vectors = self.extract(voc)

    def vocabulary(self):
        return set(self.stoi.keys())

    def extract(self, words):
        # rows for words covered by the pretrained vocabulary are copied over;
        # out-of-vocabulary words keep a zero vector
        source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi)
        extraction = torch.zeros((len(words), self.dim))
        extraction[source_idx] = self.vectors[target_idx]
        return extraction

    def reduce(self, dim):
        # project the extracted vectors down to `dim` components via PCA
        pca = PCA(n_components=dim)
        self.vectors = pca.fit_transform(self.vectors)
        return

class FastTextMUSE(PretrainedEmbeddings):
    """PretrainedEmbeddings implementation backed by MUSE-aligned fastText vectors."""

    def __init__(self, path, lang, limit=None):
        super().__init__()
        print(f'Loading fastText pretrained vectors from {path}')
        assert os.path.exists(path), f'pre-trained vectors not found in {path}'
        self.embed = FastTextWikiNews(path, lang, max_vectors=limit)

    def vocabulary(self):
        return set(self.embed.stoi.keys())

    def dim(self):
        return self.embed.dim

    def extract(self, words):
        # builds a (len(words) x dim) matrix aligned with `words`; rows for words not
        # covered by the pretrained vocabulary are left as zero vectors
        source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
        extraction = torch.zeros((len(words), self.dim()))
        extraction[source_idx] = self.embed.vectors[target_idx]
        return extraction

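# Illustrative sketch (not prescriptive): building an embedding matrix aligned with a
# dataset vocabulary, e.g. to initialise an nn.Embedding layer. `muse_dir` and
# `dataset_vocab` (an ordered list of words) are hypothetical.
#
#   muse = FastTextMUSE(muse_dir, 'en', limit=200_000)
#   weight = muse.extract(dataset_vocab)      # torch tensor, |V| x dim (300 for MUSE wiki.multi)
#   embedding = torch.nn.Embedding.from_pretrained(weight, freeze=False)
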
class StorageEmbeddings:
    """Builds and stores per-language unsupervised (U) and supervised (S) embedding matrices."""

    def __init__(self, path):
        self.path = path
        self.lang_U = dict()  # lang -> unsupervised (pretrained) matrix U
        self.lang_S = dict()  # lang -> supervised (WCE) matrix S

    def _add_embeddings_unsupervised(self, type, docs, vocs, max_label_space=300):
        for lang in docs.keys():
            print(f'# [unsupervised-matrix {type}] for {lang}')
            # order the vocabulary by feature index so that row i of U matches column i of the doc-term matrix
            voc = np.asarray(list(zip(*sorted(vocs[lang].items(), key=lambda x: x[1])))[0])
            self.lang_U[lang] = EmbeddingsAligned(type, self.path, lang, voc).vectors
            print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n')

        nC = self.lang_U[lang].shape[1]
        if max_label_space == 0:
            print(f'Computing optimal number of PCA components along matrices U')
            optimal_n = get_optimal_dim(self.lang_U, 'U')
            self.lang_U = run_pca(optimal_n, self.lang_U)
        elif max_label_space < nC:
            self.lang_U = run_pca(max_label_space, self.lang_U)

        return

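    # Minimal sketch (an assumption, for readability only) of what util.decompositions.run_pca
    # is expected to do, inferred from how it is called here: reduce each per-language matrix
    # to `n_components` dimensions and return a dict with the same keys.
    #
    #   def run_pca(n_components, lang_matrices):
    #       return {lang: PCA(n_components=n_components).fit_transform(X)
    #               for lang, X in lang_matrices.items()}
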
    def _add_embeddings_supervised(self, docs, labels, reduction, max_label_space, voc):
        # compute the supervised (word-class embedding) matrices S, then reduce them via PCA
        for lang in docs.keys():
            print(f'# [supervised-matrix] for {lang}')
            self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang],
                                                          reduction, max_label_space, voc[lang], lang)
            nC = self.lang_S[lang].shape[1]
            print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')

        if max_label_space == 0:
            print(f'Computing optimal number of PCA components along matrices S')
            optimal_n = get_optimal_dim(self.lang_S, 'S')
            print(f'Applying PCA(n_components={optimal_n})')
            self.lang_S = run_pca(optimal_n, self.lang_S)
        elif max_label_space == -1:
            # fit a single PCA on the vertically stacked WCE matrices of all languages
            print(f'Computing PCA on vertically stacked WCE embeddings')
            languages = self.lang_S.keys()
            _temp_stack = np.vstack([self.lang_S[lang] for lang in languages])
            stacked_pca = PCA(n_components=_temp_stack.shape[1])
            stacked_pca.fit(_temp_stack)
            best_n = None
            _r = stacked_pca.explained_variance_ratio_
            _r = np.cumsum(_r)
            plt.plot(_r, label='Stacked Supervised')
            # keep every component that still adds explained variance (last strictly increasing step)
            for i in range(len(_r) - 1, 1, -1):
                delta = _r[i] - _r[i - 1]
                if delta > 0:
                    best_n = i
                    break
            plt.show()
            stacked_pca = PCA(n_components=best_n)
            stacked_pca.fit(_temp_stack)
            print(f'Applying PCA(n_components={best_n})')
            for lang in languages:
                self.lang_S[lang] = stacked_pca.transform(self.lang_S[lang])
        elif max_label_space < nC:
            self.lang_S = run_pca(max_label_space, self.lang_S)

        return

    def _concatenate_embeddings(self, docs):
        # project the document-term matrices onto U and S and concatenate the two views
        _r = dict()
        for lang in self.lang_U.keys():
            _r[lang] = np.hstack((docs[lang].dot(self.lang_U[lang]), docs[lang].dot(self.lang_S[lang])))
        return _r

    def fit(self, config, docs, vocs, labels):
        if config['unsupervised']:
            self._add_embeddings_unsupervised(config['we_type'], docs, vocs, config['dim_reduction_unsupervised'])
        if config['supervised']:
            self._add_embeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs)
        return self

    def predict(self, config, docs):
        if config['supervised'] and config['unsupervised']:
            return self._concatenate_embeddings(docs)
        elif config['supervised']:
            _r = dict()
            for lang in docs.keys():
                _r[lang] = docs[lang].dot(self.lang_S[lang])
        else:
            _r = dict()
            for lang in docs.keys():
                _r[lang] = docs[lang].dot(self.lang_U[lang])
        return _r
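
# Illustrative sketch (not prescriptive): typical end-to-end use of StorageEmbeddings,
# where `docs` are per-language document-term matrices, `vocs` per-language
# word->column-index mappings, and `labels` per-language label matrices. The config keys
# mirror those read in fit()/predict(); the values shown are hypothetical.
#
#   config = {'unsupervised': True, 'supervised': True, 'we_type': 'MUSE',
#             'dim_reduction_unsupervised': 300, 'reduction': None, 'max_label_space': 300}
#   storage = StorageEmbeddings(path='/path/to/embeddings').fit(config, docs, vocs, labels)
#   doc_projections = storage.predict(config, docs)   # lang -> [n_docs x (dim_U + dim_S)] matrix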