refactoring embed method into class StorageEmbeddings; refactoring class EmbeddingsAligned.

tSVD and T-SNE for supervised embeddings
This commit is contained in:
andrea 2019-12-03 15:34:12 +01:00
parent cf29826a32
commit 4de6b3e250
5 changed files with 170 additions and 82 deletions

View File

@@ -11,7 +11,8 @@ from sklearn.svm import SVC
parser = OptionParser()
parser.add_option("-d", "--dataset", dest="dataset",
help="Path to the multilingual dataset processed and stored in .pickle format")
help="Path to the multilingual dataset processed and stored in .pickle format",
default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
parser.add_option("-o", "--output", dest="output",
help="Result file", type=str, default='./results/results.csv')
@@ -23,7 +24,7 @@ parser.add_option("-w", "--we-path", dest="we_path",
help="Path to the polylingual word embeddings", default='../embeddings/')
parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str,
default='FastText')
default='MUSE')
parser.add_option("-s", "--set_c", dest="set_c",type=float,
help="Set the C parameter", default=1)
@@ -36,7 +37,7 @@ parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,
def get_learner(calibrate=False, kernel='linear'):
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced')
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto')
def get_params(dense=False):
@@ -64,6 +65,7 @@ if __name__ == '__main__':
data.show_dimensions()
# data.set_view(languages=['en','it'], categories=list(range(10)))
# data.set_view(languages=['en','it'])
lXtr, lytr = data.training()
lXte, lyte = data.test()
@@ -100,6 +102,10 @@ if __name__ == '__main__':
'we_type': op.we_type}
_config_id = 'M_and_F'
##### TODO - config dict is redundant - we already have op from argparse ...
config['reduction'] = 'tSVD'
config['max_label_space'] = 50
result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
print(f'### PolyEmbedd_andrea_{_config_id}\n')
@@ -114,7 +120,7 @@ if __name__ == '__main__':
print('# Fitting ...')
classifier.fit(lXtr, lytr)
print('# Evaluating ...')
print('\n# Evaluating ...')
l_eval = evaluate_method(classifier, lXte, lyte)
metrics = []
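For context, these are the configuration keys the refactored embedding code reads; a minimal sketch of the resulting dict (the 'unsupervised'/'supervised' flags are an assumption based on how StorageEmbeddings.fit consumes the config, the remaining values come from this hunk):

config = {
    'unsupervised': True,      # assumed flag: build aligned pretrained (MUSE/FastText) embeddings
    'supervised': True,        # assumed flag: build word-class (WCE) embeddings
    'we_type': op.we_type,     # 'MUSE' or 'FastText'
    'reduction': 'tSVD',       # one of 'PCA', 'TSNE', 'tSVD'
    'max_label_space': 50,     # target dimensionality for the supervised matrix
}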

View File

@ -5,6 +5,7 @@ from torchtext.vocab import Vectors
import torch
from abc import ABC, abstractmethod
from data.supervised import get_supervised_embeddings
from sklearn.decomposition import PCA
class PretrainedEmbeddings(ABC):
@@ -157,16 +158,41 @@ class FastTextWikiNews(Vectors):
super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)
# class EmbeddingsAligned(Vectors):
#
# def __init__(self, type, path, lang):
#
# self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec'
# # todo - rewrite as relative path
# self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT'
# self.path = path + self.name.format(lang)
# assert os.path.exists(path), f'pre-trained vectors not found in {path}'
# super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path)
# # self.vectors = self.extract(voc)
#
# def vocabulary(self):
# return set(self.stoi.keys())
#
# def dim(self):
# return self.dim
#
# def extract(self, words):
# source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi)
# extraction = torch.zeros((len(words), self.dim))
# extraction[source_idx] = self.vectors[target_idx]
# return extraction
class EmbeddingsAligned(Vectors):
def __init__(self, type, path, lang):
self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec'
def __init__(self, type, path, lang, voc):
# todo - rewrite as relative path
self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec'
self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT'
self.path = path + self.name.format(lang)
assert os.path.exists(path), f'pre-trained vectors not found in {path}'
super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path)
self.vectors = self.extract(voc)
def vocabulary(self):
return set(self.stoi.keys())
@@ -203,20 +229,69 @@ class FastTextMUSE(PretrainedEmbeddings):
return extraction
def embedding_matrix(type, path, voc, lang):
vocabulary = np.asarray(list(zip(*sorted(voc.items(), key=lambda x:x[1])))[0])
class StorageEmbeddings:
def __init__(self, path):
self.path = path
self.lang_U = dict()
self.lang_S = dict()
print('[embedding matrix]')
print(f'# [pretrained-matrix: {type} {lang}]')
pretrained = EmbeddingsAligned(type, path, lang)
P = pretrained.extract(vocabulary).numpy()
del pretrained
print(f'[embedding matrix done] of shape={P.shape}\n')
def _add_embeddings_unsupervised(self, type, docs, vocs):
for lang in docs.keys():
print(f'# [unsupervised-matrix {type}] for {lang}')
voc = np.asarray(list(zip(*sorted(vocs[lang].items(), key=lambda x: x[1])))[0])
self.lang_U[lang] = EmbeddingsAligned(type, self.path, lang, voc).vectors
print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n')
return
return vocabulary, P
def _add_embeddings_supervised(self, docs, labels, reduction, max_label_space):
for lang in docs.keys():
print(f'# [supervised-matrix] for {lang}')
# should also pass max_label_space and reduction techniques
self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space)
print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')
return
def _concatenate_embeddings(self, docs):
_r = dict()
for lang in self.lang_U.keys():
_r[lang] = np.hstack((docs[lang].dot(self.lang_U[lang]), docs[lang].dot(self.lang_S[lang])))
return _r
def fit(self, config, docs, vocs, labels):
if config['unsupervised']:
self._add_embeddings_unsupervised(config['we_type'], docs, vocs)
if config['supervised']:
self._add_embeddings_supervised(docs, labels, config['reduction'], config['max_label_space'])
return self
def predict(self, config, docs):
if config['supervised'] and config['unsupervised']:
return self._concatenate_embeddings(docs)
elif config['supervised']:
_r = dict()
for lang in docs.keys():
_r[lang] = docs[lang].dot(self.lang_S[lang])
else:
_r = dict()
for lang in docs.keys():
_r[lang] = docs[lang].dot(self.lang_U[lang])
return _r
def WCE_matrix(Xtr, Ytr, lang):
# def embedding_matrix(type, path, voc, lang):
# vocabulary = np.asarray(list(zip(*sorted(voc.items(), key=lambda x: x[1])))[0])
#
# print('[embedding matrix]')
# print(f'# [pretrained-matrix: {type} {lang}]')
# pretrained = EmbeddingsAligned(type, path, lang)
# P = pretrained.extract(vocabulary).numpy()
# del pretrained
# print(f'[embedding matrix done] of shape={P.shape}\n')
#
# return vocabulary, P
def WCE_matrix(Xtr, Ytr, lang, reduction=None, n_components=50):
print('\n# [supervised-matrix]')
S = get_supervised_embeddings(Xtr[lang], Ytr[lang], reduction, n_components)
print(f'[embedding matrix done] of shape={S.shape}\n')
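A minimal usage sketch of the new StorageEmbeddings class, assuming the per-language dicts lXtr/lytr/lXte and the vocabulary map lang_word2idx used by the calling code in learners.py: the unsupervised (U) and/or supervised (S) matrices are computed once in fit(), and predict() returns their tf-idf-weighted document projections per language.

embedder = StorageEmbeddings(we_path).fit(config, lXtr, lang_word2idx, lytr)
lXtr_embed = embedder.predict(config, lXtr)  # {lang: n_docs x (U [+ S]) matrix}
lXte_embed = embedder.predict(config, lXte)  # same U/S matrices reused at test time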

View File

@@ -1,6 +1,6 @@
from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square
# from util.common import *
from sklearn.decomposition import PCA
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE
import numpy as np
@@ -40,7 +40,7 @@ def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents=
return F
def get_supervised_embeddings(X, Y, max_label_space=300, binary_structural_problems=-1, method='dotn', dozscore=True):
def get_supervised_embeddings(X, Y, reduction, max_label_space=300, binary_structural_problems=-1, method='dotn', dozscore=True):
print('computing supervised embeddings...')
nC = Y.shape[1]
@@ -60,10 +60,21 @@ def get_supervised_embeddings(X, Y, max_label_space=300, binary_structural_probl
F = zscores(F, axis=0)
if nC > max_label_space:
if reduction == 'PCA':
print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
f'Applying PCA(n_components={max_label_space})')
pca = PCA(n_components=max_label_space)
F = pca.fit(F).transform(F)
elif reduction == 'TSNE':
print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
f'Applying t-SNE(n_components={max_label_space})')
tsne = TSNE(n_components=max_label_space, method='exact')
F = tsne.fit_transform(F)
elif reduction == 'tSVD':
print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
f'Applying truncatedSVD(n_components={max_label_space})')
tSVD = TruncatedSVD(n_components=max_label_space)
F = tSVD.fit_transform(F)
return F
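For reference, a condensed sketch of the reduction step introduced above (the helper name reduce_label_space is hypothetical); note that scikit-learn's TSNE only supports n_components up to 3 with the default barnes_hut method, so a 50-dimensional target requires method='exact'.

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE

def reduce_label_space(F, reduction, max_label_space):
    # F: (n_features x n_classes) word-class correlation matrix
    if reduction == 'PCA':
        return PCA(n_components=max_label_space).fit_transform(F)
    if reduction == 'TSNE':
        # barnes_hut is limited to 3 output dimensions; 'exact' handles larger targets
        return TSNE(n_components=max_label_space, method='exact').fit_transform(F)
    if reduction == 'tSVD':
        return TruncatedSVD(n_components=max_label_space).fit_transform(F)
    return F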

View File

@@ -1,6 +1,6 @@
import numpy as np
import time
from data.embeddings import WordEmbeddings, embedding_matrix, WCE_matrix
from data.embeddings import WordEmbeddings, WCE_matrix, StorageEmbeddings
from scipy.sparse import issparse
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
@@ -458,8 +458,9 @@ class AndreaCLF(FunnellingPolylingualClassifier):
self.lang_word2idx = dict()
self.languages = []
self.lang_tfidf = {}
self.word_embeddings = {}
self.supervised_embeddings = {}
# self.word_embeddings = {}
# self.supervised_embeddings = {}
self.embedding_space = None
self.model = None
self.time = None
@@ -492,42 +493,42 @@ class AndreaCLF(FunnellingPolylingualClassifier):
return lZ, lYtr
def embed(self, lX, ly, unsupervised=False, supervised=False, prediction=False):
"""
build embedding matrix for given language and returns its weighted sum wrt tf-idf score
"""
_r = dict()
languages = list(lX.keys())
if prediction:
for lang in languages:
if unsupervised: # If unsupervised embeddings ...
M = self.word_embeddings[lang]
if supervised: # and also unsupervised --> get both (M) and (S) weighted sum matrices and hstack them
S = self.supervised_embeddings[lang]
_r[lang] = np.hstack((lX[lang].dot(M), lX[lang].dot(S)))
continue
_r[lang] = lX[lang].dot(M) # if not supervised --> just get weighted sum of unsupervised (M) embeddings
else: # If not unsupervised --> get (S) matrix and its weighted sum
S = self.supervised_embeddings[lang]
_r[lang] = lX[lang].dot(S)
return _r
if unsupervised:
for lang in languages:
_, M = embedding_matrix(self.config['we_type'], self.we_path, self.lang_word2idx[lang], lang)
self.word_embeddings[lang] = M
_r[lang] = lX[lang].dot(M)
if supervised:
for lang in languages:
S = WCE_matrix(lX, ly, lang)
self.supervised_embeddings[lang] = S
if unsupervised:
_r[lang] = np.hstack((_r[lang], lX[lang].dot(S)))
else:
_r[lang] = lX[lang].dot(S)
return _r
# def embed(self, lX, ly, unsupervised=False, supervised=False, prediction=False):
# """
# build embedding matrix for given language and returns its weighted sum wrt tf-idf score
# """
# _r = dict()
# languages = list(lX.keys())
#
# if prediction:
# for lang in languages:
# if unsupervised: # If unsupervised embeddings ...
# M = self.word_embeddings[lang]
# if supervised: # and also unsupervised --> get both (M) and (S) weighted sum matrices and hstack them
# S = self.supervised_embeddings[lang]
# _r[lang] = np.hstack((lX[lang].dot(M), lX[lang].dot(S)))
# continue
# _r[lang] = lX[lang].dot(M) # if not supervised --> just get weighted sum of unsupervised (M) embeddings
# else: # If not unsupervised --> get (S) matrix and its weighted sum
# S = self.supervised_embeddings[lang]
# _r[lang] = lX[lang].dot(S)
# return _r
#
# if unsupervised:
# for lang in languages:
# _, M = embedding_matrix(self.config['we_type'], self.we_path, self.lang_word2idx[lang], lang)
# self.word_embeddings[lang] = M
# _r[lang] = lX[lang].dot(M)
#
# if supervised:
# for lang in languages:
# S = WCE_matrix(lX, ly, lang)
# self.supervised_embeddings[lang] = S
# if unsupervised:
# _r[lang] = np.hstack((_r[lang], lX[lang].dot(S)))
# else:
# _r[lang] = lX[lang].dot(S)
# return _r
# @override std class method
def fit(self, lX, ly):
@@ -541,17 +542,11 @@ class AndreaCLF(FunnellingPolylingualClassifier):
Z, zy = self._get_zspace(lX, ly)
if self.config['supervised'] or self.config['unsupervised']:
# Z vectors is concatenated with doc's embedding weighted sum
Z_embedded = dict()
l_weighted_em = self.embed(lX, ly,
unsupervised=self.config['unsupervised'],
supervised=self.config['supervised'])
# stacking Z space horizontally with unsupervised (M) and/or supervised (F) embeddings
for lang in list(lX.keys()):
Z_embedded[lang] = np.hstack((Z[lang], l_weighted_em[lang]))
Z = Z_embedded
self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly)
_embedding_space = self.embedding_space.predict(self.config, lX)
# h_stacking posterior probabilities with (U) and/or (S) matrices
for lang in self.languages:
Z[lang] = np.hstack((Z[lang], _embedding_space[lang]))
# stacking Z space vertically
_vertical_Z = np.vstack([Z[lang] for lang in self.languages])
@@ -573,14 +568,15 @@ class AndreaCLF(FunnellingPolylingualClassifier):
lZ = self._projection(self.doc_projector, lX)
if self.config['supervised'] or self.config['unsupervised']:
l_weighted_em = self.embed(lX, ly,
unsupervised=self.config['unsupervised'],
supervised=self.config['supervised'],
prediction=True)
Z_embedded = dict()
_embedding_space = self.embedding_space.predict(self.config, lX)
# l_weighted_em = self.embed(lX, ly,
# unsupervised=self.config['unsupervised'],
# supervised=self.config['supervised'],
# prediction=True)
# Z_embedded = dict()
for lang in lX.keys():
Z_embedded[lang] = np.hstack((lZ[lang], l_weighted_em[lang]))
lZ = Z_embedded
lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang]))
# lZ = Z_embedded
for lang in lZ.keys():
print(lZ[lang].shape)

View File

@@ -12,7 +12,7 @@ class StandardizeTransformer:
self.std = np.clip(std, 1e-5, None)
self.mean = np.mean(X, axis=self.axis)
self.yetfit=True
print('done')
print('done\n')
return self
def predict(self, X):