Refactoring embed method into class StorageEmbeddings; refactoring class EmbeddingsAligned.
tSVD and t-SNE for supervised embeddings
This commit is contained in:
parent
cf29826a32
commit
4de6b3e250
@@ -11,7 +11,8 @@ from sklearn.svm import SVC
parser = OptionParser()

parser.add_option("-d", "--dataset", dest="dataset",
                  help="Path to the multilingual dataset processed and stored in .pickle format")
                  help="Path to the multilingual dataset processed and stored in .pickle format",
                  default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")

parser.add_option("-o", "--output", dest="output",
                  help="Result file", type=str, default='./results/results.csv')

@@ -23,7 +24,7 @@ parser.add_option("-w", "--we-path", dest="we_path",
                  help="Path to the polylingual word embeddings", default='../embeddings/')

parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str,
                  default='FastText')
                  default='MUSE')

parser.add_option("-s", "--set_c", dest="set_c", type=float,
                  help="Set the C parameter", default=1)

@@ -36,7 +37,7 @@ parser.add_option("-j", "--n_jobs", dest="n_jobs", type=int,

def get_learner(calibrate=False, kernel='linear'):
    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced')
    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto')


def get_params(dense=False):

@@ -64,6 +65,7 @@ if __name__ == '__main__':
    data.show_dimensions()

    # data.set_view(languages=['en','it'], categories=list(range(10)))
    # data.set_view(languages=['en','it'])
    lXtr, lytr = data.training()
    lXte, lyte = data.test()

@@ -100,6 +102,10 @@ if __name__ == '__main__':
              'we_type': op.we_type}
    _config_id = 'M_and_F'

    ##### TODO - config dict is redundant - we have already op argparse ...
    config['reduction'] = 'tSVD'
    config['max_label_space'] = 50

    result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')

    print(f'### PolyEmbedd_andrea_{_config_id}\n')

@@ -114,7 +120,7 @@ if __name__ == '__main__':
    print('# Fitting ...')
    classifier.fit(lXtr, lytr)

    print('# Evaluating ...')
    print('\n# Evaluating ...')
    l_eval = evaluate_method(classifier, lXte, lyte)

    metrics = []

@@ -5,6 +5,7 @@ from torchtext.vocab import Vectors
import torch
from abc import ABC, abstractmethod
from data.supervised import get_supervised_embeddings
from sklearn.decomposition import PCA


class PretrainedEmbeddings(ABC):

@@ -157,16 +158,41 @@ class FastTextWikiNews(Vectors):
        super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)


# class EmbeddingsAligned(Vectors):
#
#     def __init__(self, type, path, lang):
#
#         self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec'
#         # todo - rewrite as relative path
#         self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT'
#         self.path = path + self.name.format(lang)
#         assert os.path.exists(path), f'pre-trained vectors not found in {path}'
#         super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path)
#         # self.vectors = self.extract(voc)
#
#     def vocabulary(self):
#         return set(self.stoi.keys())
#
#     def dim(self):
#         return self.dim
#
#     def extract(self, words):
#         source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi)
#         extraction = torch.zeros((len(words), self.dim))
#         extraction[source_idx] = self.vectors[target_idx]
#         return extraction


class EmbeddingsAligned(Vectors):

    def __init__(self, type, path, lang):

        self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec'
    def __init__(self, type, path, lang, voc):
        # todo - rewrite as relative path
        self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec'
        self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT'
        self.path = path + self.name.format(lang)
        assert os.path.exists(path), f'pre-trained vectors not found in {path}'
        super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path)
        self.vectors = self.extract(voc)

    def vocabulary(self):
        return set(self.stoi.keys())

@@ -203,20 +229,69 @@ class FastTextMUSE(PretrainedEmbeddings):
        return extraction


def embedding_matrix(type, path, voc, lang):
    vocabulary = np.asarray(list(zip(*sorted(voc.items(), key=lambda x:x[1])))[0])
class StorageEmbeddings:
    def __init__(self, path):
        self.path = path
        self.lang_U = dict()
        self.lang_S = dict()

    print('[embedding matrix]')
    print(f'# [pretrained-matrix: {type} {lang}]')
    pretrained = EmbeddingsAligned(type, path, lang)
    P = pretrained.extract(vocabulary).numpy()
    del pretrained
    print(f'[embedding matrix done] of shape={P.shape}\n')
    def _add_embeddings_unsupervised(self, type, docs, vocs):
        for lang in docs.keys():
            print(f'# [unsupervised-matrix {type}] for {lang}')
            voc = np.asarray(list(zip(*sorted(vocs[lang].items(), key=lambda x: x[1])))[0])
            self.lang_U[lang] = EmbeddingsAligned(type, self.path, lang, voc).vectors
            print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n')
        return

    return vocabulary, P
    def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space):
        for lang in docs.keys():
            print(f'# [supervised-matrix] for {lang}')
            # should also pass max_label_space and reduction techniques
            self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space)
            print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')
        return

    def _concatenate_embeddings(self, docs):
        _r = dict()
        for lang in self.lang_U.keys():
            _r[lang] = np.hstack((docs[lang].dot(self.lang_U[lang]), docs[lang].dot(self.lang_S[lang])))
        return _r

    def fit(self, config, docs, vocs, labels):
        if config['unsupervised']:
            self._add_embeddings_unsupervised(config['we_type'], docs, vocs)
        if config['supervised']:
            self._add_emebeddings_supervised(docs, labels, config['reduction'], config['max_label_space'])
        return self

    def predict(self, config, docs):
        if config['supervised'] and config['unsupervised']:
            return self._concatenate_embeddings(docs)
        elif config['supervised']:
            _r = dict()
            for lang in docs.keys():
                _r[lang] = docs[lang].dot(self.lang_S[lang])
        else:
            _r = dict()
            for lang in docs.keys():
                _r[lang] = docs[lang].dot(self.lang_U[lang])
        return _r


def WCE_matrix(Xtr, Ytr, lang):
# def embedding_matrix(type, path, voc, lang):
#     vocabulary = np.asarray(list(zip(*sorted(voc.items(), key=lambda x: x[1])))[0])
#
#     print('[embedding matrix]')
#     print(f'# [pretrained-matrix: {type} {lang}]')
#     pretrained = EmbeddingsAligned(type, path, lang)
#     P = pretrained.extract(vocabulary).numpy()
#     del pretrained
#     print(f'[embedding matrix done] of shape={P.shape}\n')
#
#     return vocabulary, P


def WCE_matrix(Xtr, Ytr, lang, reduction=None, n_components=50):
    print('\n# [supervised-matrix]')
    S = get_supervised_embeddings(Xtr[lang], Ytr[lang])
    print(f'[embedding matrix done] of shape={S.shape}\n')

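For orientation, here is a minimal, self-contained sketch of what StorageEmbeddings.predict computes per language: the tf-idf document-term matrix is multiplied by the pretrained embedding matrix (U) and/or the supervised matrix (S), giving a weighted-sum document embedding, and the two blocks are horizontally stacked when both are enabled. The shapes and variable names below are illustrative stand-ins, not the repository's data.

# Minimal sketch of the per-language weighted-sum logic in StorageEmbeddings
# (illustrative shapes only; X, U and S are random stand-ins, not repo data).
import numpy as np
from scipy.sparse import random as sparse_random

n_docs, n_terms, dim_U, dim_S = 100, 5000, 300, 50
X = sparse_random(n_docs, n_terms, density=0.01, format='csr')   # tf-idf document-term matrix
U = np.random.rand(n_terms, dim_U)                               # aligned word embeddings
S = np.random.rand(n_terms, dim_S)                               # supervised (WCE) matrix

doc_U = X.dot(U)                      # weighted sum of unsupervised embeddings
doc_S = X.dot(S)                      # weighted sum of supervised embeddings
doc_both = np.hstack((doc_U, doc_S))  # what _concatenate_embeddings returns
print(doc_U.shape, doc_S.shape, doc_both.shape)  # (100, 300) (100, 50) (100, 350)

When only one of the two embedding types is enabled, predict returns just the corresponding product, as in the elif/else branches above.
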
@@ -1,6 +1,6 @@
from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square
# from util.common import *
from sklearn.decomposition import PCA
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE
import numpy as np

@@ -40,7 +40,7 @@ def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents=
    return F


def get_supervised_embeddings(X, Y, max_label_space=300, binary_structural_problems=-1, method='dotn', dozscore=True):
def get_supervised_embeddings(X, Y, reduction, max_label_space=300, binary_structural_problems=-1, method='dotn', dozscore=True):
    print('computing supervised embeddings...')

    nC = Y.shape[1]

@@ -60,10 +60,21 @@ def get_supervised_embeddings(X, Y, max_label_space=300, binary_structural_probl
        F = zscores(F, axis=0)

    if nC > max_label_space:
    if reduction == 'PCA':
        print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
              f'Applying PCA(n_components={max_label_space})')
        pca = PCA(n_components=max_label_space)
        F = pca.fit(F).transform(F)
    elif reduction == 'TSNE':
        print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
              f'Applying t-SNE(n_components={max_label_space})')
        tsne = TSNE(n_components=max_label_space)
        F = tsne.fit(F).fit_transform(F)
    elif reduction == 'tSVD':
        print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
              f'Applying truncatedSVD(n_components={max_label_space})')
        tSVD = TruncatedSVD(n_components=max_label_space)
        F = tSVD.fit(F).fit_transform(F)

    return F

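The new reduction branch compresses the supervised (word-class) matrix when it has more label dimensions than max_label_space. Below is a small standalone sketch of the tSVD option, which the main script sets as the default; the shapes are made up for illustration and are not taken from the repository's data.

# Standalone sketch of the tSVD reduction applied to a supervised matrix F (|V| x nC).
import numpy as np
from sklearn.decomposition import TruncatedSVD

n_terms, nC, max_label_space = 10000, 300, 50
F = np.random.rand(n_terms, nC)           # term-by-label supervised matrix

svd = TruncatedSVD(n_components=max_label_space)
F_reduced = svd.fit_transform(F)          # a single fit_transform suffices here
print(F.shape, '->', F_reduced.shape)     # (10000, 300) -> (10000, 50)

Note that scikit-learn's TSNE only supports up to 3 output components with its default method='barnes_hut', so the TSNE branch would need method='exact' to reach a 50-dimensional target space.
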
@@ -1,6 +1,6 @@
import numpy as np
import time
from data.embeddings import WordEmbeddings, embedding_matrix, WCE_matrix
from data.embeddings import WordEmbeddings, WCE_matrix, StorageEmbeddings
from scipy.sparse import issparse
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV

@@ -458,8 +458,9 @@ class AndreaCLF(FunnellingPolylingualClassifier):
        self.lang_word2idx = dict()
        self.languages = []
        self.lang_tfidf = {}
        self.word_embeddings = {}
        self.supervised_embeddings = {}
        # self.word_embeddings = {}
        # self.supervised_embeddings = {}
        self.embedding_space = None
        self.model = None
        self.time = None

@@ -492,42 +493,42 @@ class AndreaCLF(FunnellingPolylingualClassifier):

        return lZ, lYtr

    def embed(self, lX, ly, unsupervised=False, supervised=False, prediction=False):
        """
        build embedding matrix for given language and returns its weighted sum wrt tf-idf score
        """
        _r = dict()
        languages = list(lX.keys())

        if prediction:
            for lang in languages:
                if unsupervised:  # If unsupervised embeddings ...
                    M = self.word_embeddings[lang]
                    if supervised:  # and also unsupervised --> get both (M) and (S) weighted sum matrices and hstack them
                        S = self.supervised_embeddings[lang]
                        _r[lang] = np.hstack((lX[lang].dot(M), lX[lang].dot(S)))
                        continue
                    _r[lang] = lX[lang].dot(M)  # if not supervised --> just get weighted sum of unsupervised (M) embeddings
                else:  # If not unsupervised --> get (S) matrix and its weighted sum
                    S = self.supervised_embeddings[lang]
                    _r[lang] = lX[lang].dot(S)
            return _r

        if unsupervised:
            for lang in languages:
                _, M = embedding_matrix(self.config['we_type'], self.we_path, self.lang_word2idx[lang], lang)
                self.word_embeddings[lang] = M
                _r[lang] = lX[lang].dot(M)

        if supervised:
            for lang in languages:
                S = WCE_matrix(lX, ly, lang)
                self.supervised_embeddings[lang] = S
                if unsupervised:
                    _r[lang] = np.hstack((_r[lang], lX[lang].dot(S)))
                else:
                    _r[lang] = lX[lang].dot(S)
        return _r
    # def embed(self, lX, ly, unsupervised=False, supervised=False, prediction=False):
    #     """
    #     build embedding matrix for given language and returns its weighted sum wrt tf-idf score
    #     """
    #     _r = dict()
    #     languages = list(lX.keys())
    #
    #     if prediction:
    #         for lang in languages:
    #             if unsupervised:  # If unsupervised embeddings ...
    #                 M = self.word_embeddings[lang]
    #                 if supervised:  # and also unsupervised --> get both (M) and (S) weighted sum matrices and hstack them
    #                     S = self.supervised_embeddings[lang]
    #                     _r[lang] = np.hstack((lX[lang].dot(M), lX[lang].dot(S)))
    #                     continue
    #                 _r[lang] = lX[lang].dot(M)  # if not supervised --> just get weighted sum of unsupervised (M) embeddings
    #             else:  # If not unsupervised --> get (S) matrix and its weighted sum
    #                 S = self.supervised_embeddings[lang]
    #                 _r[lang] = lX[lang].dot(S)
    #         return _r
    #
    #     if unsupervised:
    #         for lang in languages:
    #             _, M = embedding_matrix(self.config['we_type'], self.we_path, self.lang_word2idx[lang], lang)
    #             self.word_embeddings[lang] = M
    #             _r[lang] = lX[lang].dot(M)
    #
    #     if supervised:
    #         for lang in languages:
    #             S = WCE_matrix(lX, ly, lang)
    #             self.supervised_embeddings[lang] = S
    #             if unsupervised:
    #                 _r[lang] = np.hstack((_r[lang], lX[lang].dot(S)))
    #             else:
    #                 _r[lang] = lX[lang].dot(S)
    #     return _r

    # @override std class method
    def fit(self, lX, ly):

@@ -541,17 +542,11 @@ class AndreaCLF(FunnellingPolylingualClassifier):
        Z, zy = self._get_zspace(lX, ly)

        if self.config['supervised'] or self.config['unsupervised']:
            # Z vectors is concatenated with doc's embedding weighted sum
            Z_embedded = dict()
            l_weighted_em = self.embed(lX, ly,
                                       unsupervised=self.config['unsupervised'],
                                       supervised=self.config['supervised'])

            # stacking Z space horizontally with unsupervised (M) and/or supervised (F) embeddings
            for lang in list(lX.keys()):
                Z_embedded[lang] = np.hstack((Z[lang], l_weighted_em[lang]))
            Z = Z_embedded

            self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly)
            _embedding_space = self.embedding_space.predict(self.config, lX)
            # h_stacking posterior probabilities with (U) and/or (S) matrices
            for lang in self.languages:
                Z[lang] = np.hstack((Z[lang], _embedding_space[lang]))

        # stacking Z space vertically
        _vertical_Z = np.vstack([Z[lang] for lang in self.languages])

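In fit, each language's posterior-probability matrix Z is horizontally stacked with the embedding features returned by StorageEmbeddings.predict, and the languages are then stacked vertically into one training matrix for the meta-classifier. A toy sketch of the shape bookkeeping follows; the names and sizes are made up for illustration and do not come from the repository.

# Toy sketch of the stacking performed in AndreaCLF.fit (illustrative only).
import numpy as np

langs = ['en', 'it']
n_docs, n_classes, emb_dim = 100, 73, 350

Z = {l: np.random.rand(n_docs, n_classes) for l in langs}   # posterior probabilities per language
emb = {l: np.random.rand(n_docs, emb_dim) for l in langs}   # StorageEmbeddings.predict output

for l in langs:
    Z[l] = np.hstack((Z[l], emb[l]))        # horizontal stacking per language

vertical_Z = np.vstack([Z[l] for l in langs])
print(vertical_Z.shape)                     # (200, 423): all languages share one feature space
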
@@ -573,14 +568,15 @@ class AndreaCLF(FunnellingPolylingualClassifier):
        lZ = self._projection(self.doc_projector, lX)

        if self.config['supervised'] or self.config['unsupervised']:
            l_weighted_em = self.embed(lX, ly,
                                       unsupervised=self.config['unsupervised'],
                                       supervised=self.config['supervised'],
                                       prediction=True)
            Z_embedded = dict()
            _embedding_space = self.embedding_space.predict(self.config, lX)
            # l_weighted_em = self.embed(lX, ly,
            #                            unsupervised=self.config['unsupervised'],
            #                            supervised=self.config['supervised'],
            #                            prediction=True)
            # Z_embedded = dict()
            for lang in lX.keys():
                Z_embedded[lang] = np.hstack((lZ[lang], l_weighted_em[lang]))
            lZ = Z_embedded
                lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang]))
            # lZ = Z_embedded

        for lang in lZ.keys():
            print(lZ[lang].shape)

@@ -12,7 +12,7 @@ class StandardizeTransformer:
        self.std = np.clip(std, 1e-5, None)
        self.mean = np.mean(X, axis=self.axis)
        self.yetfit=True
        print('done')
        print('done\n')
        return self

    def predict(self, X):