diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py
index f8edfad..9be7c42 100644
--- a/src/FPEC_andrea.py
+++ b/src/FPEC_andrea.py
@@ -11,7 +11,8 @@ from sklearn.svm import SVC
 
 parser = OptionParser()
 parser.add_option("-d", "--dataset", dest="dataset",
-                  help="Path to the multilingual dataset processed and stored in .pickle format")
+                  help="Path to the multilingual dataset processed and stored in .pickle format",
+                  default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
 
 parser.add_option("-o", "--output", dest="output",
                   help="Result file", type=str, default='./results/results.csv')
@@ -23,7 +24,7 @@ parser.add_option("-w", "--we-path", dest="we_path",
                   help="Path to the polylingual word embeddings", default='../embeddings/')
 
 parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str,
-                  default='FastText')
+                  default='MUSE')
 
 parser.add_option("-s", "--set_c", dest="set_c",type=float,
                   help="Set the C parameter", default=1)
@@ -36,7 +37,7 @@ parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,
 
 
 def get_learner(calibrate=False, kernel='linear'):
-    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced')
+    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto')
 
 
 def get_params(dense=False):
@@ -64,6 +65,7 @@ if __name__ == '__main__':
     data.show_dimensions()
 
     # data.set_view(languages=['en','it'], categories=list(range(10)))
+    # data.set_view(languages=['en','it'])
 
     lXtr, lytr = data.training()
     lXte, lyte = data.test()
@@ -100,6 +102,10 @@ if __name__ == '__main__':
              'we_type': op.we_type}
 
     _config_id = 'M_and_F'
+    ##### TODO - config dict is redundant - we have already op argparse ...
+    config['reduction'] = 'tSVD'
+    config['max_label_space'] = 50
+
     result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
 
     print(f'### PolyEmbedd_andrea_{_config_id}\n')
@@ -114,7 +120,7 @@ if __name__ == '__main__':
     print('# Fitting ...')
     classifier.fit(lXtr, lytr)
 
-    print('# Evaluating ...')
+    print('\n# Evaluating ...')
     l_eval = evaluate_method(classifier, lXte, lyte)
 
     metrics = []
diff --git a/src/data/embeddings.py b/src/data/embeddings.py
index 0598feb..66a14d0 100644
--- a/src/data/embeddings.py
+++ b/src/data/embeddings.py
@@ -5,6 +5,7 @@ from torchtext.vocab import Vectors
 import torch
 from abc import ABC, abstractmethod
 from data.supervised import get_supervised_embeddings
+from sklearn.decomposition import PCA
 
 
 class PretrainedEmbeddings(ABC):
@@ -157,16 +158,41 @@ class FastTextWikiNews(Vectors):
         super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)
 
 
+# class EmbeddingsAligned(Vectors):
+#
+#     def __init__(self, type, path, lang):
+#
+#         self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec'
+#         # todo - rewrite as relative path
+#         self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT'
+#         self.path = path + self.name.format(lang)
+#         assert os.path.exists(path), f'pre-trained vectors not found in {path}'
+#         super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path)
+#         # self.vectors = self.extract(voc)
+#
+#     def vocabulary(self):
+#         return set(self.stoi.keys())
+#
+#     def dim(self):
+#         return self.dim
+#
+#     def extract(self, words):
+#         source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi)
+#         extraction = torch.zeros((len(words), self.dim))
+#         extraction[source_idx] = self.vectors[target_idx]
+#         return extraction
+
+
 class EmbeddingsAligned(Vectors):
 
-    def __init__(self, type, path, lang):
-
-        self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec'
+    def __init__(self, type, path, lang, voc):
         # todo - rewrite as relative path
+        self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec'
         self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT'
         self.path = path + self.name.format(lang)
         assert os.path.exists(path), f'pre-trained vectors not found in {path}'
         super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path)
+        self.vectors = self.extract(voc)
 
     def vocabulary(self):
         return set(self.stoi.keys())
@@ -203,20 +229,69 @@ class FastTextMUSE(PretrainedEmbeddings):
         return extraction
 
 
-def embedding_matrix(type, path, voc, lang):
-    vocabulary = np.asarray(list(zip(*sorted(voc.items(), key=lambda x:x[1])))[0])
+class StorageEmbeddings:
+    def __init__(self, path):
+        self.path = path
+        self.lang_U = dict()
+        self.lang_S = dict()
 
-    print('[embedding matrix]')
-    print(f'# [pretrained-matrix: {type} {lang}]')
-    pretrained = EmbeddingsAligned(type, path, lang)
-    P = pretrained.extract(vocabulary).numpy()
-    del pretrained
-    print(f'[embedding matrix done] of shape={P.shape}\n')
+    def _add_embeddings_unsupervised(self, type, docs, vocs):
+        for lang in docs.keys():
+            print(f'# [unsupervised-matrix {type}] for {lang}')
+            voc = np.asarray(list(zip(*sorted(vocs[lang].items(), key=lambda x: x[1])))[0])
+            self.lang_U[lang] = EmbeddingsAligned(type, self.path, lang, voc).vectors
+            print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n')
+        return
 
-    return vocabulary, P
+    def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space):
+        for lang in docs.keys():
+            print(f'# [supervised-matrix] for {lang}')
+            # should also pass max_label_space and reduction techniques
+            self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space)
+            print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')
+        return
+
+    def _concatenate_embeddings(self, docs):
+        _r = dict()
+        for lang in self.lang_U.keys():
+            _r[lang] = np.hstack((docs[lang].dot(self.lang_U[lang]), docs[lang].dot(self.lang_S[lang])))
+        return _r
+
+    def fit(self, config, docs, vocs, labels):
+        if config['unsupervised']:
+            self._add_embeddings_unsupervised(config['we_type'], docs, vocs)
+        if config['supervised']:
+            self._add_emebeddings_supervised(docs, labels, config['reduction'], config['max_label_space'])
+        return self
+
+    def predict(self, config, docs):
+        if config['supervised'] and config['unsupervised']:
+            return self._concatenate_embeddings(docs)
+        elif config['supervised']:
+            _r = dict()
+            for lang in docs.keys():
+                _r[lang] = docs[lang].dot(self.lang_S[lang])
+        else:
+            _r = dict()
+            for lang in docs.keys():
+                _r[lang] = docs[lang].dot(self.lang_U[lang])
+        return _r
 
 
-def WCE_matrix(Xtr, Ytr, lang):
+# def embedding_matrix(type, path, voc, lang):
+#     vocabulary = np.asarray(list(zip(*sorted(voc.items(), key=lambda x: x[1])))[0])
+#
+#     print('[embedding matrix]')
+#     print(f'# [pretrained-matrix: {type} {lang}]')
+#     pretrained = EmbeddingsAligned(type, path, lang)
+#     P = pretrained.extract(vocabulary).numpy()
+#     del pretrained
+#     print(f'[embedding matrix done] of shape={P.shape}\n')
+#
+#     return vocabulary, P
+
+
+def WCE_matrix(Xtr, Ytr, lang, reduction=None, n_components=50):
     print('\n# [supervised-matrix]')
     S = get_supervised_embeddings(Xtr[lang], Ytr[lang])
     print(f'[embedding matrix done] of shape={S.shape}\n')
diff --git a/src/data/supervised.py b/src/data/supervised.py
index 5f97e7f..b3c4fb9 100755
--- a/src/data/supervised.py
+++ b/src/data/supervised.py
@@ -1,6 +1,6 @@
 from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square
-# from util.common import *
-from sklearn.decomposition import PCA
+from sklearn.decomposition import PCA, TruncatedSVD
+from sklearn.manifold import TSNE
 import numpy as np
 
 
@@ -40,7 +40,7 @@ def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents=
     return F
 
 
-def get_supervised_embeddings(X, Y, max_label_space=300, binary_structural_problems=-1, method='dotn', dozscore=True):
+def get_supervised_embeddings(X, Y, reduction, max_label_space=300, binary_structural_problems=-1, method='dotn', dozscore=True):
     print('computing supervised embeddings...')
     nC = Y.shape[1]
 
@@ -60,10 +60,21 @@ def get_supervised_embeddings(X, Y, max_label_space=300, binary_structural_probl
         F = zscores(F, axis=0)
 
     if nC > max_label_space:
-        print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
-              f'Applying PCA(n_components={max_label_space})')
-        pca = PCA(n_components=max_label_space)
-        F = pca.fit(F).transform(F)
+        if reduction == 'PCA':
+            print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
+                  f'Applying PCA(n_components={max_label_space})')
+            pca = PCA(n_components=max_label_space)
+            F = pca.fit(F).transform(F)
+        elif reduction == 'TSNE':
+            print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
+                  f'Applying t-SNE(n_components={max_label_space})')
+            tsne = TSNE(n_components=max_label_space)
+            F = tsne.fit(F).fit_transform(F)
+        elif reduction == 'tSVD':
+            print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
+                  f'Applying truncatedSVD(n_components={max_label_space})')
+            tSVD = TruncatedSVD(n_components=max_label_space)
+            F = tSVD.fit(F).fit_transform(F)
 
     return F
 
diff --git a/src/learning/learners.py b/src/learning/learners.py
index d01c734..89bda7e 100644
--- a/src/learning/learners.py
+++ b/src/learning/learners.py
@@ -1,6 +1,6 @@
 import numpy as np
 import time
-from data.embeddings import WordEmbeddings, embedding_matrix, WCE_matrix
+from data.embeddings import WordEmbeddings, WCE_matrix, StorageEmbeddings
 from scipy.sparse import issparse
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.model_selection import GridSearchCV
@@ -458,8 +458,9 @@ class AndreaCLF(FunnellingPolylingualClassifier):
         self.lang_word2idx = dict()
         self.languages = []
         self.lang_tfidf = {}
-        self.word_embeddings = {}
-        self.supervised_embeddings = {}
+        # self.word_embeddings = {}
+        # self.supervised_embeddings = {}
+        self.embedding_space = None
         self.model = None
         self.time = None
 
@@ -492,42 +493,42 @@ class AndreaCLF(FunnellingPolylingualClassifier):
         return lZ, lYtr
 
-    def embed(self, lX, ly, unsupervised=False, supervised=False, prediction=False):
-        """
-        build embedding matrix for given language and returns its weighted sum wrt tf-idf score
-        """
-        _r = dict()
-        languages = list(lX.keys())
-
-        if prediction:
-            for lang in languages:
-                if unsupervised:  # If unsupervised embeddings ...
-                    M = self.word_embeddings[lang]
-                    if supervised:  # and also unsupervised --> get both (M) and (S) weighted sum matrices and hstack them
-                        S = self.supervised_embeddings[lang]
-                        _r[lang] = np.hstack((lX[lang].dot(M), lX[lang].dot(S)))
-                        continue
-                    _r[lang] = lX[lang].dot(M)  # if not supervised --> just get weighted sum of unsupervised (M) embeddings
-                else:  # If not unsupervised --> get (S) matrix and its weighted sum
-                    S = self.supervised_embeddings[lang]
-                    _r[lang] = lX[lang].dot(S)
-            return _r
-
-        if unsupervised:
-            for lang in languages:
-                _, M = embedding_matrix(self.config['we_type'], self.we_path, self.lang_word2idx[lang], lang)
-                self.word_embeddings[lang] = M
-                _r[lang] = lX[lang].dot(M)
-
-        if supervised:
-            for lang in languages:
-                S = WCE_matrix(lX, ly, lang)
-                self.supervised_embeddings[lang] = S
-                if unsupervised:
-                    _r[lang] = np.hstack((_r[lang], lX[lang].dot(S)))
-                else:
-                    _r[lang] = lX[lang].dot(S)
-        return _r
+    # def embed(self, lX, ly, unsupervised=False, supervised=False, prediction=False):
+    #     """
+    #     build embedding matrix for given language and returns its weighted sum wrt tf-idf score
+    #     """
+    #     _r = dict()
+    #     languages = list(lX.keys())
+    #
+    #     if prediction:
+    #         for lang in languages:
+    #             if unsupervised:  # If unsupervised embeddings ...
+    #                 M = self.word_embeddings[lang]
+    #                 if supervised:  # and also unsupervised --> get both (M) and (S) weighted sum matrices and hstack them
+    #                     S = self.supervised_embeddings[lang]
+    #                     _r[lang] = np.hstack((lX[lang].dot(M), lX[lang].dot(S)))
+    #                     continue
+    #                 _r[lang] = lX[lang].dot(M)  # if not supervised --> just get weighted sum of unsupervised (M) embeddings
+    #             else:  # If not unsupervised --> get (S) matrix and its weighted sum
+    #                 S = self.supervised_embeddings[lang]
+    #                 _r[lang] = lX[lang].dot(S)
+    #         return _r
+    #
+    #     if unsupervised:
+    #         for lang in languages:
+    #             _, M = embedding_matrix(self.config['we_type'], self.we_path, self.lang_word2idx[lang], lang)
+    #             self.word_embeddings[lang] = M
+    #             _r[lang] = lX[lang].dot(M)
+    #
+    #     if supervised:
+    #         for lang in languages:
+    #             S = WCE_matrix(lX, ly, lang)
+    #             self.supervised_embeddings[lang] = S
+    #             if unsupervised:
+    #                 _r[lang] = np.hstack((_r[lang], lX[lang].dot(S)))
+    #             else:
+    #                 _r[lang] = lX[lang].dot(S)
+    #     return _r
 
     # @override std class method
     def fit(self, lX, ly):
@@ -541,17 +542,11 @@ class AndreaCLF(FunnellingPolylingualClassifier):
         Z, zy = self._get_zspace(lX, ly)
 
         if self.config['supervised'] or self.config['unsupervised']:
-            # Z vectors is concatenated with doc's embedding weighted sum
-            Z_embedded = dict()
-            l_weighted_em = self.embed(lX, ly,
-                                       unsupervised=self.config['unsupervised'],
-                                       supervised=self.config['supervised'])
-
-            # stacking Z space horizontally with unsupervised (M) and/or supervised (F) embeddings
-            for lang in list(lX.keys()):
-                Z_embedded[lang] = np.hstack((Z[lang], l_weighted_em[lang]))
-            Z = Z_embedded
-
+            self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly)
+            _embedding_space = self.embedding_space.predict(self.config, lX)
+            # h_stacking posterior probabilities with (U) and/or (S) matrices
+            for lang in self.languages:
+                Z[lang] = np.hstack((Z[lang], _embedding_space[lang]))
 
         # stacking Z space vertically
         _vertical_Z = np.vstack([Z[lang] for lang in self.languages])
@@ -573,14 +568,15 @@ class AndreaCLF(FunnellingPolylingualClassifier):
         lZ = self._projection(self.doc_projector, lX)
 
         if self.config['supervised'] or self.config['unsupervised']:
-            l_weighted_em = self.embed(lX, ly,
-                                       unsupervised=self.config['unsupervised'],
-                                       supervised=self.config['supervised'],
-                                       prediction=True)
-            Z_embedded = dict()
+            _embedding_space = self.embedding_space.predict(self.config, lX)
+            # l_weighted_em = self.embed(lX, ly,
+            #                            unsupervised=self.config['unsupervised'],
+            #                            supervised=self.config['supervised'],
+            #                            prediction=True)
+            # Z_embedded = dict()
             for lang in lX.keys():
-                Z_embedded[lang] = np.hstack((lZ[lang], l_weighted_em[lang]))
-            lZ = Z_embedded
+                lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang]))
+            # lZ = Z_embedded
 
         for lang in lZ.keys():
             print(lZ[lang].shape)
diff --git a/src/transformers/StandardizeTransformer.py b/src/transformers/StandardizeTransformer.py
index 45921b7..e776db7 100644
--- a/src/transformers/StandardizeTransformer.py
+++ b/src/transformers/StandardizeTransformer.py
@@ -12,7 +12,7 @@ class StandardizeTransformer:
         self.std = np.clip(std, 1e-5, None)
         self.mean = np.mean(X, axis=self.axis)
         self.yetfit=True
-        print('done')
+        print('done\n')
         return self
 
     def predict(self, X):
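
Note on the refactor above: the per-language embedding logic moves from the inlined embed() method into data.embeddings.StorageEmbeddings (fit/predict), and the label-space reduction ('PCA', 'TSNE' or 'tSVD') together with max_label_space is now routed through the config dict. The following is a minimal usage sketch of the intended call sequence, not part of the patch; lXtr, lytr and lang_word2idx (tf-idf document matrices, label matrices and {term: index} vocabularies keyed by language) are assumed placeholders for illustration.

# Hypothetical sketch of the StorageEmbeddings flow introduced in this patch.
# lXtr / lytr / lang_word2idx are assumed to be dicts keyed by language code.
from data.embeddings import StorageEmbeddings

config = {'unsupervised': True,    # load aligned MUSE/FastText vectors (U matrix per language)
          'supervised': True,      # compute word-class embeddings (S matrix per language)
          'we_type': 'MUSE',
          'reduction': 'tSVD',     # 'PCA' | 'TSNE' | 'tSVD', applied when n_labels > max_label_space
          'max_label_space': 50}

embedding_space = StorageEmbeddings('../embeddings/').fit(config, lXtr, lang_word2idx, lytr)

# predict() returns one matrix per language: X.dot(U) hstacked with X.dot(S)
# when both views are enabled, otherwise the single enabled projection.
doc_embeddings = embedding_space.predict(config, lXtr)
for lang, matrix in doc_embeddings.items():
    print(lang, matrix.shape)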