diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py
index 4decdf6..1618c33 100644
--- a/src/FPEC_andrea.py
+++ b/src/FPEC_andrea.py
@@ -1,4 +1,4 @@
-import os, sys
+import os
 from dataset_builder import MultilingualDataset
 from learning.learners import *
 from util.evaluation import *
@@ -11,32 +11,46 @@ from sklearn.svm import SVC
 parser = OptionParser()
 
 parser.add_option("-d", "--dataset", dest="dataset",
-                  help="Path to the multilingual dataset processed and stored in .pickle format")
+                  help="Path to the multilingual dataset processed and stored in .pickle format",
+                  default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
 
 parser.add_option("-o", "--output", dest="output",
                   help="Result file", type=str, default='./results/results.csv')
 
 parser.add_option("-e", "--mode-embed", dest="mode_embed",
-                  help="Set the embedding to be used [none, pretrained, supervised, both]", type=str, default='none')
+                  help="Set the embedding to be used [none, unsupervised, supervised, both]", type=str, default='none')
 
 parser.add_option("-w", "--we-path", dest="we_path",
-                  help="Path to the polylingual word embeddings", default='../embeddings/')
+                  help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/')
+
+parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str,
+                  default='MUSE')
 
 parser.add_option("-s", "--set_c", dest="set_c",type=float,
                   help="Set the C parameter", default=1)
 
 parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
-                  help="Optimices hyperparameters", default=False)
+                  help="Optimize hyperparameters", default=False)
 
 parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,
                   help="Number of parallel jobs (default is -1, all)", default=-1)
 
+parser.add_option("-p", "--pca", dest="max_labels", type=int,
+                  help="If less than number of target classes, will apply PCA to supervised matrix. If set to 0 it"
+                       " will automatically search for the best number of components", default=300)
+
+parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
+                  help="If smaller than Unsupervised Dimension, will apply PCA to unsupervised matrix. If set to 0 it"
+                       " will automatically search for the best number of components", default=300)
+
+parser.add_option("-l", dest="lang", type=str)
+
 
 def get_learner(calibrate=False, kernel='linear'):
-    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced')
+    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto')
 
 
-def get_params(dense=False):  # TODO kernel function could be useful for meta-classifier
+def get_params(dense=False):
     if not op.optimc:
         return None
     c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
@@ -47,7 +61,6 @@ def get_params(dense=False):  # TODO kernel function could be useful for meta-
 
 if __name__ == '__main__':
-
     (op, args) = parser.parse_args()
 
     assert exists(op.dataset), 'Unable to find file '+str(op.dataset)
@@ -60,7 +73,9 @@ if __name__ == '__main__':
 
     data = MultilingualDataset.load(op.dataset)
     data.show_dimensions()
-    # data.set_view(languages=['en','it'], categories=list(range(10)))
+    data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10)))
+    # data.set_view(languages=[op.lang])
+    # data.set_view(categories=list(range(10)))
 
     lXtr, lytr = data.training()
     lXte, lyte = data.test()
@@ -72,30 +87,42 @@ if __name__ == '__main__':
 
     # Embeddings and WCE config
     _available_mode = ['none', 'unsupervised', 'supervised', 'both']
-    assert op.mode_embed in _available_mode , f'{op.mode_embed} not in {_available_mode}'
+    _available_type = ['MUSE', 'FastText']
+    assert op.mode_embed in _available_mode, f'{op.mode_embed} not in {_available_mode}'
+    assert op.we_type in _available_type, f'{op.we_type} not in {_available_type}'
 
     if op.mode_embed == 'none':
         config = {'unsupervised': False,
-                  'supervised': False}
+                  'supervised': False,
+                  'we_type': None}
         _config_id = 'None'
     elif op.mode_embed == 'unsupervised':
         config = {'unsupervised': True,
-                  'supervised': False}
+                  'supervised': False,
+                  'we_type': op.we_type}
         _config_id = 'M'
     elif op.mode_embed == 'supervised':
         config = {'unsupervised': False,
-                  'supervised': True}
+                  'supervised': True,
+                  'we_type': None}
         _config_id = 'F'
     elif op.mode_embed == 'both':
         config = {'unsupervised': True,
-                  'supervised': True}
+                  'supervised': True,
+                  'we_type': op.we_type}
         _config_id = 'M_and_F'
 
+    ##### TODO - config dict is redundant - we have already op argparse ...
+    config['reduction'] = 'PCA'
+    config['max_label_space'] = op.max_labels
+    config['dim_reduction_unsupervised'] = op.max_labels_U
+    # config['plot_covariance_matrices'] = True
+
     result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
 
     print(f'### PolyEmbedd_andrea_{_config_id}\n')
-    classifier = AndreaCLF(op.we_path,
-                           config,
+    classifier = AndreaCLF(we_path=op.we_path,
+                           config=config,
                            first_tier_learner=get_learner(calibrate=True),
                            meta_learner=get_learner(calibrate=False, kernel='rbf'),
                            first_tier_parameters=get_params(dense=False),
@@ -105,7 +132,7 @@ if __name__ == '__main__':
 
     print('# Fitting ...')
     classifier.fit(lXtr, lytr)
-    print('# Evaluating ...')
+    print('\n# Evaluating ...')
     l_eval = evaluate_method(classifier, lXte, lyte)
 
     metrics = []
@@ -113,6 +140,6 @@ if __name__ == '__main__':
         macrof1, microf1, macrok, microk = l_eval[lang]
         metrics.append([macrof1, microf1, macrok, microk])
         print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
-        results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, op.optimc, op.dataset.split('/')[-1],
-                        'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, 'nope')
+        results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, config['we_type'], op.optimc, op.dataset.split('/')[-1],
+                        classifier.time, lang, macrof1, microf1, macrok, microk, '')
     print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
diff --git a/src/data/embeddings.py b/src/data/embeddings.py
index 58a0b64..50c09de 100644
--- a/src/data/embeddings.py
+++ b/src/data/embeddings.py
@@ -1,10 +1,10 @@
 import os
 import pickle
-import numpy as np
 from torchtext.vocab import Vectors
 import torch
 from abc import ABC, abstractmethod
 from data.supervised import get_supervised_embeddings
+from util.decompositions import *
 
 
 class PretrainedEmbeddings(ABC):
@@ -110,10 +110,10 @@ class WordEmbeddings:
         # vocabulary is a set of terms to be kept
         active_vocabulary = sorted([w for w in vocabulary if w in self.worddim])
         lost = len(vocabulary)-len(active_vocabulary)
-        if lost>0: #some termr are missing, so it will be replaced by UNK
+        if lost > 0:  # some terms are missing, so it will be replaced by UNK
             print('warning: missing {} terms for lang {}'.format(lost, self.lang))
         self.we = self.get_vectors(active_vocabulary)
-        assert self.we.shape[0]==len(active_vocabulary)
+        assert self.we.shape[0] == len(active_vocabulary)
         self.dimword={i:w for i,w in enumerate(active_vocabulary)}
         self.worddim={w:i for i,w in enumerate(active_vocabulary)}
         return self
@@ -132,12 +132,12 @@ class WordEmbeddings:
             'instances of {} expected'.format(WordEmbeddings.__name__)
 
         polywe = []
-        worddim={}
-        offset=0
+        worddim = {}
+        offset = 0
         for we in we_list:
             polywe.append(we.we)
             worddim.update({'{}::{}'.format(we.lang, w):d+offset for w,d in we.worddim.items()})
-            offset=len(worddim)
+            offset = len(worddim)
 
         polywe = np.vstack(polywe)
         return WordEmbeddings(lang='poly', we=polywe, worddim=worddim)
@@ -147,16 +147,41 @@ class FastTextWikiNews(Vectors):
 
     url_base = 'Cant auto-download MUSE embeddings'
     path = '/storage/andrea/FUNNELING/embeddings/wiki.multi.{}.vec'
-    _name = 'wiki.multi.{}.vec'
+    _name = '/embeddings/wiki.multi.{}.vec'
 
     def __init__(self, cache, language="en", **kwargs):
         url = self.url_base.format(language)
         # name = self.path.format(language)
         name = cache + self._name.format(language)
-        # print(f'\n\nFASTEXTWIKI-NEW CLASS:\nurl = {url}\nname = {name}\ncache {cache}\nlanguage = {language}')
         super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)
 
 
+class EmbeddingsAligned(Vectors):
+
+    def __init__(self, type, path, lang, voc):
+        # todo - rewrite as relative path
+        self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec'
+        self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT'
+        self.path = path + self.name.format(lang)
+        assert os.path.exists(path), f'pre-trained vectors not found in {path}'
+        super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path)
+        self.vectors = self.extract(voc)
+
+    def vocabulary(self):
+        return set(self.stoi.keys())
+
+    def extract(self, words):
+        source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi)
+        extraction = torch.zeros((len(words), self.dim))
+        extraction[source_idx] = self.vectors[target_idx]
+        return extraction
+
+    def reduce(self, dim):
+        pca = PCA(n_components=dim)
+        self.vectors = pca.fit_transform(self.vectors)
+        return
+
+
 class FastTextMUSE(PretrainedEmbeddings):
 
     def __init__(self, path, lang, limit=None):
@@ -164,7 +189,6 @@ class FastTextMUSE(PretrainedEmbeddings):
         print(f'Loading fastText pretrained vectors from {path}')
         assert os.path.exists(path), print(f'pre-trained vectors not found in {path}')
         self.embed = FastTextWikiNews(path, lang, max_vectors=limit)
-        # print('Done')
 
     def vocabulary(self):
         return set(self.embed.stoi.keys())
@@ -179,21 +203,76 @@ class FastTextMUSE(PretrainedEmbeddings):
         return extraction
 
 
-def embedding_matrix(path, voc, lang):
-    vocabulary = np.asarray(list(zip(*sorted(voc.items(), key=lambda x:x[1])))[0])
+class StorageEmbeddings:
+    def __init__(self, path):
+        self.path = path
+        self.lang_U = dict()
+        self.lang_S = dict()
 
-    print('[embedding matrix]')
-    print(f'# [pretrained-matrix: FastTextMUSE {lang}]')
-    pretrained = FastTextMUSE(path, lang)
-    P = pretrained.extract(vocabulary).numpy()
-    del pretrained
-    print(f'[embedding matrix done] of shape={P.shape}\n')
+    def _add_embeddings_unsupervised(self, type, docs, vocs, max_label_space=300):
+        for lang in docs.keys():
+            print(f'# [unsupervised-matrix {type}] for {lang}')
+            voc = np.asarray(list(zip(*sorted(vocs[lang].items(), key=lambda x: x[1])))[0])
+            self.lang_U[lang] = EmbeddingsAligned(type, self.path, lang, voc).vectors
+            # number of columns of the unsupervised matrix, used below to decide whether PCA is needed
+            nC = self.lang_U[lang].shape[1]
+            # if self.lang_U[lang].shape[1] > dim != 0:
+            #     print(f'unsupervised matrix has more dimensions ({self.lang_U[lang].shape[1]}) than'
+            #           f' the allowed limit {dim}. Applying PCA(n_components={dim})')
+            print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n')
+        if max_label_space == 0:
+            print(f'Computing optimal number of PCA components along matrices U')
+            optimal_n = get_optimal_dim(self.lang_U, 'U')
+            self.lang_U = run_pca(optimal_n, self.lang_U)
+        elif max_label_space < nC:
+            self.lang_U = run_pca(max_label_space, self.lang_U)
-    return vocabulary, P
+        return
+
+    def _add_embeddings_supervised(self, docs, labels, reduction, max_label_space, voc):
+        # if max_label_space == 0:
+        #     print('Computing optimal number of PCA components along matrices S...')
+        #     optimal_n = self.get_optimal_supervised_components(docs, labels)
+        #     max_label_space = optimal_n
+        for lang in docs.keys():  # compute supervised matrices S - then apply PCA
+            print(f'# [supervised-matrix] for {lang}')
+            self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, voc[lang], lang)
+            # number of columns of the supervised matrix, used below to decide whether PCA is needed
+            nC = self.lang_S[lang].shape[1]
+            print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')
+
+        if max_label_space == 0:
+            optimal_n = get_optimal_dim(self.lang_S, 'S')
+            self.lang_S = run_pca(optimal_n, self.lang_S)
+        elif max_label_space < nC:
+            self.lang_S = run_pca(max_label_space, self.lang_S)
+
+        return
+
+    def _concatenate_embeddings(self, docs):
+        _r = dict()
+        for lang in self.lang_U.keys():
+            _r[lang] = np.hstack((docs[lang].dot(self.lang_U[lang]), docs[lang].dot(self.lang_S[lang])))
+        return _r
+
+    def fit(self, config, docs, vocs, labels):
+        if config['unsupervised']:
+            self._add_embeddings_unsupervised(config['we_type'], docs, vocs, config['dim_reduction_unsupervised'])
+        if config['supervised']:
+            self._add_embeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs)
+        return self
 
-def WCE_matrix(Xtr, Ytr, lang):
-    print('\n# [supervised-matrix]')
-    S = get_supervised_embeddings(Xtr[lang], Ytr[lang], max_label_space=50)
-    print(f'[embedding matrix done] of shape={S.shape}\n')
-    return S
+    def predict(self, config, docs):
+        if config['supervised'] and config['unsupervised']:
+            return self._concatenate_embeddings(docs)
+        elif config['supervised']:
+            _r = dict()
+            for lang in docs.keys():
+                _r[lang] = docs[lang].dot(self.lang_S[lang])
+        else:
+            _r = dict()
+            for lang in docs.keys():
+                _r[lang] = docs[lang].dot(self.lang_U[lang])
+        return _r
+
diff --git a/src/data/supervised.py b/src/data/supervised.py
index 4ed7f59..b1faa2d 100755
--- a/src/data/supervised.py
+++ b/src/data/supervised.py
@@ -1,7 +1,7 @@
 from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square
-# from util.common import *
-from sklearn.decomposition import PCA
 import numpy as np
+# from sklearn.decomposition import PCA
+# from sklearn.manifold import TSNE
 
 
 def zscores(x, axis=0): #scipy.stats.zscores does not avoid division by 0, which can indeed occur
@@ -41,12 +41,10 @@ def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents=
     return F
 
 
-def get_supervised_embeddings(X, Y, max_label_space=300, binary_structural_problems=-1, method='dotn', dozscore=True):
-    print('computing supervised embeddings...')
-
+def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, lang='None', binary_structural_problems=-1, method='dotn', dozscore=True):
+    if max_label_space != 0:
+        print('computing supervised embeddings...')
     nC = Y.shape[1]
-    if nC==2 and binary_structural_problems > nC:
-        raise ValueError('not implemented in this branch')
 
     if method=='ppmi':
         F = supervised_embeddings_ppmi(X, Y)
@@ -60,14 +58,41 @@ def get_supervised_embeddings(X, Y, max_label_space=300, binary_structural_probl
     if dozscore:
         F = zscores(F, axis=0)
 
-    if nC > max_label_space:
-        print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
-              f'Applying PCA(n_components={max_label_space})')
-        pca = PCA(n_components=max_label_space)
-        F = pca.fit(F).transform(F)
+    # Dumping F-matrix for further studies
+    dump_it = False
+    if dump_it:
+        with open(f'/home/andreapdr/funneling_pdr/src/dumps/WCE_{lang}.tsv', 'w') as outfile:
+            np.savetxt(outfile, F, delimiter='\t')
+        with open(f'/home/andreapdr/funneling_pdr/src/dumps/dict_WCE_{lang}.tsv', 'w') as outfile:
+            for token in voc.keys():
+                outfile.write(token+'\n')
 
     return F
+
+    # if nC >= max_label_space:
+    #     if reduction == 'PCA':
+    #         if max_label_space == 0:
+    #             pca = PCA(n_components=Y.shape[1])
+    #             pca = pca.fit(F)
+    #             return pca.explained_variance_ratio_
+    #
+    #         print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
+    #               f'Applying PCA(n_components={max_label_space})')
+    #         pca = PCA(n_components=max_label_space)
+    #         F = pca.fit_transform(F)
+    #     elif reduction == 'TSNE':
+    #         print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
+    #               f'Applying t-SNE(n_components={max_label_space})')
+    #         tsne = TSNE(n_components=max_label_space)
+    #         F = tsne.fit_transform(F)
+    #     elif reduction == 'tSVD':
+    #         print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
+    #               f'Applying truncatedSVD(n_components={max_label_space})')
+    #         tSVD = TruncatedSVD(n_components=max_label_space)
+    #         F = tSVD.fit_transform(F)
+    #
+    #     return F
+
diff --git a/src/dataset_builder.py b/src/dataset_builder.py
index 3f6732c..9af7b3f 100644
--- a/src/dataset_builder.py
+++ b/src/dataset_builder.py
@@ -11,6 +11,8 @@ import numpy as np
 from sklearn.model_selection import train_test_split
 from scipy.sparse import issparse
 import itertools
+from tqdm import tqdm
+import re
 
 
 class MultilingualDataset:
@@ -73,10 +75,14 @@ class MultilingualDataset:
         return self.lXte(), self.lYte()
 
     def lXtr(self):
-        return {lang:Xtr for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
+        return {lang: Xtr for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items() if
+                lang in self.langs()}
+        # return {lang:self.mask_numbers(Xtr) for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
 
     def lXte(self):
-        return {lang:Xte for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()}
+        return {lang: Xte for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if
+                lang in self.langs()}
+        # return {lang:self.mask_numbers(Xte) for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()}
 
     def lYtr(self):
         return {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
@@ -129,6 +135,13 @@ class MultilingualDataset:
     def set_labels(self, labels):
         self.labels = labels
 
+    def mask_numbers(self, data, number_mask='numbermask'):
+        mask = re.compile(r'\b[0-9][0-9.,-]*\b')
+        masked = []
+        for text in tqdm(data, desc='masking numbers'):
+            masked.append(mask.sub(number_mask, text))
+        return masked
+
 # ----------------------------------------------------------------------------------------------------------------------
 # Helpers
diff --git a/src/learning/learners.py b/src/learning/learners.py
index 5a8f07e..96e200c 100644
--- a/src/learning/learners.py
+++ b/src/learning/learners.py
@@ -1,15 +1,14 @@
 import numpy as np
 import time
-from data.embeddings import WordEmbeddings, embedding_matrix, WCE_matrix
+from data.embeddings import WordEmbeddings, StorageEmbeddings
 from scipy.sparse import issparse
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.model_selection import GridSearchCV
 from sklearn.model_selection import KFold
 from joblib import Parallel, delayed
 from sklearn.feature_extraction.text import TfidfVectorizer
-
-from data.supervised import zscores
 from transformers.StandardizeTransformer import StandardizeTransformer
+from sklearn.decomposition import PCA
 
 
 def _sort_if_sparse(X):
@@ -444,7 +443,8 @@ class AndreaCLF(FunnellingPolylingualClassifier):
                  first_tier_parameters=None,
                  meta_parameters=None,
                  folded_projections=1,
-                 calmode='cal', n_jobs=-1):
+                 calmode='cal',
+                 n_jobs=-1):
 
         super().__init__(first_tier_learner,
                          meta_learner,
@@ -454,13 +454,13 @@ class AndreaCLF(FunnellingPolylingualClassifier):
                          calmode,
                          n_jobs)
 
+        self.pca_independent_space = PCA(n_components=100)
         self.we_path = we_path
        self.config = config
         self.lang_word2idx = dict()
         self.languages = []
         self.lang_tfidf = {}
-        self.word_embeddings = {}
-        self.supervised_embeddings = {}
+        self.embedding_space = None
         self.model = None
         self.time = None
 
@@ -479,9 +479,8 @@ class AndreaCLF(FunnellingPolylingualClassifier):
             self.languages.append(lang)
             tfidf_vectorizer.fit(lX[lang])
             lX[lang] = tfidf_vectorizer.transform(lX[lang])
-            _sort_if_sparse(lX[lang])
             self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
-            self.lang_tfidf[lang] = tfidf_vectorizer  # utile in fase di testing
+            self.lang_tfidf[lang] = tfidf_vectorizer
         return self
 
     # @override std class method
@@ -494,45 +493,6 @@ class AndreaCLF(FunnellingPolylingualClassifier):
 
         return lZ, lYtr
 
-    def embed(self, lX, ly, unsupervised=False, supervised=False, prediction=False):
-        """
-        build embedding matrix for given language and returns its weighted sum wrt tf-idf score
-        """
-        _r = dict()
-        languages = list(lX.keys())
-
-        if prediction:
-            for lang in languages:
-                if unsupervised:    # If unsupervised embeddings ...
-                    M = self.word_embeddings[lang]
-                    if supervised:  # and also unsupervised --> get both (M) and (S) weighted sum matrices and hstack them
-                        S = self.supervised_embeddings[lang]
-                        _r[lang] = np.hstack((lX[lang].dot(M), lX[lang].dot(S)))
-                        continue
-                    _r[lang] = lX[lang].dot(M)  # if not supervised --> just get weighted sum of unsupervised (M) embeddings
-                else:   # If not unsupervised --> get (S) matrix and its weighted sum
-                    S = self.supervised_embeddings[lang]
-                    _r[lang] = lX[lang].dot(S)
-            return _r
-
-        if unsupervised:
-            for lang in languages:
-                # print('Test building embedding matrix FastTextMuse ...')
-                _, M = embedding_matrix(self.we_path, self.lang_word2idx[lang], lang)
-                self.word_embeddings[lang] = M
-                _r[lang] = lX[lang].dot(M)
-
-        if supervised:
-            for lang in languages:
-                S = WCE_matrix(lX, ly, lang)
-                # S = np.squeeze(np.asarray(S))   # casting to ndarray to better visualize S while debugging
-                self.supervised_embeddings[lang] = S
-                if unsupervised:
-                    _r[lang] = np.hstack((_r[lang], lX[lang].dot(S)))
-                else:
-                    _r[lang] = lX[lang].dot(S)
-        return _r
-
     # @override std class method
     def fit(self, lX, ly):
         tinit = time.time()
@@ -545,24 +505,22 @@ class AndreaCLF(FunnellingPolylingualClassifier):
 
         Z, zy = self._get_zspace(lX, ly)
 
         if self.config['supervised'] or self.config['unsupervised']:
-            # Z vectors is concatenated with doc's embedding weighted sum
-            Z_embedded = dict()
-            l_weighted_em = self.embed(lX, ly,
-                                       unsupervised=self.config['unsupervised'],
-                                       supervised=self.config['supervised'])
-
-            # stacking Z space horizontally with unsupervised (M) and/or supervised (F) embeddings
-            for lang in list(lX.keys()):
-                Z_embedded[lang] = np.hstack((Z[lang], l_weighted_em[lang]))
-            Z = Z_embedded
-
+            self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly)
+            _embedding_space = self.embedding_space.predict(self.config, lX)
+            # h_stacking posterior probabilities with (U) and/or (S) matrices
+            for lang in self.languages:
+                Z[lang] = np.hstack((Z[lang], _embedding_space[lang]))
 
         # stacking Z space vertically
         _vertical_Z = np.vstack([Z[lang] for lang in self.languages])
         _vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
 
+        # todo testing ...
+        # self.pca_independent_space.fit(_vertical_Z)
+        # _vertical_Z = self.pca_independent_space.transform(_vertical_Z)
+
         self.standardizer = StandardizeTransformer()
-        _vertical_Z = self.standardizer.fit_predict(_vertical_Z)
+        _vertical_Z = self.standardizer.fit_predict(_vertical_Z)
 
         print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
         self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
@@ -577,17 +535,15 @@ class AndreaCLF(FunnellingPolylingualClassifier):
         lZ = self._projection(self.doc_projector, lX)
 
         if self.config['supervised'] or self.config['unsupervised']:
-            l_weighted_em = self.embed(lX, ly,
-                                       unsupervised=self.config['unsupervised'],
-                                       supervised=self.config['supervised'],
-                                       prediction=True)
-            Z_embedded = dict()
+            _embedding_space = self.embedding_space.predict(self.config, lX)
+
             for lang in lX.keys():
-                Z_embedded[lang] = np.hstack((lZ[lang], l_weighted_em[lang]))
-            lZ = Z_embedded
 
+                lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang]))
 
         for lang in lZ.keys():
             print(lZ[lang].shape)
+            # todo testing
+            # lZ[lang] = self.pca_independent_space.transform(lZ[lang])
             lZ[lang] = self.standardizer.predict(lZ[lang])
 
         return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
diff --git a/src/results/results.csv b/src/results/results.csv
deleted file mode 100644
index 783225c..0000000
--- a/src/results/results.csv
+++ /dev/null
@@ -1,7 +0,0 @@
-id method learner embed optimp dataset binary languages time lang macrof1 microf1 macrok microk notes
-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 da 0.7933333333333333 0.0 0.7933333333333333 0.0 nope
-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 en 0.7866666666666666 0.0 0.7927111111111111 -0.0003376325207643527 nope
-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 fr 0.7866666666666666 0.0 0.7930666666666667 -0.0001350530083057411 nope
-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 da 0.7933333333333333 0.0 0.7933333333333333 0.0 nope
-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 en 0.7933333333333333 0.0 0.7931111111111111 -0.00013505300830574107 nope
-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 fr 0.7933333333333333 0.0 0.7933333333333333 0.0 nope
diff --git a/src/transformers/StandardizeTransformer.py b/src/transformers/StandardizeTransformer.py
index 381d6c1..e776db7 100644
--- a/src/transformers/StandardizeTransformer.py
+++ b/src/transformers/StandardizeTransformer.py
@@ -12,7 +12,7 @@ class StandardizeTransformer:
         self.std = np.clip(std, 1e-5, None)
         self.mean = np.mean(X, axis=self.axis)
         self.yetfit=True
-        print('done')
+        print('done\n')
         return self
 
     def predict(self, X):
@@ -20,4 +20,4 @@ class StandardizeTransformer:
         return (X - self.mean) / self.std
 
     def fit_predict(self, X):
-        return self.fit(X).predict(X)
\ No newline at end of file
+        return self.fit(X).predict(X)
diff --git a/src/util/decompositions.py b/src/util/decompositions.py
new file mode 100644
index 0000000..9029b33
--- /dev/null
+++ b/src/util/decompositions.py
@@ -0,0 +1,49 @@
+from sklearn.decomposition import PCA
+import numpy as np
+import matplotlib.pyplot as plt
+
+def run_pca(dim, X):
+    """
+    :param dim: number of pca components to keep
+    :param X: dictionary str(lang): matrix
+    :return: dict lang: reduced matrix
+    """
+    r = dict()
+    pca = PCA(n_components=dim)
+    for lang in X.keys():
+        r[lang] = pca.fit_transform(X[lang])
+    return r
+
+
+def get_optimal_dim(X, embed_type):
+    """
+    :param X: dict str(lang) : csr_matrix of embeddings unsupervised or supervised
+    :param embed_type: (str) embedding matrix type: S or U (WCE supervised or U unsupervised MUSE/FASTTEXT)
+    :return:
+    """
+    _idx = []
+
+    plt.figure(figsize=(15, 10))
+    if embed_type == 'U':
+        plt.title(f'Unsupervised Embeddings {"TODO"} Explained Variance')
+    else:
+        plt.title(f'WCE Explained Variance')
+    plt.xlabel('Number of Components')
+    plt.ylabel('Variance (%)')
+
+    for lang in X.keys():
+        pca = PCA(n_components=X[lang].shape[1])
+        pca.fit(X[lang])
+        _r = pca.explained_variance_ratio_
+        _r = np.cumsum(_r)
+        plt.plot(_r, label=lang)
+        for i in range(len(_r) - 1, 1, -1):
+            delta = _r[i] - _r[i - 1]
+            if delta > 0:
+                _idx.append(i)
+                break
+    best_n = max(_idx)
+    plt.axvline(best_n, color='r', label='optimal N')
+    plt.legend()
+    plt.show()
+    return best_n
\ No newline at end of file
diff --git a/src/util/results.py b/src/util/results.py
index 43529b4..7c25bec 100644
--- a/src/util/results.py
+++ b/src/util/results.py
@@ -5,7 +5,7 @@ import numpy as np
 class PolylingualClassificationResults:
     def __init__(self, file, autoflush=True, verbose=False):
         self.file = file
-        self.columns = ['id', 'method', 'learner', 'embed', 'optimp', 'dataset', 'binary', 'languages', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
+        self.columns = ['id', 'method', 'learner', 'embed', 'embed_type', 'optimp', 'dataset', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
         self.autoflush = autoflush
         self.verbose = verbose
         if os.path.exists(file):
@@ -20,8 +20,8 @@ class PolylingualClassificationResults:
     def already_calculated(self, id):
        return (self.df['id'] == id).any()
 
-    def add_row(self, id, method, learner, embed, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
-        s = pd.Series([id, method, learner, embed, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
+    def add_row(self, id, method, learner, embed, embed_type, optimp, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
+        s = pd.Series([id, method, learner, embed, embed_type, optimp, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
         self.df = self.df.append(s, ignore_index=True)
         if self.autoflush: self.flush()
         self.tell(s.to_string())
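
Example usage (not part of the patch): a minimal sketch of how the reworked pipeline is driven after this change, mirroring the flow in src/FPEC_andrea.py. The dataset and embeddings paths are placeholders, and the config dict reproduces what the script now builds from the -e/-t/-p/-u options; SVC settings copy the defaults used by get_learner.

    from dataset_builder import MultilingualDataset
    from learning.learners import AndreaCLF
    from util.evaluation import *
    from sklearn.svm import SVC

    # config as assembled by FPEC_andrea.py for "-e both -t MUSE -p 300 -u 300"
    config = {'unsupervised': True, 'supervised': True, 'we_type': 'MUSE',
              'reduction': 'PCA', 'max_label_space': 300, 'dim_reduction_unsupervised': 300}

    data = MultilingualDataset.load('/path/to/dataset.pickle')      # placeholder path
    lXtr, lytr = data.training()
    lXte, lyte = data.test()

    classifier = AndreaCLF(we_path='/path/to/aligned-embeddings/',  # placeholder path
                           config=config,
                           first_tier_learner=SVC(kernel='linear', probability=True, cache_size=1000,
                                                  C=1, random_state=1, class_weight='balanced', gamma='auto'),
                           meta_learner=SVC(kernel='rbf', probability=False, cache_size=1000,
                                            C=1, random_state=1, class_weight='balanced', gamma='auto'),
                           first_tier_parameters=None,
                           meta_parameters=None,
                           n_jobs=-1)
    # fit learns the per-language tf-idf spaces, the posterior-probability (Z) space,
    # and the StorageEmbeddings (U and/or S) matrices, then trains the meta-classifier
    classifier.fit(lXtr, lytr)
    l_eval = evaluate_method(classifier, lXte, lyte)  # per-language (macro-F1, micro-F1, macro-K, micro-K)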