From f2083bf22a734df8390e37bce7df7b782012c5da Mon Sep 17 00:00:00 2001
From: andrea
Date: Sat, 30 Nov 2019 19:14:51 +0100
Subject: [PATCH 01/10] reworked the unsupervised (aligned) embeddings loader
 method and class (fastText and MUSE); new option arg -t ['MUSE', 'FastText'];
 uploaded /results/results.csv (on rcv1 ... run0.pickle), obtained on all
 available setups. TODO: refactor it also as a standalone class with its own
 load/weighted-sum/extract/reduce methods.

---
 src/FPEC_andrea.py                         | 29 +++++++----
 src/data/embeddings.py                     | 32 ++++++++++--
 src/learning/learners.py                   | 14 ++---
 src/results/results.csv                    | 60 +++++++++++++++++++---
 src/transformers/StandardizeTransformer.py |  2 +-
 5 files changed, 107 insertions(+), 30 deletions(-)

diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py
index 4decdf6..7092d2b 100644
--- a/src/FPEC_andrea.py
+++ b/src/FPEC_andrea.py
@@ -17,11 +17,14 @@
 parser.add_option("-o", "--output", dest="output",
                   help="Result file", type=str, default='./results/results.csv')

 parser.add_option("-e", "--mode-embed", dest="mode_embed",
-                  help="Set the embedding to be used [none, pretrained, supervised, both]", type=str, default='none')
+                  help="Set the embedding to be used [none, unsupervised, supervised, both]", type=str, default='none')

 parser.add_option("-w", "--we-path", dest="we_path",
                   help="Path to the polylingual word embeddings", default='../embeddings/')

+parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str,
+                  default='FastText')
+
 parser.add_option("-s", "--set_c", dest="set_c",type=float,
                   help="Set the C parameter", default=1)

@@ -36,7 +39,7 @@
 def get_learner(calibrate=False, kernel='linear'):
     return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced')

-def get_params(dense=False):  # TODO kernel function could be useful for meta-classifier
+def get_params(dense=False):
     if not op.optimc:
         return None
     c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
@@ -72,30 +75,36 @@ if __name__ == '__main__':
     # Embeddings and WCE config
     _available_mode = ['none', 'unsupervised', 'supervised', 'both']
-    assert op.mode_embed in _available_mode , f'{op.mode_embed} not in {_available_mode}'
+    _available_type = ['MUSE', 'FastText']
+    assert op.mode_embed in _available_mode, f'{op.mode_embed} not in {_available_mode}'
+    assert op.we_type in _available_type, f'{op.we_type} not in {_available_type}'

     if op.mode_embed == 'none':
         config = {'unsupervised': False,
-                  'supervised': False}
+                  'supervised': False,
+                  'we_type': None}
         _config_id = 'None'
     elif op.mode_embed == 'unsupervised':
         config = {'unsupervised': True,
-                  'supervised': False}
+                  'supervised': False,
+                  'we_type': op.we_type}
         _config_id = 'M'
     elif op.mode_embed == 'supervised':
         config = {'unsupervised': False,
-                  'supervised': True}
+                  'supervised': True,
+                  'we_type': None}
         _config_id = 'F'
     elif op.mode_embed == 'both':
         config = {'unsupervised': True,
-                  'supervised': True}
+                  'supervised': True,
+                  'we_type': op.we_type}
         _config_id = 'M_and_F'

     result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
     print(f'### PolyEmbedd_andrea_{_config_id}\n')

-    classifier = AndreaCLF(op.we_path,
-                           config,
+    classifier = AndreaCLF(we_path=op.we_path,
+                           config=config,
                            first_tier_learner=get_learner(calibrate=True),
                            meta_learner=get_learner(calibrate=False, kernel='rbf'),
                            first_tier_parameters=get_params(dense=False),
@@ -114,5 +123,5 @@ if __name__ == '__main__':
         metrics.append([macrof1, microf1, macrok, microk])
         print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
         results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, op.optimc, op.dataset.split('/')[-1],
-                        'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, 'nope')
+                        'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, '')
     print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
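The essential operation this patch wires in is mapping each language's documents into the
aligned embedding space by a tf-idf-weighted sum of word vectors. A minimal sketch of that
operation (shapes and variable names here are illustrative, not the repository's exact code):

    import numpy as np
    from scipy.sparse import csr_matrix

    # X: tf-idf document-term matrix (3 documents, 5-term vocabulary)
    X = csr_matrix(np.random.rand(3, 5))
    # P: aligned word-embedding matrix, one row per vocabulary term (dim=4)
    P = np.random.rand(5, 4)
    # each document vector is the tf-idf-weighted sum of its words' embeddings
    doc_embeddings = X.dot(P)   # shape (3, 4)

Because the embeddings of the different languages live in one shared (aligned) space, these
per-language document vectors are directly comparable across languages.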
diff --git a/src/data/embeddings.py b/src/data/embeddings.py
index 0a7aa4c..0598feb 100644
--- a/src/data/embeddings.py
+++ b/src/data/embeddings.py
@@ -147,7 +147,7 @@ class FastTextWikiNews(Vectors):
     url_base = 'Cant auto-download MUSE embeddings'
     path = '/storage/andrea/FUNNELING/embeddings/wiki.multi.{}.vec'
-    _name = 'wiki.multi.{}.vec'
+    _name = '/embeddings/wiki.multi.{}.vec'

     def __init__(self, cache, language="en", **kwargs):
         url = self.url_base.format(language)
@@ -157,6 +157,30 @@ class FastTextWikiNews(Vectors):
         super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)

+class EmbeddingsAligned(Vectors):
+
+    def __init__(self, type, path, lang):
+
+        self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec'
+        # todo - rewrite as relative path
+        self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT'
+        self.path = path + self.name.format(lang)
+        assert os.path.exists(path), f'pre-trained vectors not found in {path}'
+        super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path)
+
+    def vocabulary(self):
+        return set(self.stoi.keys())
+
+    def dim(self):
+        return self.dim
+
+    def extract(self, words):
+        source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi)
+        extraction = torch.zeros((len(words), self.dim))
+        extraction[source_idx] = self.vectors[target_idx]
+        return extraction
+
+
 class FastTextMUSE(PretrainedEmbeddings):

     def __init__(self, path, lang, limit=None):
@@ -179,12 +203,12 @@ class FastTextMUSE(PretrainedEmbeddings):
         return extraction

-def embedding_matrix(path, voc, lang):
+def embedding_matrix(type, path, voc, lang):
     vocabulary = np.asarray(list(zip(*sorted(voc.items(), key=lambda x:x[1])))[0])

     print('[embedding matrix]')
-    print(f'# [pretrained-matrix: FastTextMUSE {lang}]')
-    pretrained = FastTextMUSE(path, lang)
+    print(f'# [pretrained-matrix: {type} {lang}]')
+    pretrained = EmbeddingsAligned(type, path, lang)
     P = pretrained.extract(vocabulary).numpy()
     del pretrained
     print(f'[embedding matrix done] of shape={P.shape}\n')
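The extract method above relies on PretrainedEmbeddings.reindex, which is not shown in this
hunk. A minimal sketch of the behaviour extract presumes (an assumption inferred from how it
is used, not the repository's code): rows for out-of-vocabulary words are simply left at zero.

    import numpy as np

    def reindex(words, stoi):
        # positions in `words` covered by the pretrained vocabulary (source),
        # and the matching row numbers in the pretrained matrix (target)
        pairs = [(i, stoi[w]) for i, w in enumerate(words) if w in stoi]
        source_idx, target_idx = map(np.asarray, zip(*pairs))
        return source_idx, target_idx

    # extract() then scatters only the covered rows:
    #   extraction = torch.zeros((len(words), dim))
    #   extraction[source_idx] = vectors[target_idx]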
diff --git a/src/learning/learners.py b/src/learning/learners.py
index 5a8f07e..d01c734 100644
--- a/src/learning/learners.py
+++ b/src/learning/learners.py
@@ -7,8 +7,6 @@
 from sklearn.model_selection import GridSearchCV
 from sklearn.model_selection import KFold
 from joblib import Parallel, delayed
 from sklearn.feature_extraction.text import TfidfVectorizer
-
-from data.supervised import zscores
 from transformers.StandardizeTransformer import StandardizeTransformer
@@ -444,7 +442,8 @@ class AndreaCLF(FunnellingPolylingualClassifier):
                  first_tier_parameters=None,
                  meta_parameters=None,
                  folded_projections=1,
-                 calmode='cal', n_jobs=-1):
+                 calmode='cal',
+                 n_jobs=-1):

         super().__init__(first_tier_learner,
                          meta_learner,
@@ -479,9 +478,8 @@ class AndreaCLF(FunnellingPolylingualClassifier):
             self.languages.append(lang)
             tfidf_vectorizer.fit(lX[lang])
             lX[lang] = tfidf_vectorizer.transform(lX[lang])
-            _sort_if_sparse(lX[lang])
             self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
-            self.lang_tfidf[lang] = tfidf_vectorizer  # useful at testing time
+            self.lang_tfidf[lang] = tfidf_vectorizer
         return self

     # @override std class method
@@ -517,15 +515,13 @@ class AndreaCLF(FunnellingPolylingualClassifier):
         if unsupervised:
             for lang in languages:
-                # print('Test building embedding matrix FastTextMuse ...')
-                _, M = embedding_matrix(self.we_path, self.lang_word2idx[lang], lang)
+                _, M = embedding_matrix(self.config['we_type'], self.we_path, self.lang_word2idx[lang], lang)
                 self.word_embeddings[lang] = M
                 _r[lang] = lX[lang].dot(M)

         if supervised:
             for lang in languages:
                 S = WCE_matrix(lX, ly, lang)
-                # S = np.squeeze(np.asarray(S))  # casting to ndarray to better visualize S while debugging
                 self.supervised_embeddings[lang] = S
                 if unsupervised:
                     _r[lang] = np.hstack((_r[lang], lX[lang].dot(S)))
@@ -562,7 +558,7 @@
         _vertical_Zy = np.vstack([zy[lang] for lang in self.languages])

         self.standardizer = StandardizeTransformer()
-        _vertical_Z = self.standardizer.fit_predict(_vertical_Z)
+        _vertical_Z = self.standardizer.fit_predict(_vertical_Z)

         print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
         self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
diff --git a/src/results/results.csv b/src/results/results.csv
index 783225c..dbef7b3 100644
--- a/src/results/results.csv
+++ b/src/results/results.csv
@@ -1,7 +1,55 @@
 id method learner embed optimp dataset binary languages time lang macrof1 microf1 macrok microk notes
-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 da 0.7933333333333333 0.0 0.7933333333333333 0.0 nope
-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 en 0.7866666666666666 0.0 0.7927111111111111 -0.0003376325207643527 nope
-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 fr 0.7866666666666666 0.0 0.7930666666666667 -0.0001350530083057411 nope
-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 da 0.7933333333333333 0.0 0.7933333333333333 0.0 nope
-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 en 0.7933333333333333 0.0 0.7931111111111111 -0.00013505300830574107 nope
-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 fr 0.7933333333333333 0.0 0.7933333333333333 0.0 nope
+rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 it 0.5367684112761455 0.7945344129554656 0.5179685773363333 0.7651326488894972 nope
+rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 pt 0.6969974938193201 0.878625134264232 0.6967392557377021 0.8466030321042095 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 sv 0.502213941379271 0.7700107543401444 0.4991078326315248 0.7207899075774371 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 es 0.5817849682843411 0.8448214916931778 0.5849433134898768 0.8202407220651875 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 en 0.5284100314545743 0.7625649913344887 0.4968119038332687 0.7152142337789349 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 da 0.4868904596668941 0.7971705872676427 0.4554442856126113 0.741227149968307 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 nl 0.5470546398570723 0.8276762402088773 0.5177281560038681 0.7850292121533595 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 fr 0.4997574965766772 0.7678434382194935 0.4836027981945328 0.7099957841328215 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 de 0.4220457399934653 0.7444316119452236 0.4256936056238835 0.7167749374918141 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 it 0.5398437760931379 0.8008933172994331 0.5146465197929204 0.7584451610463148 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 pt 0.6975279233747671 0.8779959377115775 0.6911573032014029 0.8392738059784555 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 sv 0.5179339368901748 0.7752035065748278 0.4962165022301373 0.7133720895906155 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 es 0.5745246656272296 
0.8476464247215235 0.5736797442258523 0.8104027280076678 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 en 0.5265892627601801 0.761854398025736 0.4868823643967914 0.7032312369952987 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 da 0.4857267508065667 0.7955911823647295 0.449682467737542 0.7293013090493592 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 nl 0.5461000743929812 0.8304711580801409 0.5139887576564601 0.7790659402231745 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 fr 0.5015991524998897 0.7699748500677114 0.4811739320459739 0.7065159928392686 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 de 0.4141396160516795 0.743810005053057 0.4126132681585116 0.7023983497130937 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 it 0.4810224709403544 0.7617194410047762 0.453310215598049 0.6999032557458222 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 pt 0.6693663195289151 0.8619702956806105 0.6657298472047529 0.8182397742327547 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 sv 0.43107388787211537 0.7126933954416902 0.4180735239763325 0.6168407376537499 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 es 0.5087201120140917 0.8249322493224932 0.5032299168859704 0.7835086748116167 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 en 0.3822498549987095 0.6877811094452774 0.3309945723997902 0.5962925522774631 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 da 0.4517051377915163 0.7658914728682171 0.4030339299921389 0.6806166833916132 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F 
True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 nl 0.4875303727964308 0.7853962600178095 0.4534046979963794 0.7270844266398626 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 fr 0.3750315407356979 0.6999393816932714 0.3628389019101708 0.6136670285424017 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 de 0.355059356514748 0.7046466085098807 0.33834564366266284 0.6299245108196094 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 it 0.4755443069888554 0.7675079985780305 0.4501140447119437 0.7023435117413848 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 pt 0.673303227450142 0.8655002733734279 0.6702445967772233 0.8193963705153853 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 sv 0.4189470089118392 0.7236711786068009 0.4198491651634073 0.6314272037990425 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 es 0.5178080058189616 0.8268359020852222 0.5104336022388637 0.782714898784318 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 en 0.4115752894185112 0.7001869158878504 0.35164720517285003 0.6091191993104883 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 da 0.4437869429842064 0.7626499739175796 0.39704879178312197 0.6717100410826179 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 nl 0.47635948919429705 0.7874471399955486 0.4589309165206792 0.7292337019755739 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 fr 0.39374621795002507 0.7063947733122155 0.3850407928528449 0.6315594797194366 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True 
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 de 0.3539890425069821 0.7095981751184418 0.3512802070446796 0.6432196317592322 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 it 0.5791455159341481 0.8060849214309596 0.6034752340075125 0.7869853576681214 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 pt 0.6403974389994276 0.8803876562101505 0.6565213830246649 0.8497743924811387 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 sv 0.5032337014290953 0.7768595041322314 0.4719549200388494 0.7364733997369779 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 es 0.5200567247634353 0.8529964145466963 0.4908726477090496 0.8285929531854332 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 en 0.512424485488998 0.7533647963642719 0.4719843960571978 0.7044441169169227 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 da 0.5861231569852233 0.8040595842200032 0.5393761149602847 0.7381233055764151 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 nl 0.6072184716496147 0.8335123523093448 0.5845309357041368 0.8020267337813639 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 fr 0.4923294612439038 0.7854697603651578 0.4713782273939219 0.7329001302478475 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 de 0.4709904181031267 0.7457793804294378 0.4465581491449931 0.7046844416244138 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 it 0.575387626645539 0.8064243448858833 0.5958411838194531 0.7790018114269683 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 pt 
0.653004040098633 0.8791937747161628 0.6559210761775208 0.8482450061614855 nope
+rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 sv 0.49944915222086167 0.7789179104477612 0.4604673876743342 0.727778938054739 nope
+rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 es 0.5144474487169811 0.8559087767795439 0.48397711649967695 0.8222692824953204 nope
+rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 en 0.5160737755179508 0.755674709562109 0.45961112517260677 0.6921096138985132 nope
+rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 da 0.5875776383868945 0.8015873015873016 0.5367286265015276 0.7288571047461061 nope
+rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 nl 0.6079883230969934 0.8363004776378636 0.5828217771858487 0.7968282071156207 nope
+rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 fr 0.4966338770370634 0.7860696517412935 0.46250527724325174 0.7292650668002159 nope
+rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 de 0.4675732669000923 0.7479187479187479 0.43767984457683634 0.69653035770654 nope
diff --git a/src/transformers/StandardizeTransformer.py b/src/transformers/StandardizeTransformer.py
index 381d6c1..45921b7 100644
--- a/src/transformers/StandardizeTransformer.py
+++ b/src/transformers/StandardizeTransformer.py
@@ -20,4 +20,4 @@ class StandardizeTransformer:
         return (X - self.mean) / self.std

     def fit_predict(self, X):
-        return self.fit(X).predict(X)
\ No newline at end of file
+        return self.fit(X).predict(X)

From fedc83f84e171f7759720847ca7d0caf8662c6e7 Mon Sep 17 00:00:00 2001
From: andrea
Date: Sat, 30 Nov 2019 19:22:48 +0100
Subject: [PATCH 02/10] added col 'embed_type' to csv results

---
 src/FPEC_andrea.py  | 2 +-
 src/util/results.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py
index 7092d2b..f8edfad 100644
--- a/src/FPEC_andrea.py
+++ b/src/FPEC_andrea.py
@@ -122,6 +122,6 @@ if __name__ == '__main__':
         macrof1, microf1, macrok, microk = l_eval[lang]
         metrics.append([macrof1, microf1, macrok, microk])
         print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
-        results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, op.optimc, op.dataset.split('/')[-1],
+        results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, config['we_type'], op.optimc, op.dataset.split('/')[-1],
                         'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, '')
     print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
diff --git a/src/util/results.py b/src/util/results.py
index 43529b4..22e8021 100644
--- a/src/util/results.py
+++ b/src/util/results.py
@@ -5,7 +5,7 @@ import numpy as np
 class PolylingualClassificationResults:
     def __init__(self, file, autoflush=True, verbose=False):
         self.file = file
-        self.columns = ['id', 'method', 'learner', 'embed', 'optimp', 'dataset', 'binary', 'languages', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
+        self.columns = ['id', 'method', 'learner', 'embed', 'embed_type', 'optimp', 'dataset', 'binary', 'languages', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
         self.autoflush = autoflush
         self.verbose = verbose
         if os.path.exists(file):
@@ -20,8 +20,8 @@ class PolylingualClassificationResults:
     def already_calculated(self, id):
         return (self.df['id'] == id).any()

-    def add_row(self, id, method, learner, embed, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
-        s = pd.Series([id, method, learner, embed, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
+    def add_row(self, id, method, learner, embed, embed_type, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
+        s = pd.Series([id, method, learner, embed, embed_type, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
         self.df = self.df.append(s, ignore_index=True)
         if self.autoflush: self.flush()
         self.tell(s.to_string())
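A hedged usage sketch of the extended logger (the argument values below are illustrative
placeholders, not logged results; positional order follows the new signature: id, method,
learner, embed, embed_type, optimp, dataset, binary, ablation_lang, time, lang, macrof1,
microf1, with macrok/microk/notes optional):

    from util.results import PolylingualClassificationResults

    results = PolylingualClassificationResults('./results/results.csv')
    results.add_row('some_run_id', 'PolyEmbed_andrea', 'svm', 'M', 'MUSE', True,
                    'rcv1-2_run0.pickle', 'not_binary', 'not_ablation',
                    123.4, 'en', 0.52, 0.76)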
From 414e7f151eed6590b20c18035f465692f6be907c Mon Sep 17 00:00:00 2001
From: Andrea Pedrotti
Date: Sat, 30 Nov 2019 19:23:39 +0100
Subject: [PATCH 03/10] Delete results.csv

---
 src/results/results.csv | 55 -----------------------------------------
 1 file changed, 55 deletions(-)
 delete mode 100644 src/results/results.csv

diff --git a/src/results/results.csv b/src/results/results.csv
deleted file mode 100644
index dbef7b3..0000000
--- a/src/results/results.csv
+++ /dev/null
@@ -1,55 +0,0 @@
-id method learner embed optimp dataset binary languages time lang macrof1 microf1 macrok microk notes
-rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 it 0.5367684112761455 0.7945344129554656 0.5179685773363333 0.7651326488894972 nope
-rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 pt 0.6969974938193201 0.878625134264232 0.6967392557377021 0.8466030321042095 nope
-rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 sv 0.502213941379271 0.7700107543401444 0.4991078326315248 0.7207899075774371 nope
-rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 es 0.5817849682843411 0.8448214916931778
0.5849433134898768 0.8202407220651875 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 en 0.5284100314545743 0.7625649913344887 0.4968119038332687 0.7152142337789349 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 da 0.4868904596668941 0.7971705872676427 0.4554442856126113 0.741227149968307 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 nl 0.5470546398570723 0.8276762402088773 0.5177281560038681 0.7850292121533595 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 fr 0.4997574965766772 0.7678434382194935 0.4836027981945328 0.7099957841328215 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 de 0.4220457399934653 0.7444316119452236 0.4256936056238835 0.7167749374918141 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 it 0.5398437760931379 0.8008933172994331 0.5146465197929204 0.7584451610463148 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 pt 0.6975279233747671 0.8779959377115775 0.6911573032014029 0.8392738059784555 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 sv 0.5179339368901748 0.7752035065748278 0.4962165022301373 0.7133720895906155 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 es 0.5745246656272296 0.8476464247215235 0.5736797442258523 0.8104027280076678 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 en 0.5265892627601801 0.761854398025736 0.4868823643967914 0.7032312369952987 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 da 0.4857267508065667 0.7955911823647295 0.449682467737542 0.7293013090493592 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC 
PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 nl 0.5461000743929812 0.8304711580801409 0.5139887576564601 0.7790659402231745 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 fr 0.5015991524998897 0.7699748500677114 0.4811739320459739 0.7065159928392686 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 de 0.4141396160516795 0.743810005053057 0.4126132681585116 0.7023983497130937 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 it 0.4810224709403544 0.7617194410047762 0.453310215598049 0.6999032557458222 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 pt 0.6693663195289151 0.8619702956806105 0.6657298472047529 0.8182397742327547 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 sv 0.43107388787211537 0.7126933954416902 0.4180735239763325 0.6168407376537499 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 es 0.5087201120140917 0.8249322493224932 0.5032299168859704 0.7835086748116167 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 en 0.3822498549987095 0.6877811094452774 0.3309945723997902 0.5962925522774631 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 da 0.4517051377915163 0.7658914728682171 0.4030339299921389 0.6806166833916132 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 nl 0.4875303727964308 0.7853962600178095 0.4534046979963794 0.7270844266398626 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 fr 0.3750315407356979 0.6999393816932714 0.3628389019101708 0.6136670285424017 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 de 0.355059356514748 0.7046466085098807 
0.33834564366266284 0.6299245108196094 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 it 0.4755443069888554 0.7675079985780305 0.4501140447119437 0.7023435117413848 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 pt 0.673303227450142 0.8655002733734279 0.6702445967772233 0.8193963705153853 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 sv 0.4189470089118392 0.7236711786068009 0.4198491651634073 0.6314272037990425 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 es 0.5178080058189616 0.8268359020852222 0.5104336022388637 0.782714898784318 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 en 0.4115752894185112 0.7001869158878504 0.35164720517285003 0.6091191993104883 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 da 0.4437869429842064 0.7626499739175796 0.39704879178312197 0.6717100410826179 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 nl 0.47635948919429705 0.7874471399955486 0.4589309165206792 0.7292337019755739 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 fr 0.39374621795002507 0.7063947733122155 0.3850407928528449 0.6315594797194366 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 de 0.3539890425069821 0.7095981751184418 0.3512802070446796 0.6432196317592322 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 it 0.5791455159341481 0.8060849214309596 0.6034752340075125 0.7869853576681214 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 pt 0.6403974389994276 0.8803876562101505 0.6565213830246649 0.8497743924811387 nope 
-rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 sv 0.5032337014290953 0.7768595041322314 0.4719549200388494 0.7364733997369779 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 es 0.5200567247634353 0.8529964145466963 0.4908726477090496 0.8285929531854332 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 en 0.512424485488998 0.7533647963642719 0.4719843960571978 0.7044441169169227 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 da 0.5861231569852233 0.8040595842200032 0.5393761149602847 0.7381233055764151 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 nl 0.6072184716496147 0.8335123523093448 0.5845309357041368 0.8020267337813639 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 fr 0.4923294612439038 0.7854697603651578 0.4713782273939219 0.7329001302478475 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 de 0.4709904181031267 0.7457793804294378 0.4465581491449931 0.7046844416244138 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 it 0.575387626645539 0.8064243448858833 0.5958411838194531 0.7790018114269683 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 pt 0.653004040098633 0.8791937747161628 0.6559210761775208 0.8482450061614855 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 sv 0.49944915222086167 0.7789179104477612 0.4604673876743342 0.727778938054739 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 es 0.5144474487169811 0.8559087767795439 0.48397711649967695 0.8222692824953204 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True 
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 en 0.5160737755179508 0.755674709562109 0.45961112517260677 0.6921096138985132 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 da 0.5875776383868945 0.8015873015873016 0.5367286265015276 0.7288571047461061 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 nl 0.6079883230969934 0.8363004776378636 0.5828217771858487 0.7968282071156207 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 fr 0.4966338770370634 0.7860696517412935 0.46250527724325174 0.7292650668002159 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 de 0.4675732669000923 0.7479187479187479 0.43767984457683634 0.69653035770654 nope From e9404e2b8daaa6996ca9255af3de819aa0168008 Mon Sep 17 00:00:00 2001 From: andrea Date: Mon, 2 Dec 2019 12:40:39 +0100 Subject: [PATCH 04/10] mask_numbers method --- src/dataset_builder.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/dataset_builder.py b/src/dataset_builder.py index 3f6732c..9af7b3f 100644 --- a/src/dataset_builder.py +++ b/src/dataset_builder.py @@ -11,6 +11,8 @@ import numpy as np from sklearn.model_selection import train_test_split from scipy.sparse import issparse import itertools +from tqdm import tqdm +import re class MultilingualDataset: @@ -73,10 +75,14 @@ class MultilingualDataset: return self.lXte(), self.lYte() def lXtr(self): - return {lang:Xtr for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()} + return {lang: Xtr for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items() if + lang in self.langs()} + # return {lang:self.mask_numbers(Xtr) for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()} def lXte(self): - return {lang:Xte for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()} + return {lang: Xte for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if + lang in self.langs()} + # return {lang:self.mask_numbers(Xte) for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()} def lYtr(self): return {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()} @@ -129,6 +135,13 @@ class MultilingualDataset: def set_labels(self, labels): self.labels = labels + def mask_numbers(self, data, number_mask='numbermask'): + mask = re.compile(r'\b[0-9][0-9.,-]*\b') + masked = [] + for text in tqdm(data, desc='masking numbers'): + masked.append(mask.sub(number_mask, text)) + return masked + # ---------------------------------------------------------------------------------------------------------------------- # Helpers From 4de6b3e2505fa14c4dfdc15d42c7df5e2e0a27e3 Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 3 Dec 2019 15:34:12 +0100 Subject: [PATCH 05/10] 
refactored the embed method into class StorageEmbeddings; refactored class
 EmbeddingsAligned; added tSVD and t-SNE reduction for supervised embeddings

---
 src/FPEC_andrea.py                         |  14 ++-
 src/data/embeddings.py                     | 101 ++++++++++++++++---
 src/data/supervised.py                     |  25 +++--
 src/learning/learners.py                   | 110 ++++++++++-----------
 src/transformers/StandardizeTransformer.py |   2 +-
 5 files changed, 170 insertions(+), 82 deletions(-)

diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py
index f8edfad..9be7c42 100644
--- a/src/FPEC_andrea.py
+++ b/src/FPEC_andrea.py
@@ -11,7 +11,8 @@ from sklearn.svm import SVC
 parser = OptionParser()

 parser.add_option("-d", "--dataset", dest="dataset",
-                  help="Path to the multilingual dataset processed and stored in .pickle format")
+                  help="Path to the multilingual dataset processed and stored in .pickle format",
+                  default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")

 parser.add_option("-o", "--output", dest="output",
                   help="Result file", type=str, default='./results/results.csv')
@@ -23,7 +24,7 @@
 parser.add_option("-w", "--we-path", dest="we_path",
                   help="Path to the polylingual word embeddings", default='../embeddings/')

 parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str,
-                  default='FastText')
+                  default='MUSE')

 parser.add_option("-s", "--set_c", dest="set_c",type=float,
                   help="Set the C parameter", default=1)
@@ -36,7 +37,7 @@ parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,

 def get_learner(calibrate=False, kernel='linear'):
-    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced')
+    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto')

 def get_params(dense=False):
@@ -64,6 +65,7 @@ if __name__ == '__main__':
     data.show_dimensions()

     # data.set_view(languages=['en','it'], categories=list(range(10)))
+    # data.set_view(languages=['en','it'])
     lXtr, lytr = data.training()
     lXte, lyte = data.test()
@@ -100,6 +102,10 @@ if __name__ == '__main__':
                   'we_type': op.we_type}
         _config_id = 'M_and_F'

+    ##### TODO - config dict is redundant - we already have the op argparse options ...
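# A hedged aside (an editorial sketch, not part of the patch): the two keys set
# below choose how the supervised (vocabulary x categories) matrix is compressed
# when the number of categories exceeds max_label_space. With scikit-learn, the
# 'tSVD' choice amounts to:
#
#     from sklearn.decomposition import TruncatedSVD
#     F = TruncatedSVD(n_components=50).fit_transform(F)   # (|V|, C) -> (|V|, 50)
#
# Note that scikit-learn's TSNE only supports n_components < 4 with the default
# barnes_hut method, so a t-SNE reduction to 50 components would require
# method='exact'.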
+ config['reduction'] = 'tSVD' + config['max_label_space'] = 50 + result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '') print(f'### PolyEmbedd_andrea_{_config_id}\n') @@ -114,7 +120,7 @@ if __name__ == '__main__': print('# Fitting ...') classifier.fit(lXtr, lytr) - print('# Evaluating ...') + print('\n# Evaluating ...') l_eval = evaluate_method(classifier, lXte, lyte) metrics = [] diff --git a/src/data/embeddings.py b/src/data/embeddings.py index 0598feb..66a14d0 100644 --- a/src/data/embeddings.py +++ b/src/data/embeddings.py @@ -5,6 +5,7 @@ from torchtext.vocab import Vectors import torch from abc import ABC, abstractmethod from data.supervised import get_supervised_embeddings +from sklearn.decomposition import PCA class PretrainedEmbeddings(ABC): @@ -157,16 +158,41 @@ class FastTextWikiNews(Vectors): super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs) +# class EmbeddingsAligned(Vectors): +# +# def __init__(self, type, path, lang): +# +# self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec' +# # todo - rewrite as relative path +# self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT' +# self.path = path + self.name.format(lang) +# assert os.path.exists(path), f'pre-trained vectors not found in {path}' +# super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path) +# # self.vectors = self.extract(voc) +# +# def vocabulary(self): +# return set(self.stoi.keys()) +# +# def dim(self): +# return self.dim +# +# def extract(self, words): +# source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi) +# extraction = torch.zeros((len(words), self.dim)) +# extraction[source_idx] = self.vectors[target_idx] +# return extraction + + class EmbeddingsAligned(Vectors): - def __init__(self, type, path, lang): - - self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec' + def __init__(self, type, path, lang, voc): # todo - rewrite as relative path + self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec' self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT' self.path = path + self.name.format(lang) assert os.path.exists(path), f'pre-trained vectors not found in {path}' super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path) + self.vectors = self.extract(voc) def vocabulary(self): return set(self.stoi.keys()) @@ -203,20 +229,69 @@ class FastTextMUSE(PretrainedEmbeddings): return extraction -def embedding_matrix(type, path, voc, lang): - vocabulary = np.asarray(list(zip(*sorted(voc.items(), key=lambda x:x[1])))[0]) +class StorageEmbeddings: + def __init__(self, path): + self.path = path + self.lang_U = dict() + self.lang_S = dict() - print('[embedding matrix]') - print(f'# [pretrained-matrix: {type} {lang}]') - pretrained = EmbeddingsAligned(type, path, lang) - P = pretrained.extract(vocabulary).numpy() - del pretrained - print(f'[embedding matrix done] of shape={P.shape}\n') + def _add_embeddings_unsupervised(self, type, docs, vocs): + for lang in docs.keys(): + print(f'# [unsupervised-matrix {type}] for {lang}') + voc = np.asarray(list(zip(*sorted(vocs[lang].items(), key=lambda x: x[1])))[0]) + self.lang_U[lang] = EmbeddingsAligned(type, self.path, lang, voc).vectors + print(f'Matrix U (weighted 
sum) of shape {self.lang_U[lang].shape}\n')
+        return
+
+    def _add_embeddings_supervised(self, docs, labels, reduction, max_label_space):
+        for lang in docs.keys():
+            print(f'# [supervised-matrix] for {lang}')
+            self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space)
+            print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')
+        return
+
+    def _concatenate_embeddings(self, docs):
+        _r = dict()
+        for lang in self.lang_U.keys():
+            _r[lang] = np.hstack((docs[lang].dot(self.lang_U[lang]), docs[lang].dot(self.lang_S[lang])))
+        return _r
+
+    def fit(self, config, docs, vocs, labels):
+        if config['unsupervised']:
+            self._add_embeddings_unsupervised(config['we_type'], docs, vocs)
+        if config['supervised']:
+            self._add_embeddings_supervised(docs, labels, config['reduction'], config['max_label_space'])
+        return self
+
+    def predict(self, config, docs):
+        if config['supervised'] and config['unsupervised']:
+            return self._concatenate_embeddings(docs)
+        elif config['supervised']:
+            _r = dict()
+            for lang in docs.keys():
+                _r[lang] = docs[lang].dot(self.lang_S[lang])
+        else:
+            _r = dict()
+            for lang in docs.keys():
+                _r[lang] = docs[lang].dot(self.lang_U[lang])
+        return _r


# def embedding_matrix(type, path, voc, lang):
#     vocabulary = np.asarray(list(zip(*sorted(voc.items(), key=lambda x: x[1])))[0])
#
#     print('[embedding matrix]')
#     print(f'# [pretrained-matrix: {type} {lang}]')
#     pretrained = EmbeddingsAligned(type, path, lang)
#     P = pretrained.extract(vocabulary).numpy()
#     del pretrained
#     print(f'[embedding matrix done] of shape={P.shape}\n')
#
#     return vocabulary, P


-def WCE_matrix(Xtr, Ytr, lang):
+def WCE_matrix(Xtr, Ytr, lang, reduction=None, n_components=50):
     print('\n# [supervised-matrix]')
     S = get_supervised_embeddings(Xtr[lang], Ytr[lang])
     print(f'[embedding matrix done] of shape={S.shape}\n')
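The new StorageEmbeddings entry point, in a hedged usage sketch (the dictionaries below are
illustrative stand-ins for the per-language structures AndreaCLF builds; we_path is assumed
to point at the embeddings directory):

    # lX: {lang: tf-idf matrix}, vocs: {lang: word2idx}, ly: {lang: label matrix}
    config = {'unsupervised': True, 'supervised': True, 'we_type': 'MUSE',
              'reduction': 'tSVD', 'max_label_space': 50}
    embedding_space = StorageEmbeddings(we_path).fit(config, lX, vocs, ly)
    lZ = embedding_space.predict(config, lX)   # {lang: docs projected onto U and/or S}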
' + f'Applying PCA(n_components={max_label_space})') + pca = PCA(n_components=max_label_space) + F = pca.fit(F).transform(F) + elif reduction == 'TSNE': + print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' + f'Applying t-SNE(n_components={max_label_space})') + tsne = TSNE(n_components=max_label_space) + F = tsne.fit(F).fit_transform(F) + elif reduction == 'tSVD': + print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' + f'Applying truncatedSVD(n_components={max_label_space})') + tSVD = TruncatedSVD(n_components=max_label_space) + F = tSVD.fit(F).fit_transform(F) return F diff --git a/src/learning/learners.py b/src/learning/learners.py index d01c734..89bda7e 100644 --- a/src/learning/learners.py +++ b/src/learning/learners.py @@ -1,6 +1,6 @@ import numpy as np import time -from data.embeddings import WordEmbeddings, embedding_matrix, WCE_matrix +from data.embeddings import WordEmbeddings, WCE_matrix, StorageEmbeddings from scipy.sparse import issparse from sklearn.multiclass import OneVsRestClassifier from sklearn.model_selection import GridSearchCV @@ -458,8 +458,9 @@ class AndreaCLF(FunnellingPolylingualClassifier): self.lang_word2idx = dict() self.languages = [] self.lang_tfidf = {} - self.word_embeddings = {} - self.supervised_embeddings = {} + # self.word_embeddings = {} + # self.supervised_embeddings = {} + self.embedding_space = None self.model = None self.time = None @@ -492,42 +493,42 @@ class AndreaCLF(FunnellingPolylingualClassifier): return lZ, lYtr - def embed(self, lX, ly, unsupervised=False, supervised=False, prediction=False): - """ - build embedding matrix for given language and returns its weighted sum wrt tf-idf score - """ - _r = dict() - languages = list(lX.keys()) - - if prediction: - for lang in languages: - if unsupervised: # If unsupervised embeddings ... - M = self.word_embeddings[lang] - if supervised: # and also unsupervised --> get both (M) and (S) weighted sum matrices and hstack them - S = self.supervised_embeddings[lang] - _r[lang] = np.hstack((lX[lang].dot(M), lX[lang].dot(S))) - continue - _r[lang] = lX[lang].dot(M) # if not supervised --> just get weighted sum of unsupervised (M) embeddings - else: # If not unsupervised --> get (S) matrix and its weighted sum - S = self.supervised_embeddings[lang] - _r[lang] = lX[lang].dot(S) - return _r - - if unsupervised: - for lang in languages: - _, M = embedding_matrix(self.config['we_type'], self.we_path, self.lang_word2idx[lang], lang) - self.word_embeddings[lang] = M - _r[lang] = lX[lang].dot(M) - - if supervised: - for lang in languages: - S = WCE_matrix(lX, ly, lang) - self.supervised_embeddings[lang] = S - if unsupervised: - _r[lang] = np.hstack((_r[lang], lX[lang].dot(S))) - else: - _r[lang] = lX[lang].dot(S) - return _r + # def embed(self, lX, ly, unsupervised=False, supervised=False, prediction=False): + # """ + # build embedding matrix for given language and returns its weighted sum wrt tf-idf score + # """ + # _r = dict() + # languages = list(lX.keys()) + # + # if prediction: + # for lang in languages: + # if unsupervised: # If unsupervised embeddings ... 
+ # M = self.word_embeddings[lang] + # if supervised: # and also unsupervised --> get both (M) and (S) weighted sum matrices and hstack them + # S = self.supervised_embeddings[lang] + # _r[lang] = np.hstack((lX[lang].dot(M), lX[lang].dot(S))) + # continue + # _r[lang] = lX[lang].dot(M) # if not supervised --> just get weighted sum of unsupervised (M) embeddings + # else: # If not unsupervised --> get (S) matrix and its weighted sum + # S = self.supervised_embeddings[lang] + # _r[lang] = lX[lang].dot(S) + # return _r + # + # if unsupervised: + # for lang in languages: + # _, M = embedding_matrix(self.config['we_type'], self.we_path, self.lang_word2idx[lang], lang) + # self.word_embeddings[lang] = M + # _r[lang] = lX[lang].dot(M) + # + # if supervised: + # for lang in languages: + # S = WCE_matrix(lX, ly, lang) + # self.supervised_embeddings[lang] = S + # if unsupervised: + # _r[lang] = np.hstack((_r[lang], lX[lang].dot(S))) + # else: + # _r[lang] = lX[lang].dot(S) + # return _r # @override std class method def fit(self, lX, ly): @@ -541,17 +542,11 @@ class AndreaCLF(FunnellingPolylingualClassifier): Z, zy = self._get_zspace(lX, ly) if self.config['supervised'] or self.config['unsupervised']: - # Z vectors is concatenated with doc's embedding weighted sum - Z_embedded = dict() - l_weighted_em = self.embed(lX, ly, - unsupervised=self.config['unsupervised'], - supervised=self.config['supervised']) - - # stacking Z space horizontally with unsupervised (M) and/or supervised (F) embeddings - for lang in list(lX.keys()): - Z_embedded[lang] = np.hstack((Z[lang], l_weighted_em[lang])) - Z = Z_embedded - + self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly) + _embedding_space = self.embedding_space.predict(self.config, lX) + # h_stacking posterior probabilities with (U) and/or (S) matrices + for lang in self.languages: + Z[lang] = np.hstack((Z[lang], _embedding_space[lang])) # stacking Z space vertically _vertical_Z = np.vstack([Z[lang] for lang in self.languages]) @@ -573,14 +568,15 @@ class AndreaCLF(FunnellingPolylingualClassifier): lZ = self._projection(self.doc_projector, lX) if self.config['supervised'] or self.config['unsupervised']: - l_weighted_em = self.embed(lX, ly, - unsupervised=self.config['unsupervised'], - supervised=self.config['supervised'], - prediction=True) - Z_embedded = dict() + _embedding_space = self.embedding_space.predict(self.config, lX) + # l_weighted_em = self.embed(lX, ly, + # unsupervised=self.config['unsupervised'], + # supervised=self.config['supervised'], + # prediction=True) + # Z_embedded = dict() for lang in lX.keys(): - Z_embedded[lang] = np.hstack((lZ[lang], l_weighted_em[lang])) - lZ = Z_embedded + lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang])) + # lZ = Z_embedded for lang in lZ.keys(): print(lZ[lang].shape) diff --git a/src/transformers/StandardizeTransformer.py b/src/transformers/StandardizeTransformer.py index 45921b7..e776db7 100644 --- a/src/transformers/StandardizeTransformer.py +++ b/src/transformers/StandardizeTransformer.py @@ -12,7 +12,7 @@ class StandardizeTransformer: self.std = np.clip(std, 1e-5, None) self.mean = np.mean(X, axis=self.axis) self.yetfit=True - print('done') + print('done\n') return self def predict(self, X): From f074fd97f92da546cb72a9baa66593f067dd44b3 Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 3 Dec 2019 19:57:11 +0100 Subject: [PATCH 06/10] get_optimal_supervised_components method - to be polished --- src/FPEC_andrea.py | 4 ++-- src/data/embeddings.py | 44 
+++++++++++++++++++++------------------- src/data/supervised.py | 33 +++++++++++++++++++++++++----- src/learning/learners.py | 39 +---------------------------------- 4 files changed, 54 insertions(+), 66 deletions(-) diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py index 9be7c42..137e6cc 100644 --- a/src/FPEC_andrea.py +++ b/src/FPEC_andrea.py @@ -103,8 +103,8 @@ if __name__ == '__main__': _config_id = 'M_and_F' ##### TODO - config dict is redundant - we have already op argparse ... - config['reduction'] = 'tSVD' - config['max_label_space'] = 50 + config['reduction'] = 'PCA' + config['max_label_space'] = 'optimal' result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '') diff --git a/src/data/embeddings.py b/src/data/embeddings.py index 66a14d0..d1ad651 100644 --- a/src/data/embeddings.py +++ b/src/data/embeddings.py @@ -5,7 +5,6 @@ from torchtext.vocab import Vectors import torch from abc import ABC, abstractmethod from data.supervised import get_supervised_embeddings -from sklearn.decomposition import PCA class PretrainedEmbeddings(ABC): @@ -244,10 +243,16 @@ class StorageEmbeddings: return def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space): + _optimal = dict() + # TODO testing optimal max_label_space + if max_label_space == 'optimal': + print('Computing optimal number of PCA components ...') + optimal_n = self.get_optimal_supervised_components(docs, labels) + max_label_space = optimal_n + for lang in docs.keys(): print(f'# [supervised-matrix] for {lang}') - # should also pass max_label_space and reduction techniques - self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space) + self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, lang) print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n') return @@ -277,22 +282,19 @@ class StorageEmbeddings: _r[lang] = docs[lang].dot(self.lang_U[lang]) return _r + def get_optimal_supervised_components(self, docs, labels): + _idx = [] + for lang in docs.keys(): + _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space='optimal').tolist() -# def embedding_matrix(type, path, voc, lang): -# vocabulary = np.asarray(list(zip(*sorted(voc.items(), key=lambda x: x[1])))[0]) -# -# print('[embedding matrix]') -# print(f'# [pretrained-matrix: {type} {lang}]') -# pretrained = EmbeddingsAligned(type, path, lang) -# P = pretrained.extract(vocabulary).numpy() -# del pretrained -# print(f'[embedding matrix done] of shape={P.shape}\n') -# -# return vocabulary, P - - -def WCE_matrix(Xtr, Ytr, lang, reduction=None, n_components=50): - print('\n# [supervised-matrix]') - S = get_supervised_embeddings(Xtr[lang], Ytr[lang]) - print(f'[embedding matrix done] of shape={S.shape}\n') - return S + for i in range(len(_r)-1, 1, -1): + # todo: if n_components (therfore #n labels) is not big enough every value will be smaller than the next one ... 
+ ratio = _r[i] + next_ratio = _r[i-1] + delta = _r[i] - _r[i-1] + if delta > 0: + # if ratio < next_ratio: + _idx.append(i) + break + best_n = int(sum(_idx)/len(_idx)) + return best_n diff --git a/src/data/supervised.py b/src/data/supervised.py index b3c4fb9..f365dfd 100755 --- a/src/data/supervised.py +++ b/src/data/supervised.py @@ -40,8 +40,12 @@ def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents= return F -def get_supervised_embeddings(X, Y, reduction, max_label_space=300, binary_structural_problems=-1, method='dotn', dozscore=True): - print('computing supervised embeddings...') +def get_supervised_embeddings(X, Y, reduction, max_label_space=300, lang='None', binary_structural_problems=-1, method='dotn', dozscore=True): + if max_label_space == 'optimal': + max_label_space = 0 + + if max_label_space != 0: + print('computing supervised embeddings...') nC = Y.shape[1] if nC==2 and binary_structural_problems > nC: @@ -60,21 +64,40 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, binary_struc F = zscores(F, axis=0) if nC > max_label_space: + # TODO testing optimal max_label_space if reduction == 'PCA': + if max_label_space == 0: + pca = PCA(n_components=Y.shape[1]) + pca = pca.fit(F) + return pca.explained_variance_ratio_ + print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' f'Applying PCA(n_components={max_label_space})') pca = PCA(n_components=max_label_space) - F = pca.fit(F).transform(F) + pca = pca.fit(F) + + ######################################################## + import matplotlib.pyplot as plt + + plt.figure() + plt.plot(np.cumsum(pca.explained_variance_ratio_)) + plt.xlabel('Number of Components') + plt.ylabel('Variance (%)') # + plt.title(f'WCE Explained Variance {lang}') + plt.show() + ######################################################## + + F = pca.fit_transform(F) elif reduction == 'TSNE': print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' f'Applying t-SNE(n_components={max_label_space})') tsne = TSNE(n_components=max_label_space) - F = tsne.fit(F).fit_transform(F) + F = tsne.fit_transform(F) elif reduction == 'tSVD': print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' f'Applying truncatedSVD(n_components={max_label_space})') tSVD = TruncatedSVD(n_components=max_label_space) - F = tSVD.fit(F).fit_transform(F) + F = tSVD.fit_transform(F) return F diff --git a/src/learning/learners.py b/src/learning/learners.py index 89bda7e..aed1094 100644 --- a/src/learning/learners.py +++ b/src/learning/learners.py @@ -1,6 +1,6 @@ import numpy as np import time -from data.embeddings import WordEmbeddings, WCE_matrix, StorageEmbeddings +from data.embeddings import WordEmbeddings, StorageEmbeddings from scipy.sparse import issparse from sklearn.multiclass import OneVsRestClassifier from sklearn.model_selection import GridSearchCV @@ -493,43 +493,6 @@ class AndreaCLF(FunnellingPolylingualClassifier): return lZ, lYtr - # def embed(self, lX, ly, unsupervised=False, supervised=False, prediction=False): - # """ - # build embedding matrix for given language and returns its weighted sum wrt tf-idf score - # """ - # _r = dict() - # languages = list(lX.keys()) - # - # if prediction: - # for lang in languages: - # if unsupervised: # If unsupervised embeddings ... 
- # M = self.word_embeddings[lang] - # if supervised: # and also unsupervised --> get both (M) and (S) weighted sum matrices and hstack them - # S = self.supervised_embeddings[lang] - # _r[lang] = np.hstack((lX[lang].dot(M), lX[lang].dot(S))) - # continue - # _r[lang] = lX[lang].dot(M) # if not supervised --> just get weighted sum of unsupervised (M) embeddings - # else: # If not unsupervised --> get (S) matrix and its weighted sum - # S = self.supervised_embeddings[lang] - # _r[lang] = lX[lang].dot(S) - # return _r - # - # if unsupervised: - # for lang in languages: - # _, M = embedding_matrix(self.config['we_type'], self.we_path, self.lang_word2idx[lang], lang) - # self.word_embeddings[lang] = M - # _r[lang] = lX[lang].dot(M) - # - # if supervised: - # for lang in languages: - # S = WCE_matrix(lX, ly, lang) - # self.supervised_embeddings[lang] = S - # if unsupervised: - # _r[lang] = np.hstack((_r[lang], lX[lang].dot(S))) - # else: - # _r[lang] = lX[lang].dot(S) - # return _r - # @override std class method def fit(self, lX, ly): tinit = time.time() From ba1a72ff9439254d38378367b2a4624d0c5827b7 Mon Sep 17 00:00:00 2001 From: andrea Date: Wed, 4 Dec 2019 10:16:17 +0100 Subject: [PATCH 07/10] Plot variance explained by PCA for every language --- src/data/embeddings.py | 43 +++++++++++++----------------------------- src/data/supervised.py | 17 +++++++---------- 2 files changed, 20 insertions(+), 40 deletions(-) diff --git a/src/data/embeddings.py b/src/data/embeddings.py index d1ad651..b5b253a 100644 --- a/src/data/embeddings.py +++ b/src/data/embeddings.py @@ -157,31 +157,6 @@ class FastTextWikiNews(Vectors): super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs) -# class EmbeddingsAligned(Vectors): -# -# def __init__(self, type, path, lang): -# -# self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec' -# # todo - rewrite as relative path -# self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT' -# self.path = path + self.name.format(lang) -# assert os.path.exists(path), f'pre-trained vectors not found in {path}' -# super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path) -# # self.vectors = self.extract(voc) -# -# def vocabulary(self): -# return set(self.stoi.keys()) -# -# def dim(self): -# return self.dim -# -# def extract(self, words): -# source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi) -# extraction = torch.zeros((len(words), self.dim)) -# extraction[source_idx] = self.vectors[target_idx] -# return extraction - - class EmbeddingsAligned(Vectors): def __init__(self, type, path, lang, voc): @@ -283,18 +258,26 @@ class StorageEmbeddings: return _r def get_optimal_supervised_components(self, docs, labels): + import matplotlib.pyplot as plt + _idx = [] + + plt.figure(figsize=(15, 10)) + plt.title(f'WCE Explained Variance') + plt.xlabel('Number of Components') + plt.ylabel('Variance (%)') + for lang in docs.keys(): _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space='optimal').tolist() - + plt.plot(np.cumsum(_r), label=lang) for i in range(len(_r)-1, 1, -1): # todo: if n_components (therfore #n labels) is not big enough every value will be smaller than the next one ... 
- ratio = _r[i] - next_ratio = _r[i-1] - delta = _r[i] - _r[i-1] + delta = _r[i-1] - _r[i] if delta > 0: - # if ratio < next_ratio: _idx.append(i) break best_n = int(sum(_idx)/len(_idx)) + plt.vlines(best_n, 0, 1, colors='r', label='optimal N') + plt.legend() + plt.show() return best_n diff --git a/src/data/supervised.py b/src/data/supervised.py index f365dfd..02f8c84 100755 --- a/src/data/supervised.py +++ b/src/data/supervised.py @@ -75,18 +75,15 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, lang='None', f'Applying PCA(n_components={max_label_space})') pca = PCA(n_components=max_label_space) pca = pca.fit(F) - ######################################################## - import matplotlib.pyplot as plt - - plt.figure() - plt.plot(np.cumsum(pca.explained_variance_ratio_)) - plt.xlabel('Number of Components') - plt.ylabel('Variance (%)') # - plt.title(f'WCE Explained Variance {lang}') - plt.show() + # import matplotlib.pyplot as plt + # plt.figure() + # plt.plot(np.cumsum(pca.explained_variance_ratio_)) + # plt.xlabel('Number of Components') + # plt.ylabel('Variance (%)') # + # plt.title(f'WCE Explained Variance {lang}') + # plt.show() ######################################################## - F = pca.fit_transform(F) elif reduction == 'TSNE': print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' From 509289b26827525ce960149e3dc0a338fd960e0a Mon Sep 17 00:00:00 2001 From: andrea Date: Wed, 4 Dec 2019 13:24:11 +0100 Subject: [PATCH 08/10] Plot variance explained by PCA for every language --- src/FPEC_andrea.py | 2 +- src/data/embeddings.py | 11 ++++++----- src/data/supervised.py | 23 +++++++++++++---------- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py index 137e6cc..185bcc2 100644 --- a/src/FPEC_andrea.py +++ b/src/FPEC_andrea.py @@ -104,7 +104,7 @@ if __name__ == '__main__': ##### TODO - config dict is redundant - we have already op argparse ... 
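
Review note: PATCH 06 and 07 above hunt for an "optimal" number of PCA components by scanning the cumulative explained-variance curve per language. A minimal, self-contained sketch of that recipe for a single matrix (the 0.99 threshold is an illustrative assumption, not a value taken from these patches):

import numpy as np
from sklearn.decomposition import PCA

def n_components_for_variance(F, threshold=0.99):
    # fit a full-rank PCA, then keep the smallest number of leading components
    # whose cumulative explained-variance ratio reaches the threshold
    pca = PCA(n_components=min(F.shape)).fit(F)
    cumulative = np.cumsum(pca.explained_variance_ratio_)
    return int(min(np.searchsorted(cumulative, threshold) + 1, len(cumulative)))
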
config['reduction'] = 'PCA' - config['max_label_space'] = 'optimal' + config['max_label_space'] = 300 result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '') diff --git a/src/data/embeddings.py b/src/data/embeddings.py index b5b253a..8005dad 100644 --- a/src/data/embeddings.py +++ b/src/data/embeddings.py @@ -217,7 +217,7 @@ class StorageEmbeddings: print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n') return - def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space): + def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space, voc): _optimal = dict() # TODO testing optimal max_label_space if max_label_space == 'optimal': @@ -227,7 +227,7 @@ class StorageEmbeddings: for lang in docs.keys(): print(f'# [supervised-matrix] for {lang}') - self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, lang) + self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, voc[lang], lang) print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n') return @@ -241,7 +241,7 @@ class StorageEmbeddings: if config['unsupervised']: self._add_embeddings_unsupervised(config['we_type'], docs, vocs) if config['supervised']: - self._add_emebeddings_supervised(docs, labels, config['reduction'], config['max_label_space']) + self._add_emebeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs) return self def predict(self, config, docs): @@ -269,10 +269,11 @@ class StorageEmbeddings: for lang in docs.keys(): _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space='optimal').tolist() - plt.plot(np.cumsum(_r), label=lang) + _r = np.cumsum(_r) + plt.plot(_r, label=lang) for i in range(len(_r)-1, 1, -1): # todo: if n_components (therfore #n labels) is not big enough every value will be smaller than the next one ... 
- delta = _r[i-1] - _r[i] + delta = _r[i] - _r[i-1] if delta > 0: _idx.append(i) break diff --git a/src/data/supervised.py b/src/data/supervised.py index 02f8c84..d8e1f7d 100755 --- a/src/data/supervised.py +++ b/src/data/supervised.py @@ -40,7 +40,7 @@ def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents= return F -def get_supervised_embeddings(X, Y, reduction, max_label_space=300, lang='None', binary_structural_problems=-1, method='dotn', dozscore=True): +def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, lang='None', binary_structural_problems=-1, method='dotn', dozscore=True): if max_label_space == 'optimal': max_label_space = 0 @@ -63,6 +63,18 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, lang='None', if dozscore: F = zscores(F, axis=0) + # Dumping F-matrix for further studies + # TODO im not sure if voc.keys and F matrix indices are "aligned" correctly + dump_it = True + if dump_it: + with open(f'/home/andreapdr/funneling_pdr/src/dumps/WCE_{lang}.tsv', 'w') as outfile: + np.savetxt(outfile, F, delimiter='\t') + with open(f'/home/andreapdr/funneling_pdr/src/dumps/dict_WCE_{lang}.tsv', 'w') as outfile: + for token in voc.keys(): + outfile.write(token+'\n') + + + if nC > max_label_space: # TODO testing optimal max_label_space if reduction == 'PCA': @@ -75,15 +87,6 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, lang='None', f'Applying PCA(n_components={max_label_space})') pca = PCA(n_components=max_label_space) pca = pca.fit(F) - ######################################################## - # import matplotlib.pyplot as plt - # plt.figure() - # plt.plot(np.cumsum(pca.explained_variance_ratio_)) - # plt.xlabel('Number of Components') - # plt.ylabel('Variance (%)') # - # plt.title(f'WCE Explained Variance {lang}') - # plt.show() - ######################################################## F = pca.fit_transform(F) elif reduction == 'TSNE': print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. 
' From 9fa1899a7f1d3f73349bf20909aa0e98596fb31f Mon Sep 17 00:00:00 2001 From: andrea Date: Mon, 9 Dec 2019 15:37:52 +0100 Subject: [PATCH 09/10] refactored pca methods --- src/FPEC_andrea.py | 28 +++++--- src/data/embeddings.py | 133 +++++++++++++++++++++++++------------ src/data/supervised.py | 61 ++++++++--------- src/learning/learners.py | 17 ++--- src/util/decompositions.py | 49 ++++++++++++++ src/util/results.py | 6 +- 6 files changed, 199 insertions(+), 95 deletions(-) create mode 100644 src/util/decompositions.py diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py index 185bcc2..1618c33 100644 --- a/src/FPEC_andrea.py +++ b/src/FPEC_andrea.py @@ -1,4 +1,4 @@ -import os, sys +import os from dataset_builder import MultilingualDataset from learning.learners import * from util.evaluation import * @@ -21,7 +21,7 @@ parser.add_option("-e", "--mode-embed", dest="mode_embed", help="Set the embedding to be used [none, unsupervised, supervised, both]", type=str, default='none') parser.add_option("-w", "--we-path", dest="we_path", - help="Path to the polylingual word embeddings", default='../embeddings/') + help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/') parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str, default='MUSE') @@ -30,11 +30,21 @@ parser.add_option("-s", "--set_c", dest="set_c",type=float, help="Set the C parameter", default=1) parser.add_option("-c", "--optimc", dest="optimc", action='store_true', - help="Optimices hyperparameters", default=False) + help="Optimize hyperparameters", default=False) parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int, help="Number of parallel jobs (default is -1, all)", default=-1) +parser.add_option("-p", "--pca", dest="max_labels", type=int, + help="If less than number of target classes, will apply PCA to supervised matrix. If set to 0 it" + " will automatically search for the best number of components", default=300) + +parser.add_option("-u", "--upca", dest="max_labels_U", type=int, + help="If smaller than Unsupervised Dimension, will apply PCA to unsupervised matrix. If set to 0 it" + " will automatically search for the best number of components", default=300) + +parser.add_option("-l", dest="lang", type=str) + def get_learner(calibrate=False, kernel='linear'): return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto') @@ -51,7 +61,6 @@ def get_params(dense=False): if __name__ == '__main__': - (op, args) = parser.parse_args() assert exists(op.dataset), 'Unable to find file '+str(op.dataset) @@ -64,8 +73,9 @@ if __name__ == '__main__': data = MultilingualDataset.load(op.dataset) data.show_dimensions() - # data.set_view(languages=['en','it'], categories=list(range(10))) - # data.set_view(languages=['en','it']) + data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10))) + # data.set_view(languages=[op.lang]) + # data.set_view(categories=list(range(10))) lXtr, lytr = data.training() lXte, lyte = data.test() @@ -104,7 +114,9 @@ if __name__ == '__main__': ##### TODO - config dict is redundant - we have already op argparse ... 
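
Review note: the dump added in PATCH 08 writes the supervised matrix F and the vocabulary to parallel .tsv files, and its TODO doubts that voc.keys() iterates in row order. Since F has one row per tf-idf feature and voc maps token to column index, sorting the tokens by that index makes the alignment explicit. A hedged sketch (dump_wce and path_prefix are illustrative names, not repo API):

import numpy as np

def dump_wce(F, voc, path_prefix):
    # row i of F corresponds to the token whose tf-idf column index is i
    tokens = sorted(voc.keys(), key=lambda t: voc[t])
    assert F.shape[0] == len(tokens), 'rows of F and vocabulary size disagree'
    np.savetxt(f'{path_prefix}_WCE.tsv', F, delimiter='\t')
    with open(f'{path_prefix}_dict_WCE.tsv', 'w') as out:
        out.write('\n'.join(tokens) + '\n')
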
config['reduction'] = 'PCA'
-    config['max_label_space'] = 300
+    config['max_label_space'] = op.max_labels
+    config['dim_reduction_unsupervised'] = op.max_labels_U
+    # config['plot_covariance_matrices'] = True
 
     result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
 
@@ -129,5 +141,5 @@ if __name__ == '__main__':
         metrics.append([macrof1, microf1, macrok, microk])
         print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
         results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, config['we_type'], op.optimc, op.dataset.split('/')[-1],
-                        'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, '')
+                        classifier.time, lang, macrof1, microf1, macrok, microk, '')
     print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
diff --git a/src/data/embeddings.py b/src/data/embeddings.py
index 8005dad..2c02592 100644
--- a/src/data/embeddings.py
+++ b/src/data/embeddings.py
@@ -5,7 +5,9 @@ from torchtext.vocab import Vectors
 import torch
 from abc import ABC, abstractmethod
 from data.supervised import get_supervised_embeddings
-
+import matplotlib.pyplot as plt
+from sklearn.decomposition import PCA
+from util.decompositions import *
 
 class PretrainedEmbeddings(ABC):
 
@@ -110,10 +112,10 @@ class WordEmbeddings:
         # vocabulary is a set of terms to be kept
         active_vocabulary = sorted([w for w in vocabulary if w in self.worddim])
         lost = len(vocabulary)-len(active_vocabulary)
-        if lost>0: #some termr are missing, so it will be replaced by UNK
+        if lost > 0: #some terms are missing, so it will be replaced by UNK
             print('warning: missing {} terms for lang {}'.format(lost, self.lang))
         self.we = self.get_vectors(active_vocabulary)
-        assert self.we.shape[0]==len(active_vocabulary)
+        assert self.we.shape[0] == len(active_vocabulary)
         self.dimword={i:w for i,w in enumerate(active_vocabulary)}
         self.worddim={w:i for i,w in enumerate(active_vocabulary)}
         return self
@@ -153,7 +155,6 @@ class FastTextWikiNews(Vectors):
         url = self.url_base.format(language)
         # name = self.path.format(language)
         name = cache + self._name.format(language)
-        # print(f'\n\nFASTEXTWIKI-NEW CLASS:\nurl = {url}\nname = {name}\ncache {cache}\nlanguage = {language}')
         super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)
 
 
@@ -171,15 +172,17 @@ class EmbeddingsAligned(Vectors):
     def vocabulary(self):
         return set(self.stoi.keys())
 
-    def dim(self):
-        return self.dim
-
     def extract(self, words):
         source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi)
         extraction = torch.zeros((len(words), self.dim))
         extraction[source_idx] = self.vectors[target_idx]
         return extraction
 
+    def reduce(self, dim):
+        pca = PCA(n_components=dim)
+        self.vectors = pca.fit_transform(self.vectors)
+        return
+
 
 class FastTextMUSE(PretrainedEmbeddings):
 
@@ -209,26 +212,44 @@ class StorageEmbeddings:
         self.lang_U = dict()
         self.lang_S = dict()
 
-    def _add_embeddings_unsupervised(self, type, docs, vocs):
+    def _add_embeddings_unsupervised(self, type, docs, vocs, max_label_space=300):
         for lang in docs.keys():
             print(f'# [unsupervised-matrix {type}] for {lang}')
             voc = np.asarray(list(zip(*sorted(vocs[lang].items(), key=lambda x: x[1])))[0])
             self.lang_U[lang] = EmbeddingsAligned(type, self.path, lang, voc).vectors
+            nC = self.lang_U[lang].shape[1]  # read the width only once the matrix exists, not before
+            # if self.lang_U[lang].shape[1] > dim != 0:
+            #     print(f'unsupervised matrix has more dimensions ({self.lang_U[lang].shape[1]}) than'
+            #           f' the allowed limit {dim}. 
Applying PCA(n_components={dim})') + # pca = PCA(n_components=dim) + # self.lang_U[lang] = pca.fit_transform(self.lang_U[lang]) print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n') + if max_label_space == 0: + print(f'Computing optimal number of PCA components along matrices U') + optimal_n = get_optimal_dim(self.lang_U, 'U') + self.lang_U = run_pca(optimal_n, self.lang_U) + elif max_label_space < nC: + self.lang_U = run_pca(max_label_space, self.lang_U) + return def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space, voc): - _optimal = dict() - # TODO testing optimal max_label_space - if max_label_space == 'optimal': - print('Computing optimal number of PCA components ...') - optimal_n = self.get_optimal_supervised_components(docs, labels) - max_label_space = optimal_n - - for lang in docs.keys(): + # if max_label_space == 0: + # print('Computing optimal number of PCA components along matrices S...') + # optimal_n = self.get_optimal_supervised_components(docs, labels) + # max_label_space = optimal_n + for lang in docs.keys(): # compute supervised matrices S - then apply PCA + nC = self.lang_S[lang].shape[1] print(f'# [supervised-matrix] for {lang}') self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, voc[lang], lang) print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n') + + if max_label_space == 0: + optimal_n = get_optimal_dim(self.lang_S, 'S') + self.lang_S = run_pca(optimal_n, self.lang_S) + elif max_label_space < nC: + self.lang_S = run_pca(max_label_space, self.lang_S) + return def _concatenate_embeddings(self, docs): @@ -239,7 +260,7 @@ class StorageEmbeddings: def fit(self, config, docs, vocs, labels): if config['unsupervised']: - self._add_embeddings_unsupervised(config['we_type'], docs, vocs) + self._add_embeddings_unsupervised(config['we_type'], docs, vocs, config['dim_reduction_unsupervised']) if config['supervised']: self._add_emebeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs) return self @@ -257,28 +278,58 @@ class StorageEmbeddings: _r[lang] = docs[lang].dot(self.lang_U[lang]) return _r - def get_optimal_supervised_components(self, docs, labels): - import matplotlib.pyplot as plt + # @staticmethod + # def get_optimal_supervised_components(docs, labels): + # optimal_n = get_optimal_dim(docs, 'S') + # return optimal_n + # _idx = [] + # + # plt.figure(figsize=(15, 10)) + # plt.title(f'WCE Explained Variance') + # plt.xlabel('Number of Components') + # plt.ylabel('Variance (%)') + # + # for lang in docs.keys(): + # _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space=0).tolist() + # _r = np.cumsum(_r) + # plt.plot(_r, label=lang) + # for i in range(len(_r)-1, 1, -1): + # delta = _r[i] - _r[i-1] + # if delta > 0: + # _idx.append(i) + # break + # best_n = max(_idx) + # plt.axvline(best_n, color='r', label='optimal N') + # plt.legend() + # plt.show() + # return best_n + # + # def get_optimal_unsupervised_components(self, type): + # _idx = [] + # + # plt.figure(figsize=(15, 10)) + # plt.title(f'Unsupervised Embeddings {type} Explained Variance') + # plt.xlabel('Number of Components') + # plt.ylabel('Variance (%)') + # + # for lang in self.lang_U.keys(): + # pca = PCA(n_components=self.lang_U[lang].shape[1]) + # pca.fit(self.lang_U[lang]) + # _r = pca.explained_variance_ratio_ + # _r = np.cumsum(_r) + # plt.plot(_r, label=lang) + # for i in range(len(_r) - 1, 1, -1): + # delta = _r[i] - _r[i - 1] + # if 
delta > 0: + # _idx.append(i) + # break + # best_n = max(_idx) + # plt.axvline(best_n, color='r', label='optimal N') + # plt.legend() + # plt.show() + # + # for lang in self.lang_U.keys(): + # pca = PCA(n_components=best_n) + # self.lang_U[lang] = pca.fit_transform(self.lang_U[lang]) + # return - _idx = [] - - plt.figure(figsize=(15, 10)) - plt.title(f'WCE Explained Variance') - plt.xlabel('Number of Components') - plt.ylabel('Variance (%)') - - for lang in docs.keys(): - _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space='optimal').tolist() - _r = np.cumsum(_r) - plt.plot(_r, label=lang) - for i in range(len(_r)-1, 1, -1): - # todo: if n_components (therfore #n labels) is not big enough every value will be smaller than the next one ... - delta = _r[i] - _r[i-1] - if delta > 0: - _idx.append(i) - break - best_n = int(sum(_idx)/len(_idx)) - plt.vlines(best_n, 0, 1, colors='r', label='optimal N') - plt.legend() - plt.show() - return best_n diff --git a/src/data/supervised.py b/src/data/supervised.py index d8e1f7d..bbd8c37 100755 --- a/src/data/supervised.py +++ b/src/data/supervised.py @@ -1,5 +1,5 @@ from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square -from sklearn.decomposition import PCA, TruncatedSVD +from sklearn.decomposition import PCA from sklearn.manifold import TSNE import numpy as np @@ -41,15 +41,9 @@ def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents= def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, lang='None', binary_structural_problems=-1, method='dotn', dozscore=True): - if max_label_space == 'optimal': - max_label_space = 0 - if max_label_space != 0: print('computing supervised embeddings...') - nC = Y.shape[1] - if nC==2 and binary_structural_problems > nC: - raise ValueError('not implemented in this branch') if method=='ppmi': F = supervised_embeddings_ppmi(X, Y) @@ -64,8 +58,7 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, la F = zscores(F, axis=0) # Dumping F-matrix for further studies - # TODO im not sure if voc.keys and F matrix indices are "aligned" correctly - dump_it = True + dump_it = False if dump_it: with open(f'/home/andreapdr/funneling_pdr/src/dumps/WCE_{lang}.tsv', 'w') as outfile: np.savetxt(outfile, F, delimiter='\t') @@ -73,34 +66,32 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, la for token in voc.keys(): outfile.write(token+'\n') - - - if nC > max_label_space: - # TODO testing optimal max_label_space - if reduction == 'PCA': - if max_label_space == 0: - pca = PCA(n_components=Y.shape[1]) - pca = pca.fit(F) - return pca.explained_variance_ratio_ - - print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' - f'Applying PCA(n_components={max_label_space})') - pca = PCA(n_components=max_label_space) - pca = pca.fit(F) - F = pca.fit_transform(F) - elif reduction == 'TSNE': - print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' - f'Applying t-SNE(n_components={max_label_space})') - tsne = TSNE(n_components=max_label_space) - F = tsne.fit_transform(F) - elif reduction == 'tSVD': - print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. 
' - f'Applying truncatedSVD(n_components={max_label_space})') - tSVD = TruncatedSVD(n_components=max_label_space) - F = tSVD.fit_transform(F) - return F + # if nC >= max_label_space: + # if reduction == 'PCA': + # if max_label_space == 0: + # pca = PCA(n_components=Y.shape[1]) + # pca = pca.fit(F) + # return pca.explained_variance_ratio_ + # + # print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' + # f'Applying PCA(n_components={max_label_space})') + # pca = PCA(n_components=max_label_space) + # F = pca.fit_transform(F) + # elif reduction == 'TSNE': + # print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' + # f'Applying t-SNE(n_components={max_label_space})') + # tsne = TSNE(n_components=max_label_space) + # F = tsne.fit_transform(F) + # elif reduction == 'tSVD': + # print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' + # f'Applying truncatedSVD(n_components={max_label_space})') + # tSVD = TruncatedSVD(n_components=max_label_space) + # F = tSVD.fit_transform(F) + # + # return F + diff --git a/src/learning/learners.py b/src/learning/learners.py index aed1094..c4c69fd 100644 --- a/src/learning/learners.py +++ b/src/learning/learners.py @@ -8,6 +8,7 @@ from sklearn.model_selection import KFold from joblib import Parallel, delayed from sklearn.feature_extraction.text import TfidfVectorizer from transformers.StandardizeTransformer import StandardizeTransformer +from sklearn.decomposition import PCA def _sort_if_sparse(X): @@ -453,13 +454,12 @@ class AndreaCLF(FunnellingPolylingualClassifier): calmode, n_jobs) + self.pca_independent_space = PCA(n_components=100) self.we_path = we_path self.config = config self.lang_word2idx = dict() self.languages = [] self.lang_tfidf = {} - # self.word_embeddings = {} - # self.supervised_embeddings = {} self.embedding_space = None self.model = None self.time = None @@ -515,6 +515,10 @@ class AndreaCLF(FunnellingPolylingualClassifier): _vertical_Z = np.vstack([Z[lang] for lang in self.languages]) _vertical_Zy = np.vstack([zy[lang] for lang in self.languages]) + # todo testing ... 
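
Review note: the fit() hunk here stacks the per-language spaces row-wise, optionally reduces them (the '# todo testing' PCA experiment around this note), and standardizes them before training the meta-classifier. A compact sketch of just that step, with illustrative names rather than the class's API:

import numpy as np

def build_meta_space(Z, zy, languages):
    # stack the per-language [posteriors | embeddings] blocks into one training matrix
    vertical_Z = np.vstack([Z[lang] for lang in languages])
    vertical_zy = np.vstack([zy[lang] for lang in languages])
    # column-wise standardization, clipping tiny stds as StandardizeTransformer does
    mean = vertical_Z.mean(axis=0)
    std = np.clip(vertical_Z.std(axis=0), 1e-5, None)
    return (vertical_Z - mean) / std, vertical_zy
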
+ # self.pca_independent_space.fit(_vertical_Z) + # _vertical_Z = self.pca_independent_space.transform(_vertical_Z) + self.standardizer = StandardizeTransformer() _vertical_Z = self.standardizer.fit_predict(_vertical_Z) @@ -532,17 +536,14 @@ class AndreaCLF(FunnellingPolylingualClassifier): if self.config['supervised'] or self.config['unsupervised']: _embedding_space = self.embedding_space.predict(self.config, lX) - # l_weighted_em = self.embed(lX, ly, - # unsupervised=self.config['unsupervised'], - # supervised=self.config['supervised'], - # prediction=True) - # Z_embedded = dict() + for lang in lX.keys(): lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang])) - # lZ = Z_embedded for lang in lZ.keys(): print(lZ[lang].shape) + # todo testing + # lZ[lang] = self.pca_independent_space.transform(lZ[lang]) lZ[lang] = self.standardizer.predict(lZ[lang]) return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs) diff --git a/src/util/decompositions.py b/src/util/decompositions.py new file mode 100644 index 0000000..9029b33 --- /dev/null +++ b/src/util/decompositions.py @@ -0,0 +1,49 @@ +from sklearn.decomposition import PCA +import numpy as np +import matplotlib.pyplot as plt + +def run_pca(dim, X): + """ + :param dim: number of pca components to keep + :param X: dictionary str(lang): matrix + :return: dict lang: reduced matrix + """ + r = dict() + pca = PCA(n_components=dim) + for lang in X.keys(): + r[lang] = pca.fit_transform(X[lang]) + return r + + +def get_optimal_dim(X, embed_type): + """ + :param X: dict str(lang) : csr_matrix of embeddings unsupervised or supervised + :param embed_type: (str) embedding matrix type: S or U (WCE supervised or U unsupervised MUSE/FASTTEXT) + :return: + """ + _idx = [] + + plt.figure(figsize=(15, 10)) + if embed_type == 'U': + plt.title(f'Unsupervised Embeddings {"TODO"} Explained Variance') + else: + plt.title(f'WCE Explained Variance') + plt.xlabel('Number of Components') + plt.ylabel('Variance (%)') + + for lang in X.keys(): + pca = PCA(n_components=X[lang].shape[1]) + pca.fit(X[lang]) + _r = pca.explained_variance_ratio_ + _r = np.cumsum(_r) + plt.plot(_r, label=lang) + for i in range(len(_r) - 1, 1, -1): + delta = _r[i] - _r[i - 1] + if delta > 0: + _idx.append(i) + break + best_n = max(_idx) + plt.axvline(best_n, color='r', label='optimal N') + plt.legend() + plt.show() + return best_n \ No newline at end of file diff --git a/src/util/results.py b/src/util/results.py index 22e8021..7c25bec 100644 --- a/src/util/results.py +++ b/src/util/results.py @@ -5,7 +5,7 @@ import numpy as np class PolylingualClassificationResults: def __init__(self, file, autoflush=True, verbose=False): self.file = file - self.columns = ['id', 'method', 'learner', 'embed', 'embed_type', 'optimp', 'dataset', 'binary', 'languages', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes'] + self.columns = ['id', 'method', 'learner', 'embed', 'embed_type', 'optimp', 'dataset', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes'] self.autoflush = autoflush self.verbose = verbose if os.path.exists(file): @@ -20,8 +20,8 @@ class PolylingualClassificationResults: def already_calculated(self, id): return (self.df['id'] == id).any() - def add_row(self, id, method, learner, embed, embed_type, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''): - s = pd.Series([id, method, learner, embed, embed_type, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok, microk, 
notes], index=self.columns) + def add_row(self, id, method, learner, embed, embed_type, optimp, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''): + s = pd.Series([id, method, learner, embed, embed_type, optimp, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns) self.df = self.df.append(s, ignore_index=True) if self.autoflush: self.flush() self.tell(s.to_string()) From 0c6056e7a13aafdcfe03b6688298533837e03747 Mon Sep 17 00:00:00 2001 From: andrea Date: Mon, 9 Dec 2019 15:39:39 +0100 Subject: [PATCH 10/10] refactored pca methods --- src/data/embeddings.py | 69 +++------------------------------------- src/data/supervised.py | 4 +-- src/learning/learners.py | 2 +- 3 files changed, 8 insertions(+), 67 deletions(-) diff --git a/src/data/embeddings.py b/src/data/embeddings.py index 2c02592..66e830f 100644 --- a/src/data/embeddings.py +++ b/src/data/embeddings.py @@ -1,14 +1,12 @@ import os import pickle -import numpy as np from torchtext.vocab import Vectors import torch from abc import ABC, abstractmethod from data.supervised import get_supervised_embeddings -import matplotlib.pyplot as plt -from sklearn.decomposition import PCA from util.decompositions import * + class PretrainedEmbeddings(ABC): def __init__(self): @@ -112,7 +110,7 @@ class WordEmbeddings: # vocabulary is a set of terms to be kept active_vocabulary = sorted([w for w in vocabulary if w in self.worddim]) lost = len(vocabulary)-len(active_vocabulary) - if lost > 0: #some terms are missing, so it will be replaced by UNK + if lost > 0: # some terms are missing, so it will be replaced by UNK print('warning: missing {} terms for lang {}'.format(lost, self.lang)) self.we = self.get_vectors(active_vocabulary) assert self.we.shape[0] == len(active_vocabulary) @@ -134,12 +132,12 @@ class WordEmbeddings: 'instances of {} expected'.format(WordEmbeddings.__name__) polywe = [] - worddim={} - offset=0 + worddim = {} + offset = 0 for we in we_list: polywe.append(we.we) worddim.update({'{}::{}'.format(we.lang, w):d+offset for w,d in we.worddim.items()}) - offset=len(worddim) + offset = len(worddim) polywe = np.vstack(polywe) return WordEmbeddings(lang='poly', we=polywe, worddim=worddim) @@ -191,7 +189,6 @@ class FastTextMUSE(PretrainedEmbeddings): print(f'Loading fastText pretrained vectors from {path}') assert os.path.exists(path), print(f'pre-trained vectors not found in {path}') self.embed = FastTextWikiNews(path, lang, max_vectors=limit) - # print('Done') def vocabulary(self): return set(self.embed.stoi.keys()) @@ -277,59 +274,3 @@ class StorageEmbeddings: for lang in docs.keys(): _r[lang] = docs[lang].dot(self.lang_U[lang]) return _r - - # @staticmethod - # def get_optimal_supervised_components(docs, labels): - # optimal_n = get_optimal_dim(docs, 'S') - # return optimal_n - # _idx = [] - # - # plt.figure(figsize=(15, 10)) - # plt.title(f'WCE Explained Variance') - # plt.xlabel('Number of Components') - # plt.ylabel('Variance (%)') - # - # for lang in docs.keys(): - # _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space=0).tolist() - # _r = np.cumsum(_r) - # plt.plot(_r, label=lang) - # for i in range(len(_r)-1, 1, -1): - # delta = _r[i] - _r[i-1] - # if delta > 0: - # _idx.append(i) - # break - # best_n = max(_idx) - # plt.axvline(best_n, color='r', label='optimal N') - # plt.legend() - # plt.show() - # return best_n - # - # def get_optimal_unsupervised_components(self, type): - # _idx = [] - # - # plt.figure(figsize=(15, 10)) - # 
plt.title(f'Unsupervised Embeddings {type} Explained Variance') - # plt.xlabel('Number of Components') - # plt.ylabel('Variance (%)') - # - # for lang in self.lang_U.keys(): - # pca = PCA(n_components=self.lang_U[lang].shape[1]) - # pca.fit(self.lang_U[lang]) - # _r = pca.explained_variance_ratio_ - # _r = np.cumsum(_r) - # plt.plot(_r, label=lang) - # for i in range(len(_r) - 1, 1, -1): - # delta = _r[i] - _r[i - 1] - # if delta > 0: - # _idx.append(i) - # break - # best_n = max(_idx) - # plt.axvline(best_n, color='r', label='optimal N') - # plt.legend() - # plt.show() - # - # for lang in self.lang_U.keys(): - # pca = PCA(n_components=best_n) - # self.lang_U[lang] = pca.fit_transform(self.lang_U[lang]) - # return - diff --git a/src/data/supervised.py b/src/data/supervised.py index bbd8c37..d2d7aab 100755 --- a/src/data/supervised.py +++ b/src/data/supervised.py @@ -1,7 +1,7 @@ from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square -from sklearn.decomposition import PCA -from sklearn.manifold import TSNE import numpy as np +# from sklearn.decomposition import PCA +# from sklearn.manifold import TSNE def zscores(x, axis=0): #scipy.stats.zscores does not avoid division by 0, which can indeed occur diff --git a/src/learning/learners.py b/src/learning/learners.py index c4c69fd..96e200c 100644 --- a/src/learning/learners.py +++ b/src/learning/learners.py @@ -8,7 +8,7 @@ from sklearn.model_selection import KFold from joblib import Parallel, delayed from sklearn.feature_extraction.text import TfidfVectorizer from transformers.StandardizeTransformer import StandardizeTransformer -from sklearn.decomposition import PCA +# from sklearn.decomposition import PCA def _sort_if_sparse(X):
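
Review note: after PATCH 10 the embedding pipeline sits behind a two-call API, StorageEmbeddings.fit/predict. A hedged end-to-end sketch of the call pattern (lXtr, lytr and vocs follow the inputs of AndreaCLF.fit; we_path and the config values are illustrative). Two caveats for this state of the code: _add_emebeddings_supervised still reads self.lang_S[lang].shape[1] before that entry is assigned, and get_optimal_dim's 'delta > 0' test holds almost everywhere on a monotone cumulative curve, so best_n tends to collapse to the full dimensionality; a small positive threshold on delta, or a variance target as sketched earlier, behaves better.

from data.embeddings import StorageEmbeddings

config = {'unsupervised': True, 'supervised': True, 'we_type': 'MUSE',
          'reduction': 'PCA', 'max_label_space': 300, 'dim_reduction_unsupervised': 300}
embedder = StorageEmbeddings(we_path).fit(config, lXtr, vocs, lytr)  # builds U and S per language
lE_tr = embedder.predict(config, lXtr)  # lang -> hstack(X.dot(U), X.dot(S)), tf-idf weighted sums
lE_te = embedder.predict(config, lXte)  # reuses the matrices fitted on the training vocabulary
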