From f2083bf22a734df8390e37bce7df7b782012c5da Mon Sep 17 00:00:00 2001
From: andrea
Date: Sat, 30 Nov 2019 19:14:51 +0100
Subject: [PATCH 01/10] reworked the unsupervised (aligned) embeddings loader
 method and class (fastText and MUSE); new option arg -t ['MUSE', 'FastText'];
 uploaded /results/results.csv (on rcv1 ... run0.pickle), obtained on all
 available setups. TODO: refactor it also as a standalone class with its own
 load/weighted-sum/extract/reduce methods.

---
 src/FPEC_andrea.py                         | 29 +++++++----
 src/data/embeddings.py                     | 32 ++++++++++--
 src/learning/learners.py                   | 14 ++---
 src/results/results.csv                    | 60 +++++++++++++++++++---
 src/transformers/StandardizeTransformer.py |  2 +-
 5 files changed, 107 insertions(+), 30 deletions(-)

diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py
index 4decdf6..7092d2b 100644
--- a/src/FPEC_andrea.py
+++ b/src/FPEC_andrea.py
@@ -17,11 +17,14 @@
 parser.add_option("-o", "--output", dest="output",
                   help="Result file", type=str, default='./results/results.csv')

 parser.add_option("-e", "--mode-embed", dest="mode_embed",
-                  help="Set the embedding to be used [none, pretrained, supervised, both]", type=str, default='none')
+                  help="Set the embedding to be used [none, unsupervised, supervised, both]", type=str, default='none')

 parser.add_option("-w", "--we-path", dest="we_path",
                   help="Path to the polylingual word embeddings", default='../embeddings/')

+parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str,
+                  default='FastText')
+
 parser.add_option("-s", "--set_c", dest="set_c",type=float,
                   help="Set the C parameter", default=1)

@@ -36,7 +39,7 @@
 def get_learner(calibrate=False, kernel='linear'):
     return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced')

-def get_params(dense=False):  # TODO kernel function could be useful for meta-classifier
+def get_params(dense=False):
     if not op.optimc:
         return None
     c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
@@ -72,30 +75,36 @@ if __name__ == '__main__':
     # Embeddings and WCE config
     _available_mode = ['none', 'unsupervised', 'supervised', 'both']
-    assert op.mode_embed in _available_mode , f'{op.mode_embed} not in {_available_mode}'
+    _available_type = ['MUSE', 'FastText']
+    assert op.mode_embed in _available_mode, f'{op.mode_embed} not in {_available_mode}'
+    assert op.we_type in _available_type, f'{op.we_type} not in {_available_type}'

     if op.mode_embed == 'none':
         config = {'unsupervised': False,
-                  'supervised': False}
+                  'supervised': False,
+                  'we_type': None}
         _config_id = 'None'
     elif op.mode_embed == 'unsupervised':
         config = {'unsupervised': True,
-                  'supervised': False}
+                  'supervised': False,
+                  'we_type': op.we_type}
         _config_id = 'M'
     elif op.mode_embed == 'supervised':
         config = {'unsupervised': False,
-                  'supervised': True}
+                  'supervised': True,
+                  'we_type': None}
         _config_id = 'F'
     elif op.mode_embed == 'both':
         config = {'unsupervised': True,
-                  'supervised': True}
+                  'supervised': True,
+                  'we_type': op.we_type}
         _config_id = 'M_and_F'

     result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
     print(f'### PolyEmbedd_andrea_{_config_id}\n')

-    classifier = AndreaCLF(op.we_path,
-                           config,
+    classifier = AndreaCLF(we_path=op.we_path,
+                           config=config,
                            first_tier_learner=get_learner(calibrate=True),
                            meta_learner=get_learner(calibrate=False, kernel='rbf'),
                            first_tier_parameters=get_params(dense=False),
@@ -114,5 +123,5 @@ if __name__ == '__main__':
         metrics.append([macrof1, microf1, macrok, microk])
         print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
         results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, op.optimc, op.dataset.split('/')[-1],
-                        'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, 'nope')
+                        'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, '')
     print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
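The essential operation this patch wires in is mapping each language's documents into the
aligned embedding space by a tf-idf-weighted sum of word vectors. A minimal sketch of that
operation (shapes and variable names here are illustrative, not the repository's exact code):

    import numpy as np
    from scipy.sparse import csr_matrix

    # X: tf-idf document-term matrix (3 documents, 5-term vocabulary)
    X = csr_matrix(np.random.rand(3, 5))
    # P: aligned word-embedding matrix, one row per vocabulary term (dim=4)
    P = np.random.rand(5, 4)
    # each document vector is the tf-idf-weighted sum of its words' embeddings
    doc_embeddings = X.dot(P)   # shape (3, 4)

Because the embeddings of the different languages live in one shared (aligned) space, these
per-language document vectors are directly comparable across languages.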
diff --git a/src/data/embeddings.py b/src/data/embeddings.py
index 0a7aa4c..0598feb 100644
--- a/src/data/embeddings.py
+++ b/src/data/embeddings.py
@@ -147,7 +147,7 @@ class FastTextWikiNews(Vectors):
     url_base = 'Cant auto-download MUSE embeddings'
     path = '/storage/andrea/FUNNELING/embeddings/wiki.multi.{}.vec'
-    _name = 'wiki.multi.{}.vec'
+    _name = '/embeddings/wiki.multi.{}.vec'

     def __init__(self, cache, language="en", **kwargs):
         url = self.url_base.format(language)
@@ -157,6 +157,30 @@ class FastTextWikiNews(Vectors):
         super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)

+class EmbeddingsAligned(Vectors):
+
+    def __init__(self, type, path, lang):
+
+        self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec'
+        # todo - rewrite as relative path
+        self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT'
+        self.path = path + self.name.format(lang)
+        assert os.path.exists(path), f'pre-trained vectors not found in {path}'
+        super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path)
+
+    def vocabulary(self):
+        return set(self.stoi.keys())
+
+    def dim(self):
+        return self.dim
+
+    def extract(self, words):
+        source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi)
+        extraction = torch.zeros((len(words), self.dim))
+        extraction[source_idx] = self.vectors[target_idx]
+        return extraction
+
+
 class FastTextMUSE(PretrainedEmbeddings):

     def __init__(self, path, lang, limit=None):
@@ -179,12 +203,12 @@ class FastTextMUSE(PretrainedEmbeddings):
         return extraction

-def embedding_matrix(path, voc, lang):
+def embedding_matrix(type, path, voc, lang):
     vocabulary = np.asarray(list(zip(*sorted(voc.items(), key=lambda x:x[1])))[0])

     print('[embedding matrix]')
-    print(f'# [pretrained-matrix: FastTextMUSE {lang}]')
-    pretrained = FastTextMUSE(path, lang)
+    print(f'# [pretrained-matrix: {type} {lang}]')
+    pretrained = EmbeddingsAligned(type, path, lang)
     P = pretrained.extract(vocabulary).numpy()
     del pretrained
     print(f'[embedding matrix done] of shape={P.shape}\n')
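The extract method above relies on PretrainedEmbeddings.reindex, which is not shown in this
hunk. A minimal sketch of the behaviour extract presumes (an assumption inferred from how it
is used, not the repository's code): rows for out-of-vocabulary words are simply left at zero.

    import numpy as np

    def reindex(words, stoi):
        # positions in `words` covered by the pretrained vocabulary (source),
        # and the matching row numbers in the pretrained matrix (target)
        pairs = [(i, stoi[w]) for i, w in enumerate(words) if w in stoi]
        source_idx, target_idx = map(np.asarray, zip(*pairs))
        return source_idx, target_idx

    # extract() then scatters only the covered rows:
    #   extraction = torch.zeros((len(words), dim))
    #   extraction[source_idx] = vectors[target_idx]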
diff --git a/src/learning/learners.py b/src/learning/learners.py
index 5a8f07e..d01c734 100644
--- a/src/learning/learners.py
+++ b/src/learning/learners.py
@@ -7,8 +7,6 @@
 from sklearn.model_selection import GridSearchCV
 from sklearn.model_selection import KFold
 from joblib import Parallel, delayed
 from sklearn.feature_extraction.text import TfidfVectorizer
-
-from data.supervised import zscores
 from transformers.StandardizeTransformer import StandardizeTransformer
@@ -444,7 +442,8 @@ class AndreaCLF(FunnellingPolylingualClassifier):
                  first_tier_parameters=None,
                  meta_parameters=None,
                  folded_projections=1,
-                 calmode='cal', n_jobs=-1):
+                 calmode='cal',
+                 n_jobs=-1):

         super().__init__(first_tier_learner,
                          meta_learner,
@@ -479,9 +478,8 @@ class AndreaCLF(FunnellingPolylingualClassifier):
             self.languages.append(lang)
             tfidf_vectorizer.fit(lX[lang])
             lX[lang] = tfidf_vectorizer.transform(lX[lang])
-            _sort_if_sparse(lX[lang])
             self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
-            self.lang_tfidf[lang] = tfidf_vectorizer  # useful at testing time
+            self.lang_tfidf[lang] = tfidf_vectorizer
         return self

     # @override std class method
@@ -517,15 +515,13 @@ class AndreaCLF(FunnellingPolylingualClassifier):
         if unsupervised:
             for lang in languages:
-                # print('Test building embedding matrix FastTextMuse ...')
-                _, M = embedding_matrix(self.we_path, self.lang_word2idx[lang], lang)
+                _, M = embedding_matrix(self.config['we_type'], self.we_path, self.lang_word2idx[lang], lang)
                 self.word_embeddings[lang] = M
                 _r[lang] = lX[lang].dot(M)

         if supervised:
             for lang in languages:
                 S = WCE_matrix(lX, ly, lang)
-                # S = np.squeeze(np.asarray(S))  # casting to ndarray to better visualize S while debugging
                 self.supervised_embeddings[lang] = S
                 if unsupervised:
                     _r[lang] = np.hstack((_r[lang], lX[lang].dot(S)))
@@ -562,7 +558,7 @@
         _vertical_Zy = np.vstack([zy[lang] for lang in self.languages])

         self.standardizer = StandardizeTransformer()
-        _vertical_Z = self.standardizer.fit_predict(_vertical_Z)
+        _vertical_Z = self.standardizer.fit_predict(_vertical_Z)

         print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
         self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
diff --git a/src/results/results.csv b/src/results/results.csv
index 783225c..dbef7b3 100644
--- a/src/results/results.csv
+++ b/src/results/results.csv
@@ -1,7 +1,55 @@
 id method learner embed optimp dataset binary languages time lang macrof1 microf1 macrok microk notes
-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 da 0.7933333333333333 0.0 0.7933333333333333 0.0 nope
-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 en 0.7866666666666666 0.0 0.7927111111111111 -0.0003376325207643527 nope
-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 fr 0.7866666666666666 0.0 0.7930666666666667 -0.0001350530083057411 nope
-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 da 0.7933333333333333 0.0 0.7933333333333333 0.0 nope
-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 en 0.7933333333333333 0.0 0.7931111111111111 -0.00013505300830574107 nope
-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 fr 0.7933333333333333 0.0 0.7933333333333333 0.0 nope
+rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 it 0.5367684112761455 0.7945344129554656 0.5179685773363333 0.7651326488894972 nope
+rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 pt 0.6969974938193201 0.878625134264232 0.6967392557377021 0.8466030321042095 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 sv 0.502213941379271 0.7700107543401444 0.4991078326315248 0.7207899075774371 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 es 0.5817849682843411 0.8448214916931778 0.5849433134898768 0.8202407220651875 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 en 0.5284100314545743 0.7625649913344887 0.4968119038332687 0.7152142337789349 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 da 0.4868904596668941 0.7971705872676427 0.4554442856126113 0.741227149968307 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 nl 0.5470546398570723 0.8276762402088773 0.5177281560038681 0.7850292121533595 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 fr 0.4997574965766772 0.7678434382194935 0.4836027981945328 0.7099957841328215 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 de 0.4220457399934653 0.7444316119452236 0.4256936056238835 0.7167749374918141 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 it 0.5398437760931379 0.8008933172994331 0.5146465197929204 0.7584451610463148 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 pt 0.6975279233747671 0.8779959377115775 0.6911573032014029 0.8392738059784555 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 sv 0.5179339368901748 0.7752035065748278 0.4962165022301373 0.7133720895906155 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 es 0.5745246656272296 
0.8476464247215235 0.5736797442258523 0.8104027280076678 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 en 0.5265892627601801 0.761854398025736 0.4868823643967914 0.7032312369952987 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 da 0.4857267508065667 0.7955911823647295 0.449682467737542 0.7293013090493592 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 nl 0.5461000743929812 0.8304711580801409 0.5139887576564601 0.7790659402231745 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 fr 0.5015991524998897 0.7699748500677114 0.4811739320459739 0.7065159928392686 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 de 0.4141396160516795 0.743810005053057 0.4126132681585116 0.7023983497130937 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 it 0.4810224709403544 0.7617194410047762 0.453310215598049 0.6999032557458222 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 pt 0.6693663195289151 0.8619702956806105 0.6657298472047529 0.8182397742327547 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 sv 0.43107388787211537 0.7126933954416902 0.4180735239763325 0.6168407376537499 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 es 0.5087201120140917 0.8249322493224932 0.5032299168859704 0.7835086748116167 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 en 0.3822498549987095 0.6877811094452774 0.3309945723997902 0.5962925522774631 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 da 0.4517051377915163 0.7658914728682171 0.4030339299921389 0.6806166833916132 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F 
True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 nl 0.4875303727964308 0.7853962600178095 0.4534046979963794 0.7270844266398626 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 fr 0.3750315407356979 0.6999393816932714 0.3628389019101708 0.6136670285424017 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 de 0.355059356514748 0.7046466085098807 0.33834564366266284 0.6299245108196094 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 it 0.4755443069888554 0.7675079985780305 0.4501140447119437 0.7023435117413848 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 pt 0.673303227450142 0.8655002733734279 0.6702445967772233 0.8193963705153853 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 sv 0.4189470089118392 0.7236711786068009 0.4198491651634073 0.6314272037990425 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 es 0.5178080058189616 0.8268359020852222 0.5104336022388637 0.782714898784318 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 en 0.4115752894185112 0.7001869158878504 0.35164720517285003 0.6091191993104883 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 da 0.4437869429842064 0.7626499739175796 0.39704879178312197 0.6717100410826179 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 nl 0.47635948919429705 0.7874471399955486 0.4589309165206792 0.7292337019755739 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 fr 0.39374621795002507 0.7063947733122155 0.3850407928528449 0.6315594797194366 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True 
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 de 0.3539890425069821 0.7095981751184418 0.3512802070446796 0.6432196317592322 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 it 0.5791455159341481 0.8060849214309596 0.6034752340075125 0.7869853576681214 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 pt 0.6403974389994276 0.8803876562101505 0.6565213830246649 0.8497743924811387 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 sv 0.5032337014290953 0.7768595041322314 0.4719549200388494 0.7364733997369779 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 es 0.5200567247634353 0.8529964145466963 0.4908726477090496 0.8285929531854332 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 en 0.512424485488998 0.7533647963642719 0.4719843960571978 0.7044441169169227 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 da 0.5861231569852233 0.8040595842200032 0.5393761149602847 0.7381233055764151 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 nl 0.6072184716496147 0.8335123523093448 0.5845309357041368 0.8020267337813639 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 fr 0.4923294612439038 0.7854697603651578 0.4713782273939219 0.7329001302478475 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 de 0.4709904181031267 0.7457793804294378 0.4465581491449931 0.7046844416244138 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 it 0.575387626645539 0.8064243448858833 0.5958411838194531 0.7790018114269683 nope +rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 pt 
0.653004040098633 0.8791937747161628 0.6559210761775208 0.8482450061614855 nope
+rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 sv 0.49944915222086167 0.7789179104477612 0.4604673876743342 0.727778938054739 nope
+rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 es 0.5144474487169811 0.8559087767795439 0.48397711649967695 0.8222692824953204 nope
+rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 en 0.5160737755179508 0.755674709562109 0.45961112517260677 0.6921096138985132 nope
+rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 da 0.5875776383868945 0.8015873015873016 0.5367286265015276 0.7288571047461061 nope
+rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 nl 0.6079883230969934 0.8363004776378636 0.5828217771858487 0.7968282071156207 nope
+rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 fr 0.4966338770370634 0.7860696517412935 0.46250527724325174 0.7292650668002159 nope
+rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 de 0.4675732669000923 0.7479187479187479 0.43767984457683634 0.69653035770654 nope
diff --git a/src/transformers/StandardizeTransformer.py b/src/transformers/StandardizeTransformer.py
index 381d6c1..45921b7 100644
--- a/src/transformers/StandardizeTransformer.py
+++ b/src/transformers/StandardizeTransformer.py
@@ -20,4 +20,4 @@ class StandardizeTransformer:
         return (X - self.mean) / self.std

     def fit_predict(self, X):
-        return self.fit(X).predict(X)
\ No newline at end of file
+        return self.fit(X).predict(X)

From fedc83f84e171f7759720847ca7d0caf8662c6e7 Mon Sep 17 00:00:00 2001
From: andrea
Date: Sat, 30 Nov 2019 19:22:48 +0100
Subject: [PATCH 02/10] added col 'embed_type' to csv results

---
 src/FPEC_andrea.py  | 2 +-
 src/util/results.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py
index 7092d2b..f8edfad 100644
--- a/src/FPEC_andrea.py
+++ b/src/FPEC_andrea.py
@@ -122,6 +122,6 @@ if __name__ == '__main__':
         macrof1, microf1, macrok, microk = l_eval[lang]
         metrics.append([macrof1, microf1, macrok, microk])
         print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
-        results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, op.optimc, op.dataset.split('/')[-1],
+        results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, config['we_type'], op.optimc, op.dataset.split('/')[-1],
                         'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, '')
     print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
diff --git a/src/util/results.py b/src/util/results.py
index 43529b4..22e8021 100644
--- a/src/util/results.py
+++ b/src/util/results.py
@@ -5,7 +5,7 @@ import numpy as np
 class PolylingualClassificationResults:
     def __init__(self, file, autoflush=True, verbose=False):
         self.file = file
-        self.columns = ['id', 'method', 'learner', 'embed', 'optimp', 'dataset', 'binary', 'languages', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
+        self.columns = ['id', 'method', 'learner', 'embed', 'embed_type', 'optimp', 'dataset', 'binary', 'languages', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
         self.autoflush = autoflush
         self.verbose = verbose
         if os.path.exists(file):
@@ -20,8 +20,8 @@ class PolylingualClassificationResults:
     def already_calculated(self, id):
         return (self.df['id'] == id).any()

-    def add_row(self, id, method, learner, embed, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
-        s = pd.Series([id, method, learner, embed, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
+    def add_row(self, id, method, learner, embed, embed_type, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
+        s = pd.Series([id, method, learner, embed, embed_type, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
         self.df = self.df.append(s, ignore_index=True)
         if self.autoflush: self.flush()
         self.tell(s.to_string())
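A hedged usage sketch of the extended logger (the argument values below are illustrative
placeholders, not logged results; positional order follows the new signature: id, method,
learner, embed, embed_type, optimp, dataset, binary, ablation_lang, time, lang, macrof1,
microf1, with macrok/microk/notes optional):

    from util.results import PolylingualClassificationResults

    results = PolylingualClassificationResults('./results/results.csv')
    results.add_row('some_run_id', 'PolyEmbed_andrea', 'svm', 'M', 'MUSE', True,
                    'rcv1-2_run0.pickle', 'not_binary', 'not_ablation',
                    123.4, 'en', 0.52, 0.76)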
From 414e7f151eed6590b20c18035f465692f6be907c Mon Sep 17 00:00:00 2001
From: Andrea Pedrotti
Date: Sat, 30 Nov 2019 19:23:39 +0100
Subject: [PATCH 03/10] Delete results.csv

---
 src/results/results.csv | 55 -----------------------------------------
 1 file changed, 55 deletions(-)
 delete mode 100644 src/results/results.csv

diff --git a/src/results/results.csv b/src/results/results.csv
deleted file mode 100644
index dbef7b3..0000000
--- a/src/results/results.csv
+++ /dev/null
@@ -1,55 +0,0 @@
-id method learner embed optimp dataset binary languages time lang macrof1 microf1 macrok microk notes
-rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 it 0.5367684112761455 0.7945344129554656 0.5179685773363333 0.7651326488894972 nope
-rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 pt 0.6969974938193201 0.878625134264232 0.6967392557377021 0.8466030321042095 nope
-rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 sv 0.502213941379271 0.7700107543401444 0.4991078326315248 0.7207899075774371 nope
-rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 es 0.5817849682843411 0.8448214916931778
0.5849433134898768 0.8202407220651875 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 en 0.5284100314545743 0.7625649913344887 0.4968119038332687 0.7152142337789349 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 da 0.4868904596668941 0.7971705872676427 0.4554442856126113 0.741227149968307 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 nl 0.5470546398570723 0.8276762402088773 0.5177281560038681 0.7850292121533595 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 fr 0.4997574965766772 0.7678434382194935 0.4836027981945328 0.7099957841328215 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 de 0.4220457399934653 0.7444316119452236 0.4256936056238835 0.7167749374918141 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 it 0.5398437760931379 0.8008933172994331 0.5146465197929204 0.7584451610463148 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 pt 0.6975279233747671 0.8779959377115775 0.6911573032014029 0.8392738059784555 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 sv 0.5179339368901748 0.7752035065748278 0.4962165022301373 0.7133720895906155 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 es 0.5745246656272296 0.8476464247215235 0.5736797442258523 0.8104027280076678 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 en 0.5265892627601801 0.761854398025736 0.4868823643967914 0.7032312369952987 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 da 0.4857267508065667 0.7955911823647295 0.449682467737542 0.7293013090493592 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC 
PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 nl 0.5461000743929812 0.8304711580801409 0.5139887576564601 0.7790659402231745 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 fr 0.5015991524998897 0.7699748500677114 0.4811739320459739 0.7065159928392686 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 de 0.4141396160516795 0.743810005053057 0.4126132681585116 0.7023983497130937 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 it 0.4810224709403544 0.7617194410047762 0.453310215598049 0.6999032557458222 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 pt 0.6693663195289151 0.8619702956806105 0.6657298472047529 0.8182397742327547 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 sv 0.43107388787211537 0.7126933954416902 0.4180735239763325 0.6168407376537499 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 es 0.5087201120140917 0.8249322493224932 0.5032299168859704 0.7835086748116167 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 en 0.3822498549987095 0.6877811094452774 0.3309945723997902 0.5962925522774631 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 da 0.4517051377915163 0.7658914728682171 0.4030339299921389 0.6806166833916132 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 nl 0.4875303727964308 0.7853962600178095 0.4534046979963794 0.7270844266398626 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 fr 0.3750315407356979 0.6999393816932714 0.3628389019101708 0.6136670285424017 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 de 0.355059356514748 0.7046466085098807 
0.33834564366266284 0.6299245108196094 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 it 0.4755443069888554 0.7675079985780305 0.4501140447119437 0.7023435117413848 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 pt 0.673303227450142 0.8655002733734279 0.6702445967772233 0.8193963705153853 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 sv 0.4189470089118392 0.7236711786068009 0.4198491651634073 0.6314272037990425 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 es 0.5178080058189616 0.8268359020852222 0.5104336022388637 0.782714898784318 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 en 0.4115752894185112 0.7001869158878504 0.35164720517285003 0.6091191993104883 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 da 0.4437869429842064 0.7626499739175796 0.39704879178312197 0.6717100410826179 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 nl 0.47635948919429705 0.7874471399955486 0.4589309165206792 0.7292337019755739 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 fr 0.39374621795002507 0.7063947733122155 0.3850407928528449 0.6315594797194366 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 de 0.3539890425069821 0.7095981751184418 0.3512802070446796 0.6432196317592322 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 it 0.5791455159341481 0.8060849214309596 0.6034752340075125 0.7869853576681214 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 pt 0.6403974389994276 0.8803876562101505 0.6565213830246649 0.8497743924811387 nope 
-rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 sv 0.5032337014290953 0.7768595041322314 0.4719549200388494 0.7364733997369779 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 es 0.5200567247634353 0.8529964145466963 0.4908726477090496 0.8285929531854332 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 en 0.512424485488998 0.7533647963642719 0.4719843960571978 0.7044441169169227 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 da 0.5861231569852233 0.8040595842200032 0.5393761149602847 0.7381233055764151 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 nl 0.6072184716496147 0.8335123523093448 0.5845309357041368 0.8020267337813639 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 fr 0.4923294612439038 0.7854697603651578 0.4713782273939219 0.7329001302478475 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 de 0.4709904181031267 0.7457793804294378 0.4465581491449931 0.7046844416244138 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 it 0.575387626645539 0.8064243448858833 0.5958411838194531 0.7790018114269683 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 pt 0.653004040098633 0.8791937747161628 0.6559210761775208 0.8482450061614855 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 sv 0.49944915222086167 0.7789179104477612 0.4604673876743342 0.727778938054739 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 es 0.5144474487169811 0.8559087767795439 0.48397711649967695 0.8222692824953204 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True 
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 en 0.5160737755179508 0.755674709562109 0.45961112517260677 0.6921096138985132 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 da 0.5875776383868945 0.8015873015873016 0.5367286265015276 0.7288571047461061 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 nl 0.6079883230969934 0.8363004776378636 0.5828217771858487 0.7968282071156207 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 fr 0.4966338770370634 0.7860696517412935 0.46250527724325174 0.7292650668002159 nope -rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 de 0.4675732669000923 0.7479187479187479 0.43767984457683634 0.69653035770654 nope From e9404e2b8daaa6996ca9255af3de819aa0168008 Mon Sep 17 00:00:00 2001 From: andrea Date: Mon, 2 Dec 2019 12:40:39 +0100 Subject: [PATCH 04/10] mask_numbers method --- src/dataset_builder.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/dataset_builder.py b/src/dataset_builder.py index 3f6732c..9af7b3f 100644 --- a/src/dataset_builder.py +++ b/src/dataset_builder.py @@ -11,6 +11,8 @@ import numpy as np from sklearn.model_selection import train_test_split from scipy.sparse import issparse import itertools +from tqdm import tqdm +import re class MultilingualDataset: @@ -73,10 +75,14 @@ class MultilingualDataset: return self.lXte(), self.lYte() def lXtr(self): - return {lang:Xtr for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()} + return {lang: Xtr for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items() if + lang in self.langs()} + # return {lang:self.mask_numbers(Xtr) for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()} def lXte(self): - return {lang:Xte for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()} + return {lang: Xte for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if + lang in self.langs()} + # return {lang:self.mask_numbers(Xte) for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()} def lYtr(self): return {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()} @@ -129,6 +135,13 @@ class MultilingualDataset: def set_labels(self, labels): self.labels = labels + def mask_numbers(self, data, number_mask='numbermask'): + mask = re.compile(r'\b[0-9][0-9.,-]*\b') + masked = [] + for text in tqdm(data, desc='masking numbers'): + masked.append(mask.sub(number_mask, text)) + return masked + # ---------------------------------------------------------------------------------------------------------------------- # Helpers From 4de6b3e2505fa14c4dfdc15d42c7df5e2e0a27e3 Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 3 Dec 2019 15:34:12 +0100 Subject: [PATCH 05/10] 
refactored the embed method into class StorageEmbeddings; refactored class
 EmbeddingsAligned; added tSVD and t-SNE reduction for supervised embeddings

---
 src/FPEC_andrea.py                         |  14 ++-
 src/data/embeddings.py                     | 101 ++++++++++++++++---
 src/data/supervised.py                     |  25 +++--
 src/learning/learners.py                   | 110 ++++++++++-----------
 src/transformers/StandardizeTransformer.py |   2 +-
 5 files changed, 170 insertions(+), 82 deletions(-)

diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py
index f8edfad..9be7c42 100644
--- a/src/FPEC_andrea.py
+++ b/src/FPEC_andrea.py
@@ -11,7 +11,8 @@ from sklearn.svm import SVC
 parser = OptionParser()

 parser.add_option("-d", "--dataset", dest="dataset",
-                  help="Path to the multilingual dataset processed and stored in .pickle format")
+                  help="Path to the multilingual dataset processed and stored in .pickle format",
+                  default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")

 parser.add_option("-o", "--output", dest="output",
                   help="Result file", type=str, default='./results/results.csv')
@@ -23,7 +24,7 @@
 parser.add_option("-w", "--we-path", dest="we_path",
                   help="Path to the polylingual word embeddings", default='../embeddings/')

 parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str,
-                  default='FastText')
+                  default='MUSE')

 parser.add_option("-s", "--set_c", dest="set_c",type=float,
                   help="Set the C parameter", default=1)
@@ -36,7 +37,7 @@ parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,

 def get_learner(calibrate=False, kernel='linear'):
-    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced')
+    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto')

 def get_params(dense=False):
@@ -64,6 +65,7 @@ if __name__ == '__main__':
     data.show_dimensions()

     # data.set_view(languages=['en','it'], categories=list(range(10)))
+    # data.set_view(languages=['en','it'])
     lXtr, lytr = data.training()
     lXte, lyte = data.test()
@@ -100,6 +102,10 @@ if __name__ == '__main__':
                   'we_type': op.we_type}
         _config_id = 'M_and_F'

+    ##### TODO - config dict is redundant - we already have the op argparse options ...
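# A hedged aside (an editorial sketch, not part of the patch): the two keys set
# below choose how the supervised (vocabulary x categories) matrix is compressed
# when the number of categories exceeds max_label_space. With scikit-learn, the
# 'tSVD' choice amounts to:
#
#     from sklearn.decomposition import TruncatedSVD
#     F = TruncatedSVD(n_components=50).fit_transform(F)   # (|V|, C) -> (|V|, 50)
#
# Note that scikit-learn's TSNE only supports n_components < 4 with the default
# barnes_hut method, so a t-SNE reduction to 50 components would require
# method='exact'.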
+ config['reduction'] = 'tSVD' + config['max_label_space'] = 50 + result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '') print(f'### PolyEmbedd_andrea_{_config_id}\n') @@ -114,7 +120,7 @@ if __name__ == '__main__': print('# Fitting ...') classifier.fit(lXtr, lytr) - print('# Evaluating ...') + print('\n# Evaluating ...') l_eval = evaluate_method(classifier, lXte, lyte) metrics = [] diff --git a/src/data/embeddings.py b/src/data/embeddings.py index 0598feb..66a14d0 100644 --- a/src/data/embeddings.py +++ b/src/data/embeddings.py @@ -5,6 +5,7 @@ from torchtext.vocab import Vectors import torch from abc import ABC, abstractmethod from data.supervised import get_supervised_embeddings +from sklearn.decomposition import PCA class PretrainedEmbeddings(ABC): @@ -157,16 +158,41 @@ class FastTextWikiNews(Vectors): super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs) +# class EmbeddingsAligned(Vectors): +# +# def __init__(self, type, path, lang): +# +# self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec' +# # todo - rewrite as relative path +# self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT' +# self.path = path + self.name.format(lang) +# assert os.path.exists(path), f'pre-trained vectors not found in {path}' +# super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path) +# # self.vectors = self.extract(voc) +# +# def vocabulary(self): +# return set(self.stoi.keys()) +# +# def dim(self): +# return self.dim +# +# def extract(self, words): +# source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi) +# extraction = torch.zeros((len(words), self.dim)) +# extraction[source_idx] = self.vectors[target_idx] +# return extraction + + class EmbeddingsAligned(Vectors): - def __init__(self, type, path, lang): - - self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec' + def __init__(self, type, path, lang, voc): # todo - rewrite as relative path + self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec' self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT' self.path = path + self.name.format(lang) assert os.path.exists(path), f'pre-trained vectors not found in {path}' super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path) + self.vectors = self.extract(voc) def vocabulary(self): return set(self.stoi.keys()) @@ -203,20 +229,69 @@ class FastTextMUSE(PretrainedEmbeddings): return extraction -def embedding_matrix(type, path, voc, lang): - vocabulary = np.asarray(list(zip(*sorted(voc.items(), key=lambda x:x[1])))[0]) +class StorageEmbeddings: + def __init__(self, path): + self.path = path + self.lang_U = dict() + self.lang_S = dict() - print('[embedding matrix]') - print(f'# [pretrained-matrix: {type} {lang}]') - pretrained = EmbeddingsAligned(type, path, lang) - P = pretrained.extract(vocabulary).numpy() - del pretrained - print(f'[embedding matrix done] of shape={P.shape}\n') + def _add_embeddings_unsupervised(self, type, docs, vocs): + for lang in docs.keys(): + print(f'# [unsupervised-matrix {type}] for {lang}') + voc = np.asarray(list(zip(*sorted(vocs[lang].items(), key=lambda x: x[1])))[0]) + self.lang_U[lang] = EmbeddingsAligned(type, self.path, lang, voc).vectors + print(f'Matrix U (weighted 
sum) of shape {self.lang_U[lang].shape}\n')
+        return
+
+    def _add_embeddings_supervised(self, docs, labels, reduction, max_label_space):
+        for lang in docs.keys():
+            print(f'# [supervised-matrix] for {lang}')
+            self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space)
+            print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')
+        return
+
+    def _concatenate_embeddings(self, docs):
+        _r = dict()
+        for lang in self.lang_U.keys():
+            _r[lang] = np.hstack((docs[lang].dot(self.lang_U[lang]), docs[lang].dot(self.lang_S[lang])))
+        return _r
+
+    def fit(self, config, docs, vocs, labels):
+        if config['unsupervised']:
+            self._add_embeddings_unsupervised(config['we_type'], docs, vocs)
+        if config['supervised']:
+            self._add_embeddings_supervised(docs, labels, config['reduction'], config['max_label_space'])
+        return self
+
+    def predict(self, config, docs):
+        if config['supervised'] and config['unsupervised']:
+            return self._concatenate_embeddings(docs)
+        elif config['supervised']:
+            _r = dict()
+            for lang in docs.keys():
+                _r[lang] = docs[lang].dot(self.lang_S[lang])
+        else:
+            _r = dict()
+            for lang in docs.keys():
+                _r[lang] = docs[lang].dot(self.lang_U[lang])
+        return _r


# def embedding_matrix(type, path, voc, lang):
#     vocabulary = np.asarray(list(zip(*sorted(voc.items(), key=lambda x: x[1])))[0])
#
#     print('[embedding matrix]')
#     print(f'# [pretrained-matrix: {type} {lang}]')
#     pretrained = EmbeddingsAligned(type, path, lang)
#     P = pretrained.extract(vocabulary).numpy()
#     del pretrained
#     print(f'[embedding matrix done] of shape={P.shape}\n')
#
#     return vocabulary, P


-def WCE_matrix(Xtr, Ytr, lang):
+def WCE_matrix(Xtr, Ytr, lang, reduction=None, n_components=50):
     print('\n# [supervised-matrix]')
     S = get_supervised_embeddings(Xtr[lang], Ytr[lang])
     print(f'[embedding matrix done] of shape={S.shape}\n')
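The new StorageEmbeddings entry point, in a hedged usage sketch (the dictionaries below are
illustrative stand-ins for the per-language structures AndreaCLF builds; we_path is assumed
to point at the embeddings directory):

    # lX: {lang: tf-idf matrix}, vocs: {lang: word2idx}, ly: {lang: label matrix}
    config = {'unsupervised': True, 'supervised': True, 'we_type': 'MUSE',
              'reduction': 'tSVD', 'max_label_space': 50}
    embedding_space = StorageEmbeddings(we_path).fit(config, lX, vocs, ly)
    lZ = embedding_space.predict(config, lX)   # {lang: docs projected onto U and/or S}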
' + f'Applying PCA(n_components={max_label_space})') + pca = PCA(n_components=max_label_space) + F = pca.fit(F).transform(F) + elif reduction == 'TSNE': + print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' + f'Applying t-SNE(n_components={max_label_space})') + tsne = TSNE(n_components=max_label_space) + F = tsne.fit(F).fit_transform(F) + elif reduction == 'tSVD': + print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' + f'Applying truncatedSVD(n_components={max_label_space})') + tSVD = TruncatedSVD(n_components=max_label_space) + F = tSVD.fit(F).fit_transform(F) return F diff --git a/src/learning/learners.py b/src/learning/learners.py index d01c734..89bda7e 100644 --- a/src/learning/learners.py +++ b/src/learning/learners.py @@ -1,6 +1,6 @@ import numpy as np import time -from data.embeddings import WordEmbeddings, embedding_matrix, WCE_matrix +from data.embeddings import WordEmbeddings, WCE_matrix, StorageEmbeddings from scipy.sparse import issparse from sklearn.multiclass import OneVsRestClassifier from sklearn.model_selection import GridSearchCV @@ -458,8 +458,9 @@ class AndreaCLF(FunnellingPolylingualClassifier): self.lang_word2idx = dict() self.languages = [] self.lang_tfidf = {} - self.word_embeddings = {} - self.supervised_embeddings = {} + # self.word_embeddings = {} + # self.supervised_embeddings = {} + self.embedding_space = None self.model = None self.time = None @@ -492,42 +493,42 @@ class AndreaCLF(FunnellingPolylingualClassifier): return lZ, lYtr - def embed(self, lX, ly, unsupervised=False, supervised=False, prediction=False): - """ - build embedding matrix for given language and returns its weighted sum wrt tf-idf score - """ - _r = dict() - languages = list(lX.keys()) - - if prediction: - for lang in languages: - if unsupervised: # If unsupervised embeddings ... - M = self.word_embeddings[lang] - if supervised: # and also unsupervised --> get both (M) and (S) weighted sum matrices and hstack them - S = self.supervised_embeddings[lang] - _r[lang] = np.hstack((lX[lang].dot(M), lX[lang].dot(S))) - continue - _r[lang] = lX[lang].dot(M) # if not supervised --> just get weighted sum of unsupervised (M) embeddings - else: # If not unsupervised --> get (S) matrix and its weighted sum - S = self.supervised_embeddings[lang] - _r[lang] = lX[lang].dot(S) - return _r - - if unsupervised: - for lang in languages: - _, M = embedding_matrix(self.config['we_type'], self.we_path, self.lang_word2idx[lang], lang) - self.word_embeddings[lang] = M - _r[lang] = lX[lang].dot(M) - - if supervised: - for lang in languages: - S = WCE_matrix(lX, ly, lang) - self.supervised_embeddings[lang] = S - if unsupervised: - _r[lang] = np.hstack((_r[lang], lX[lang].dot(S))) - else: - _r[lang] = lX[lang].dot(S) - return _r + # def embed(self, lX, ly, unsupervised=False, supervised=False, prediction=False): + # """ + # build embedding matrix for given language and returns its weighted sum wrt tf-idf score + # """ + # _r = dict() + # languages = list(lX.keys()) + # + # if prediction: + # for lang in languages: + # if unsupervised: # If unsupervised embeddings ... 
+ # M = self.word_embeddings[lang] + # if supervised: # and also unsupervised --> get both (M) and (S) weighted sum matrices and hstack them + # S = self.supervised_embeddings[lang] + # _r[lang] = np.hstack((lX[lang].dot(M), lX[lang].dot(S))) + # continue + # _r[lang] = lX[lang].dot(M) # if not supervised --> just get weighted sum of unsupervised (M) embeddings + # else: # If not unsupervised --> get (S) matrix and its weighted sum + # S = self.supervised_embeddings[lang] + # _r[lang] = lX[lang].dot(S) + # return _r + # + # if unsupervised: + # for lang in languages: + # _, M = embedding_matrix(self.config['we_type'], self.we_path, self.lang_word2idx[lang], lang) + # self.word_embeddings[lang] = M + # _r[lang] = lX[lang].dot(M) + # + # if supervised: + # for lang in languages: + # S = WCE_matrix(lX, ly, lang) + # self.supervised_embeddings[lang] = S + # if unsupervised: + # _r[lang] = np.hstack((_r[lang], lX[lang].dot(S))) + # else: + # _r[lang] = lX[lang].dot(S) + # return _r # @override std class method def fit(self, lX, ly): @@ -541,17 +542,11 @@ class AndreaCLF(FunnellingPolylingualClassifier): Z, zy = self._get_zspace(lX, ly) if self.config['supervised'] or self.config['unsupervised']: - # Z vectors is concatenated with doc's embedding weighted sum - Z_embedded = dict() - l_weighted_em = self.embed(lX, ly, - unsupervised=self.config['unsupervised'], - supervised=self.config['supervised']) - - # stacking Z space horizontally with unsupervised (M) and/or supervised (F) embeddings - for lang in list(lX.keys()): - Z_embedded[lang] = np.hstack((Z[lang], l_weighted_em[lang])) - Z = Z_embedded - + self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly) + _embedding_space = self.embedding_space.predict(self.config, lX) + # h_stacking posterior probabilities with (U) and/or (S) matrices + for lang in self.languages: + Z[lang] = np.hstack((Z[lang], _embedding_space[lang])) # stacking Z space vertically _vertical_Z = np.vstack([Z[lang] for lang in self.languages]) @@ -573,14 +568,15 @@ class AndreaCLF(FunnellingPolylingualClassifier): lZ = self._projection(self.doc_projector, lX) if self.config['supervised'] or self.config['unsupervised']: - l_weighted_em = self.embed(lX, ly, - unsupervised=self.config['unsupervised'], - supervised=self.config['supervised'], - prediction=True) - Z_embedded = dict() + _embedding_space = self.embedding_space.predict(self.config, lX) + # l_weighted_em = self.embed(lX, ly, + # unsupervised=self.config['unsupervised'], + # supervised=self.config['supervised'], + # prediction=True) + # Z_embedded = dict() for lang in lX.keys(): - Z_embedded[lang] = np.hstack((lZ[lang], l_weighted_em[lang])) - lZ = Z_embedded + lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang])) + # lZ = Z_embedded for lang in lZ.keys(): print(lZ[lang].shape) diff --git a/src/transformers/StandardizeTransformer.py b/src/transformers/StandardizeTransformer.py index 45921b7..e776db7 100644 --- a/src/transformers/StandardizeTransformer.py +++ b/src/transformers/StandardizeTransformer.py @@ -12,7 +12,7 @@ class StandardizeTransformer: self.std = np.clip(std, 1e-5, None) self.mean = np.mean(X, axis=self.axis) self.yetfit=True - print('done') + print('done\n') return self def predict(self, X): From f074fd97f92da546cb72a9baa66593f067dd44b3 Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 3 Dec 2019 19:57:11 +0100 Subject: [PATCH 06/10] get_optimal_supervised_components method - to be polished --- src/FPEC_andrea.py | 4 ++-- src/data/embeddings.py | 44 
+++++++++++++++++++++------------------- src/data/supervised.py | 33 +++++++++++++++++++++++++----- src/learning/learners.py | 39 +---------------------------------- 4 files changed, 54 insertions(+), 66 deletions(-) diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py index 9be7c42..137e6cc 100644 --- a/src/FPEC_andrea.py +++ b/src/FPEC_andrea.py @@ -103,8 +103,8 @@ if __name__ == '__main__': _config_id = 'M_and_F' ##### TODO - config dict is redundant - we have already op argparse ... - config['reduction'] = 'tSVD' - config['max_label_space'] = 50 + config['reduction'] = 'PCA' + config['max_label_space'] = 'optimal' result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '') diff --git a/src/data/embeddings.py b/src/data/embeddings.py index 66a14d0..d1ad651 100644 --- a/src/data/embeddings.py +++ b/src/data/embeddings.py @@ -5,7 +5,6 @@ from torchtext.vocab import Vectors import torch from abc import ABC, abstractmethod from data.supervised import get_supervised_embeddings -from sklearn.decomposition import PCA class PretrainedEmbeddings(ABC): @@ -244,10 +243,16 @@ class StorageEmbeddings: return def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space): + _optimal = dict() + # TODO testing optimal max_label_space + if max_label_space == 'optimal': + print('Computing optimal number of PCA components ...') + optimal_n = self.get_optimal_supervised_components(docs, labels) + max_label_space = optimal_n + for lang in docs.keys(): print(f'# [supervised-matrix] for {lang}') - # should also pass max_label_space and reduction techniques - self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space) + self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, lang) print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n') return @@ -277,22 +282,19 @@ class StorageEmbeddings: _r[lang] = docs[lang].dot(self.lang_U[lang]) return _r + def get_optimal_supervised_components(self, docs, labels): + _idx = [] + for lang in docs.keys(): + _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space='optimal').tolist() -# def embedding_matrix(type, path, voc, lang): -# vocabulary = np.asarray(list(zip(*sorted(voc.items(), key=lambda x: x[1])))[0]) -# -# print('[embedding matrix]') -# print(f'# [pretrained-matrix: {type} {lang}]') -# pretrained = EmbeddingsAligned(type, path, lang) -# P = pretrained.extract(vocabulary).numpy() -# del pretrained -# print(f'[embedding matrix done] of shape={P.shape}\n') -# -# return vocabulary, P - - -def WCE_matrix(Xtr, Ytr, lang, reduction=None, n_components=50): - print('\n# [supervised-matrix]') - S = get_supervised_embeddings(Xtr[lang], Ytr[lang]) - print(f'[embedding matrix done] of shape={S.shape}\n') - return S + for i in range(len(_r)-1, 1, -1): + # todo: if n_components (therfore #n labels) is not big enough every value will be smaller than the next one ... 
+ ratio = _r[i] + next_ratio = _r[i-1] + delta = _r[i] - _r[i-1] + if delta > 0: + # if ratio < next_ratio: + _idx.append(i) + break + best_n = int(sum(_idx)/len(_idx)) + return best_n diff --git a/src/data/supervised.py b/src/data/supervised.py index b3c4fb9..f365dfd 100755 --- a/src/data/supervised.py +++ b/src/data/supervised.py @@ -40,8 +40,12 @@ def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents= return F -def get_supervised_embeddings(X, Y, reduction, max_label_space=300, binary_structural_problems=-1, method='dotn', dozscore=True): - print('computing supervised embeddings...') +def get_supervised_embeddings(X, Y, reduction, max_label_space=300, lang='None', binary_structural_problems=-1, method='dotn', dozscore=True): + if max_label_space == 'optimal': + max_label_space = 0 + + if max_label_space != 0: + print('computing supervised embeddings...') nC = Y.shape[1] if nC==2 and binary_structural_problems > nC: @@ -60,21 +64,40 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, binary_struc F = zscores(F, axis=0) if nC > max_label_space: + # TODO testing optimal max_label_space if reduction == 'PCA': + if max_label_space == 0: + pca = PCA(n_components=Y.shape[1]) + pca = pca.fit(F) + return pca.explained_variance_ratio_ + print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' f'Applying PCA(n_components={max_label_space})') pca = PCA(n_components=max_label_space) - F = pca.fit(F).transform(F) + pca = pca.fit(F) + + ######################################################## + import matplotlib.pyplot as plt + + plt.figure() + plt.plot(np.cumsum(pca.explained_variance_ratio_)) + plt.xlabel('Number of Components') + plt.ylabel('Variance (%)') # + plt.title(f'WCE Explained Variance {lang}') + plt.show() + ######################################################## + + F = pca.fit_transform(F) elif reduction == 'TSNE': print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' f'Applying t-SNE(n_components={max_label_space})') tsne = TSNE(n_components=max_label_space) - F = tsne.fit(F).fit_transform(F) + F = tsne.fit_transform(F) elif reduction == 'tSVD': print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' f'Applying truncatedSVD(n_components={max_label_space})') tSVD = TruncatedSVD(n_components=max_label_space) - F = tSVD.fit(F).fit_transform(F) + F = tSVD.fit_transform(F) return F diff --git a/src/learning/learners.py b/src/learning/learners.py index 89bda7e..aed1094 100644 --- a/src/learning/learners.py +++ b/src/learning/learners.py @@ -1,6 +1,6 @@ import numpy as np import time -from data.embeddings import WordEmbeddings, WCE_matrix, StorageEmbeddings +from data.embeddings import WordEmbeddings, StorageEmbeddings from scipy.sparse import issparse from sklearn.multiclass import OneVsRestClassifier from sklearn.model_selection import GridSearchCV @@ -493,43 +493,6 @@ class AndreaCLF(FunnellingPolylingualClassifier): return lZ, lYtr - # def embed(self, lX, ly, unsupervised=False, supervised=False, prediction=False): - # """ - # build embedding matrix for given language and returns its weighted sum wrt tf-idf score - # """ - # _r = dict() - # languages = list(lX.keys()) - # - # if prediction: - # for lang in languages: - # if unsupervised: # If unsupervised embeddings ... 
- # M = self.word_embeddings[lang] - # if supervised: # and also unsupervised --> get both (M) and (S) weighted sum matrices and hstack them - # S = self.supervised_embeddings[lang] - # _r[lang] = np.hstack((lX[lang].dot(M), lX[lang].dot(S))) - # continue - # _r[lang] = lX[lang].dot(M) # if not supervised --> just get weighted sum of unsupervised (M) embeddings - # else: # If not unsupervised --> get (S) matrix and its weighted sum - # S = self.supervised_embeddings[lang] - # _r[lang] = lX[lang].dot(S) - # return _r - # - # if unsupervised: - # for lang in languages: - # _, M = embedding_matrix(self.config['we_type'], self.we_path, self.lang_word2idx[lang], lang) - # self.word_embeddings[lang] = M - # _r[lang] = lX[lang].dot(M) - # - # if supervised: - # for lang in languages: - # S = WCE_matrix(lX, ly, lang) - # self.supervised_embeddings[lang] = S - # if unsupervised: - # _r[lang] = np.hstack((_r[lang], lX[lang].dot(S))) - # else: - # _r[lang] = lX[lang].dot(S) - # return _r - # @override std class method def fit(self, lX, ly): tinit = time.time() From ba1a72ff9439254d38378367b2a4624d0c5827b7 Mon Sep 17 00:00:00 2001 From: andrea Date: Wed, 4 Dec 2019 10:16:17 +0100 Subject: [PATCH 07/10] Plot variance explained by PCA for every language --- src/data/embeddings.py | 43 +++++++++++++----------------------------- src/data/supervised.py | 17 +++++++---------- 2 files changed, 20 insertions(+), 40 deletions(-) diff --git a/src/data/embeddings.py b/src/data/embeddings.py index d1ad651..b5b253a 100644 --- a/src/data/embeddings.py +++ b/src/data/embeddings.py @@ -157,31 +157,6 @@ class FastTextWikiNews(Vectors): super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs) -# class EmbeddingsAligned(Vectors): -# -# def __init__(self, type, path, lang): -# -# self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec' -# # todo - rewrite as relative path -# self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT' -# self.path = path + self.name.format(lang) -# assert os.path.exists(path), f'pre-trained vectors not found in {path}' -# super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path) -# # self.vectors = self.extract(voc) -# -# def vocabulary(self): -# return set(self.stoi.keys()) -# -# def dim(self): -# return self.dim -# -# def extract(self, words): -# source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi) -# extraction = torch.zeros((len(words), self.dim)) -# extraction[source_idx] = self.vectors[target_idx] -# return extraction - - class EmbeddingsAligned(Vectors): def __init__(self, type, path, lang, voc): @@ -283,18 +258,26 @@ class StorageEmbeddings: return _r def get_optimal_supervised_components(self, docs, labels): + import matplotlib.pyplot as plt + _idx = [] + + plt.figure(figsize=(15, 10)) + plt.title(f'WCE Explained Variance') + plt.xlabel('Number of Components') + plt.ylabel('Variance (%)') + for lang in docs.keys(): _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space='optimal').tolist() - + plt.plot(np.cumsum(_r), label=lang) for i in range(len(_r)-1, 1, -1): # todo: if n_components (therfore #n labels) is not big enough every value will be smaller than the next one ... 
- ratio = _r[i] - next_ratio = _r[i-1] - delta = _r[i] - _r[i-1] + delta = _r[i-1] - _r[i] if delta > 0: - # if ratio < next_ratio: _idx.append(i) break best_n = int(sum(_idx)/len(_idx)) + plt.vlines(best_n, 0, 1, colors='r', label='optimal N') + plt.legend() + plt.show() return best_n diff --git a/src/data/supervised.py b/src/data/supervised.py index f365dfd..02f8c84 100755 --- a/src/data/supervised.py +++ b/src/data/supervised.py @@ -75,18 +75,15 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, lang='None', f'Applying PCA(n_components={max_label_space})') pca = PCA(n_components=max_label_space) pca = pca.fit(F) - ######################################################## - import matplotlib.pyplot as plt - - plt.figure() - plt.plot(np.cumsum(pca.explained_variance_ratio_)) - plt.xlabel('Number of Components') - plt.ylabel('Variance (%)') # - plt.title(f'WCE Explained Variance {lang}') - plt.show() + # import matplotlib.pyplot as plt + # plt.figure() + # plt.plot(np.cumsum(pca.explained_variance_ratio_)) + # plt.xlabel('Number of Components') + # plt.ylabel('Variance (%)') # + # plt.title(f'WCE Explained Variance {lang}') + # plt.show() ######################################################## - F = pca.fit_transform(F) elif reduction == 'TSNE': print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' From 509289b26827525ce960149e3dc0a338fd960e0a Mon Sep 17 00:00:00 2001 From: andrea Date: Wed, 4 Dec 2019 13:24:11 +0100 Subject: [PATCH 08/10] Plot variance explained by PCA for every language --- src/FPEC_andrea.py | 2 +- src/data/embeddings.py | 11 ++++++----- src/data/supervised.py | 23 +++++++++++++---------- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py index 137e6cc..185bcc2 100644 --- a/src/FPEC_andrea.py +++ b/src/FPEC_andrea.py @@ -104,7 +104,7 @@ if __name__ == '__main__': ##### TODO - config dict is redundant - we have already op argparse ... 
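
Review note: PATCH 06 and 07 above hunt for an "optimal" number of PCA components by scanning the cumulative explained-variance curve per language. A minimal, self-contained sketch of that recipe for a single matrix (the 0.99 threshold is an illustrative assumption, not a value taken from these patches):

import numpy as np
from sklearn.decomposition import PCA

def n_components_for_variance(F, threshold=0.99):
    # fit a full-rank PCA, then keep the smallest number of leading components
    # whose cumulative explained-variance ratio reaches the threshold
    pca = PCA(n_components=min(F.shape)).fit(F)
    cumulative = np.cumsum(pca.explained_variance_ratio_)
    return int(min(np.searchsorted(cumulative, threshold) + 1, len(cumulative)))
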
config['reduction'] = 'PCA' - config['max_label_space'] = 'optimal' + config['max_label_space'] = 300 result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '') diff --git a/src/data/embeddings.py b/src/data/embeddings.py index b5b253a..8005dad 100644 --- a/src/data/embeddings.py +++ b/src/data/embeddings.py @@ -217,7 +217,7 @@ class StorageEmbeddings: print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n') return - def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space): + def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space, voc): _optimal = dict() # TODO testing optimal max_label_space if max_label_space == 'optimal': @@ -227,7 +227,7 @@ class StorageEmbeddings: for lang in docs.keys(): print(f'# [supervised-matrix] for {lang}') - self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, lang) + self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, voc[lang], lang) print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n') return @@ -241,7 +241,7 @@ class StorageEmbeddings: if config['unsupervised']: self._add_embeddings_unsupervised(config['we_type'], docs, vocs) if config['supervised']: - self._add_emebeddings_supervised(docs, labels, config['reduction'], config['max_label_space']) + self._add_emebeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs) return self def predict(self, config, docs): @@ -269,10 +269,11 @@ class StorageEmbeddings: for lang in docs.keys(): _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space='optimal').tolist() - plt.plot(np.cumsum(_r), label=lang) + _r = np.cumsum(_r) + plt.plot(_r, label=lang) for i in range(len(_r)-1, 1, -1): # todo: if n_components (therfore #n labels) is not big enough every value will be smaller than the next one ... 
- delta = _r[i-1] - _r[i] + delta = _r[i] - _r[i-1] if delta > 0: _idx.append(i) break diff --git a/src/data/supervised.py b/src/data/supervised.py index 02f8c84..d8e1f7d 100755 --- a/src/data/supervised.py +++ b/src/data/supervised.py @@ -40,7 +40,7 @@ def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents= return F -def get_supervised_embeddings(X, Y, reduction, max_label_space=300, lang='None', binary_structural_problems=-1, method='dotn', dozscore=True): +def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, lang='None', binary_structural_problems=-1, method='dotn', dozscore=True): if max_label_space == 'optimal': max_label_space = 0 @@ -63,6 +63,18 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, lang='None', if dozscore: F = zscores(F, axis=0) + # Dumping F-matrix for further studies + # TODO im not sure if voc.keys and F matrix indices are "aligned" correctly + dump_it = True + if dump_it: + with open(f'/home/andreapdr/funneling_pdr/src/dumps/WCE_{lang}.tsv', 'w') as outfile: + np.savetxt(outfile, F, delimiter='\t') + with open(f'/home/andreapdr/funneling_pdr/src/dumps/dict_WCE_{lang}.tsv', 'w') as outfile: + for token in voc.keys(): + outfile.write(token+'\n') + + + if nC > max_label_space: # TODO testing optimal max_label_space if reduction == 'PCA': @@ -75,15 +87,6 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, lang='None', f'Applying PCA(n_components={max_label_space})') pca = PCA(n_components=max_label_space) pca = pca.fit(F) - ######################################################## - # import matplotlib.pyplot as plt - # plt.figure() - # plt.plot(np.cumsum(pca.explained_variance_ratio_)) - # plt.xlabel('Number of Components') - # plt.ylabel('Variance (%)') # - # plt.title(f'WCE Explained Variance {lang}') - # plt.show() - ######################################################## F = pca.fit_transform(F) elif reduction == 'TSNE': print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. 
' From 9fa1899a7f1d3f73349bf20909aa0e98596fb31f Mon Sep 17 00:00:00 2001 From: andrea Date: Mon, 9 Dec 2019 15:37:52 +0100 Subject: [PATCH 09/10] refactored pca methods --- src/FPEC_andrea.py | 28 +++++--- src/data/embeddings.py | 133 +++++++++++++++++++++++++------------ src/data/supervised.py | 61 ++++++++--------- src/learning/learners.py | 17 ++--- src/util/decompositions.py | 49 ++++++++++++++ src/util/results.py | 6 +- 6 files changed, 199 insertions(+), 95 deletions(-) create mode 100644 src/util/decompositions.py diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py index 185bcc2..1618c33 100644 --- a/src/FPEC_andrea.py +++ b/src/FPEC_andrea.py @@ -1,4 +1,4 @@ -import os, sys +import os from dataset_builder import MultilingualDataset from learning.learners import * from util.evaluation import * @@ -21,7 +21,7 @@ parser.add_option("-e", "--mode-embed", dest="mode_embed", help="Set the embedding to be used [none, unsupervised, supervised, both]", type=str, default='none') parser.add_option("-w", "--we-path", dest="we_path", - help="Path to the polylingual word embeddings", default='../embeddings/') + help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/') parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str, default='MUSE') @@ -30,11 +30,21 @@ parser.add_option("-s", "--set_c", dest="set_c",type=float, help="Set the C parameter", default=1) parser.add_option("-c", "--optimc", dest="optimc", action='store_true', - help="Optimices hyperparameters", default=False) + help="Optimize hyperparameters", default=False) parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int, help="Number of parallel jobs (default is -1, all)", default=-1) +parser.add_option("-p", "--pca", dest="max_labels", type=int, + help="If less than number of target classes, will apply PCA to supervised matrix. If set to 0 it" + " will automatically search for the best number of components", default=300) + +parser.add_option("-u", "--upca", dest="max_labels_U", type=int, + help="If smaller than Unsupervised Dimension, will apply PCA to unsupervised matrix. If set to 0 it" + " will automatically search for the best number of components", default=300) + +parser.add_option("-l", dest="lang", type=str) + def get_learner(calibrate=False, kernel='linear'): return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto') @@ -51,7 +61,6 @@ def get_params(dense=False): if __name__ == '__main__': - (op, args) = parser.parse_args() assert exists(op.dataset), 'Unable to find file '+str(op.dataset) @@ -64,8 +73,9 @@ if __name__ == '__main__': data = MultilingualDataset.load(op.dataset) data.show_dimensions() - # data.set_view(languages=['en','it'], categories=list(range(10))) - # data.set_view(languages=['en','it']) + data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10))) + # data.set_view(languages=[op.lang]) + # data.set_view(categories=list(range(10))) lXtr, lytr = data.training() lXte, lyte = data.test() @@ -104,7 +114,9 @@ if __name__ == '__main__': ##### TODO - config dict is redundant - we have already op argparse ... 
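
Review note: the dump added in PATCH 08 writes the supervised matrix F and the vocabulary to parallel .tsv files, and its TODO doubts that voc.keys() iterates in row order. Since F has one row per tf-idf feature and voc maps token to column index, sorting the tokens by that index makes the alignment explicit. A hedged sketch (dump_wce and path_prefix are illustrative names, not repo API):

import numpy as np

def dump_wce(F, voc, path_prefix):
    # row i of F corresponds to the token whose tf-idf column index is i
    tokens = sorted(voc.keys(), key=lambda t: voc[t])
    assert F.shape[0] == len(tokens), 'rows of F and vocabulary size disagree'
    np.savetxt(f'{path_prefix}_WCE.tsv', F, delimiter='\t')
    with open(f'{path_prefix}_dict_WCE.tsv', 'w') as out:
        out.write('\n'.join(tokens) + '\n')
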
config['reduction'] = 'PCA'
-    config['max_label_space'] = 300
+    config['max_label_space'] = op.max_labels
+    config['dim_reduction_unsupervised'] = op.max_labels_U
+    # config['plot_covariance_matrices'] = True
 
     result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
 
@@ -129,5 +141,5 @@ if __name__ == '__main__':
         metrics.append([macrof1, microf1, macrok, microk])
         print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
         results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, config['we_type'], op.optimc, op.dataset.split('/')[-1],
-                        'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, '')
+                        classifier.time, lang, macrof1, microf1, macrok, microk, '')
     print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
diff --git a/src/data/embeddings.py b/src/data/embeddings.py
index 8005dad..2c02592 100644
--- a/src/data/embeddings.py
+++ b/src/data/embeddings.py
@@ -5,7 +5,9 @@ from torchtext.vocab import Vectors
 import torch
 from abc import ABC, abstractmethod
 from data.supervised import get_supervised_embeddings
-
+import matplotlib.pyplot as plt
+from sklearn.decomposition import PCA
+from util.decompositions import *
 
 class PretrainedEmbeddings(ABC):
 
@@ -110,10 +112,10 @@ class WordEmbeddings:
         # vocabulary is a set of terms to be kept
         active_vocabulary = sorted([w for w in vocabulary if w in self.worddim])
         lost = len(vocabulary)-len(active_vocabulary)
-        if lost>0: #some termr are missing, so it will be replaced by UNK
+        if lost > 0: #some terms are missing, so it will be replaced by UNK
             print('warning: missing {} terms for lang {}'.format(lost, self.lang))
         self.we = self.get_vectors(active_vocabulary)
-        assert self.we.shape[0]==len(active_vocabulary)
+        assert self.we.shape[0] == len(active_vocabulary)
         self.dimword={i:w for i,w in enumerate(active_vocabulary)}
         self.worddim={w:i for i,w in enumerate(active_vocabulary)}
         return self
@@ -153,7 +155,6 @@ class FastTextWikiNews(Vectors):
         url = self.url_base.format(language)
         # name = self.path.format(language)
         name = cache + self._name.format(language)
-        # print(f'\n\nFASTEXTWIKI-NEW CLASS:\nurl = {url}\nname = {name}\ncache {cache}\nlanguage = {language}')
         super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)
 
 
@@ -171,15 +172,17 @@ class EmbeddingsAligned(Vectors):
     def vocabulary(self):
         return set(self.stoi.keys())
 
-    def dim(self):
-        return self.dim
-
     def extract(self, words):
         source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi)
         extraction = torch.zeros((len(words), self.dim))
         extraction[source_idx] = self.vectors[target_idx]
         return extraction
 
+    def reduce(self, dim):
+        pca = PCA(n_components=dim)
+        self.vectors = pca.fit_transform(self.vectors)
+        return
+
 
 class FastTextMUSE(PretrainedEmbeddings):
 
@@ -209,26 +212,44 @@ class StorageEmbeddings:
         self.lang_U = dict()
         self.lang_S = dict()
 
-    def _add_embeddings_unsupervised(self, type, docs, vocs):
+    def _add_embeddings_unsupervised(self, type, docs, vocs, max_label_space=300):
         for lang in docs.keys():
             print(f'# [unsupervised-matrix {type}] for {lang}')
             voc = np.asarray(list(zip(*sorted(vocs[lang].items(), key=lambda x: x[1])))[0])
             self.lang_U[lang] = EmbeddingsAligned(type, self.path, lang, voc).vectors
+            nC = self.lang_U[lang].shape[1]  # read the width only once the matrix exists, not before
+            # if self.lang_U[lang].shape[1] > dim != 0:
+            #     print(f'unsupervised matrix has more dimensions ({self.lang_U[lang].shape[1]}) than'
+            #           f' the allowed limit {dim}. 
Applying PCA(n_components={dim})') + # pca = PCA(n_components=dim) + # self.lang_U[lang] = pca.fit_transform(self.lang_U[lang]) print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n') + if max_label_space == 0: + print(f'Computing optimal number of PCA components along matrices U') + optimal_n = get_optimal_dim(self.lang_U, 'U') + self.lang_U = run_pca(optimal_n, self.lang_U) + elif max_label_space < nC: + self.lang_U = run_pca(max_label_space, self.lang_U) + return def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space, voc): - _optimal = dict() - # TODO testing optimal max_label_space - if max_label_space == 'optimal': - print('Computing optimal number of PCA components ...') - optimal_n = self.get_optimal_supervised_components(docs, labels) - max_label_space = optimal_n - - for lang in docs.keys(): + # if max_label_space == 0: + # print('Computing optimal number of PCA components along matrices S...') + # optimal_n = self.get_optimal_supervised_components(docs, labels) + # max_label_space = optimal_n + for lang in docs.keys(): # compute supervised matrices S - then apply PCA + nC = self.lang_S[lang].shape[1] print(f'# [supervised-matrix] for {lang}') self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, voc[lang], lang) print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n') + + if max_label_space == 0: + optimal_n = get_optimal_dim(self.lang_S, 'S') + self.lang_S = run_pca(optimal_n, self.lang_S) + elif max_label_space < nC: + self.lang_S = run_pca(max_label_space, self.lang_S) + return def _concatenate_embeddings(self, docs): @@ -239,7 +260,7 @@ class StorageEmbeddings: def fit(self, config, docs, vocs, labels): if config['unsupervised']: - self._add_embeddings_unsupervised(config['we_type'], docs, vocs) + self._add_embeddings_unsupervised(config['we_type'], docs, vocs, config['dim_reduction_unsupervised']) if config['supervised']: self._add_emebeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs) return self @@ -257,28 +278,58 @@ class StorageEmbeddings: _r[lang] = docs[lang].dot(self.lang_U[lang]) return _r - def get_optimal_supervised_components(self, docs, labels): - import matplotlib.pyplot as plt + # @staticmethod + # def get_optimal_supervised_components(docs, labels): + # optimal_n = get_optimal_dim(docs, 'S') + # return optimal_n + # _idx = [] + # + # plt.figure(figsize=(15, 10)) + # plt.title(f'WCE Explained Variance') + # plt.xlabel('Number of Components') + # plt.ylabel('Variance (%)') + # + # for lang in docs.keys(): + # _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space=0).tolist() + # _r = np.cumsum(_r) + # plt.plot(_r, label=lang) + # for i in range(len(_r)-1, 1, -1): + # delta = _r[i] - _r[i-1] + # if delta > 0: + # _idx.append(i) + # break + # best_n = max(_idx) + # plt.axvline(best_n, color='r', label='optimal N') + # plt.legend() + # plt.show() + # return best_n + # + # def get_optimal_unsupervised_components(self, type): + # _idx = [] + # + # plt.figure(figsize=(15, 10)) + # plt.title(f'Unsupervised Embeddings {type} Explained Variance') + # plt.xlabel('Number of Components') + # plt.ylabel('Variance (%)') + # + # for lang in self.lang_U.keys(): + # pca = PCA(n_components=self.lang_U[lang].shape[1]) + # pca.fit(self.lang_U[lang]) + # _r = pca.explained_variance_ratio_ + # _r = np.cumsum(_r) + # plt.plot(_r, label=lang) + # for i in range(len(_r) - 1, 1, -1): + # delta = _r[i] - _r[i - 1] + # if 
delta > 0: + # _idx.append(i) + # break + # best_n = max(_idx) + # plt.axvline(best_n, color='r', label='optimal N') + # plt.legend() + # plt.show() + # + # for lang in self.lang_U.keys(): + # pca = PCA(n_components=best_n) + # self.lang_U[lang] = pca.fit_transform(self.lang_U[lang]) + # return - _idx = [] - - plt.figure(figsize=(15, 10)) - plt.title(f'WCE Explained Variance') - plt.xlabel('Number of Components') - plt.ylabel('Variance (%)') - - for lang in docs.keys(): - _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space='optimal').tolist() - _r = np.cumsum(_r) - plt.plot(_r, label=lang) - for i in range(len(_r)-1, 1, -1): - # todo: if n_components (therfore #n labels) is not big enough every value will be smaller than the next one ... - delta = _r[i] - _r[i-1] - if delta > 0: - _idx.append(i) - break - best_n = int(sum(_idx)/len(_idx)) - plt.vlines(best_n, 0, 1, colors='r', label='optimal N') - plt.legend() - plt.show() - return best_n diff --git a/src/data/supervised.py b/src/data/supervised.py index d8e1f7d..bbd8c37 100755 --- a/src/data/supervised.py +++ b/src/data/supervised.py @@ -1,5 +1,5 @@ from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square -from sklearn.decomposition import PCA, TruncatedSVD +from sklearn.decomposition import PCA from sklearn.manifold import TSNE import numpy as np @@ -41,15 +41,9 @@ def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents= def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, lang='None', binary_structural_problems=-1, method='dotn', dozscore=True): - if max_label_space == 'optimal': - max_label_space = 0 - if max_label_space != 0: print('computing supervised embeddings...') - nC = Y.shape[1] - if nC==2 and binary_structural_problems > nC: - raise ValueError('not implemented in this branch') if method=='ppmi': F = supervised_embeddings_ppmi(X, Y) @@ -64,8 +58,7 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, la F = zscores(F, axis=0) # Dumping F-matrix for further studies - # TODO im not sure if voc.keys and F matrix indices are "aligned" correctly - dump_it = True + dump_it = False if dump_it: with open(f'/home/andreapdr/funneling_pdr/src/dumps/WCE_{lang}.tsv', 'w') as outfile: np.savetxt(outfile, F, delimiter='\t') @@ -73,34 +66,32 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, la for token in voc.keys(): outfile.write(token+'\n') - - - if nC > max_label_space: - # TODO testing optimal max_label_space - if reduction == 'PCA': - if max_label_space == 0: - pca = PCA(n_components=Y.shape[1]) - pca = pca.fit(F) - return pca.explained_variance_ratio_ - - print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' - f'Applying PCA(n_components={max_label_space})') - pca = PCA(n_components=max_label_space) - pca = pca.fit(F) - F = pca.fit_transform(F) - elif reduction == 'TSNE': - print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' - f'Applying t-SNE(n_components={max_label_space})') - tsne = TSNE(n_components=max_label_space) - F = tsne.fit_transform(F) - elif reduction == 'tSVD': - print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. 
' - f'Applying truncatedSVD(n_components={max_label_space})') - tSVD = TruncatedSVD(n_components=max_label_space) - F = tSVD.fit_transform(F) - return F + # if nC >= max_label_space: + # if reduction == 'PCA': + # if max_label_space == 0: + # pca = PCA(n_components=Y.shape[1]) + # pca = pca.fit(F) + # return pca.explained_variance_ratio_ + # + # print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' + # f'Applying PCA(n_components={max_label_space})') + # pca = PCA(n_components=max_label_space) + # F = pca.fit_transform(F) + # elif reduction == 'TSNE': + # print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' + # f'Applying t-SNE(n_components={max_label_space})') + # tsne = TSNE(n_components=max_label_space) + # F = tsne.fit_transform(F) + # elif reduction == 'tSVD': + # print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' + # f'Applying truncatedSVD(n_components={max_label_space})') + # tSVD = TruncatedSVD(n_components=max_label_space) + # F = tSVD.fit_transform(F) + # + # return F + diff --git a/src/learning/learners.py b/src/learning/learners.py index aed1094..c4c69fd 100644 --- a/src/learning/learners.py +++ b/src/learning/learners.py @@ -8,6 +8,7 @@ from sklearn.model_selection import KFold from joblib import Parallel, delayed from sklearn.feature_extraction.text import TfidfVectorizer from transformers.StandardizeTransformer import StandardizeTransformer +from sklearn.decomposition import PCA def _sort_if_sparse(X): @@ -453,13 +454,12 @@ class AndreaCLF(FunnellingPolylingualClassifier): calmode, n_jobs) + self.pca_independent_space = PCA(n_components=100) self.we_path = we_path self.config = config self.lang_word2idx = dict() self.languages = [] self.lang_tfidf = {} - # self.word_embeddings = {} - # self.supervised_embeddings = {} self.embedding_space = None self.model = None self.time = None @@ -515,6 +515,10 @@ class AndreaCLF(FunnellingPolylingualClassifier): _vertical_Z = np.vstack([Z[lang] for lang in self.languages]) _vertical_Zy = np.vstack([zy[lang] for lang in self.languages]) + # todo testing ... 
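
Review note: the fit() hunk here stacks the per-language spaces row-wise, optionally reduces them (the '# todo testing' PCA experiment around this note), and standardizes them before training the meta-classifier. A compact sketch of just that step, with illustrative names rather than the class's API:

import numpy as np

def build_meta_space(Z, zy, languages):
    # stack the per-language [posteriors | embeddings] blocks into one training matrix
    vertical_Z = np.vstack([Z[lang] for lang in languages])
    vertical_zy = np.vstack([zy[lang] for lang in languages])
    # column-wise standardization, clipping tiny stds as StandardizeTransformer does
    mean = vertical_Z.mean(axis=0)
    std = np.clip(vertical_Z.std(axis=0), 1e-5, None)
    return (vertical_Z - mean) / std, vertical_zy
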
+ # self.pca_independent_space.fit(_vertical_Z) + # _vertical_Z = self.pca_independent_space.transform(_vertical_Z) + self.standardizer = StandardizeTransformer() _vertical_Z = self.standardizer.fit_predict(_vertical_Z) @@ -532,17 +536,14 @@ class AndreaCLF(FunnellingPolylingualClassifier): if self.config['supervised'] or self.config['unsupervised']: _embedding_space = self.embedding_space.predict(self.config, lX) - # l_weighted_em = self.embed(lX, ly, - # unsupervised=self.config['unsupervised'], - # supervised=self.config['supervised'], - # prediction=True) - # Z_embedded = dict() + for lang in lX.keys(): lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang])) - # lZ = Z_embedded for lang in lZ.keys(): print(lZ[lang].shape) + # todo testing + # lZ[lang] = self.pca_independent_space.transform(lZ[lang]) lZ[lang] = self.standardizer.predict(lZ[lang]) return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs) diff --git a/src/util/decompositions.py b/src/util/decompositions.py new file mode 100644 index 0000000..9029b33 --- /dev/null +++ b/src/util/decompositions.py @@ -0,0 +1,49 @@ +from sklearn.decomposition import PCA +import numpy as np +import matplotlib.pyplot as plt + +def run_pca(dim, X): + """ + :param dim: number of pca components to keep + :param X: dictionary str(lang): matrix + :return: dict lang: reduced matrix + """ + r = dict() + pca = PCA(n_components=dim) + for lang in X.keys(): + r[lang] = pca.fit_transform(X[lang]) + return r + + +def get_optimal_dim(X, embed_type): + """ + :param X: dict str(lang) : csr_matrix of embeddings unsupervised or supervised + :param embed_type: (str) embedding matrix type: S or U (WCE supervised or U unsupervised MUSE/FASTTEXT) + :return: + """ + _idx = [] + + plt.figure(figsize=(15, 10)) + if embed_type == 'U': + plt.title(f'Unsupervised Embeddings {"TODO"} Explained Variance') + else: + plt.title(f'WCE Explained Variance') + plt.xlabel('Number of Components') + plt.ylabel('Variance (%)') + + for lang in X.keys(): + pca = PCA(n_components=X[lang].shape[1]) + pca.fit(X[lang]) + _r = pca.explained_variance_ratio_ + _r = np.cumsum(_r) + plt.plot(_r, label=lang) + for i in range(len(_r) - 1, 1, -1): + delta = _r[i] - _r[i - 1] + if delta > 0: + _idx.append(i) + break + best_n = max(_idx) + plt.axvline(best_n, color='r', label='optimal N') + plt.legend() + plt.show() + return best_n \ No newline at end of file diff --git a/src/util/results.py b/src/util/results.py index 22e8021..7c25bec 100644 --- a/src/util/results.py +++ b/src/util/results.py @@ -5,7 +5,7 @@ import numpy as np class PolylingualClassificationResults: def __init__(self, file, autoflush=True, verbose=False): self.file = file - self.columns = ['id', 'method', 'learner', 'embed', 'embed_type', 'optimp', 'dataset', 'binary', 'languages', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes'] + self.columns = ['id', 'method', 'learner', 'embed', 'embed_type', 'optimp', 'dataset', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes'] self.autoflush = autoflush self.verbose = verbose if os.path.exists(file): @@ -20,8 +20,8 @@ class PolylingualClassificationResults: def already_calculated(self, id): return (self.df['id'] == id).any() - def add_row(self, id, method, learner, embed, embed_type, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''): - s = pd.Series([id, method, learner, embed, embed_type, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok, microk, 
notes], index=self.columns) + def add_row(self, id, method, learner, embed, embed_type, optimp, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''): + s = pd.Series([id, method, learner, embed, embed_type, optimp, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns) self.df = self.df.append(s, ignore_index=True) if self.autoflush: self.flush() self.tell(s.to_string()) From 0c6056e7a13aafdcfe03b6688298533837e03747 Mon Sep 17 00:00:00 2001 From: andrea Date: Mon, 9 Dec 2019 15:39:39 +0100 Subject: [PATCH 10/10] refactored pca methods --- src/data/embeddings.py | 69 +++------------------------------------- src/data/supervised.py | 4 +-- src/learning/learners.py | 2 +- 3 files changed, 8 insertions(+), 67 deletions(-) diff --git a/src/data/embeddings.py b/src/data/embeddings.py index 2c02592..66e830f 100644 --- a/src/data/embeddings.py +++ b/src/data/embeddings.py @@ -1,14 +1,12 @@ import os import pickle -import numpy as np from torchtext.vocab import Vectors import torch from abc import ABC, abstractmethod from data.supervised import get_supervised_embeddings -import matplotlib.pyplot as plt -from sklearn.decomposition import PCA from util.decompositions import * + class PretrainedEmbeddings(ABC): def __init__(self): @@ -112,7 +110,7 @@ class WordEmbeddings: # vocabulary is a set of terms to be kept active_vocabulary = sorted([w for w in vocabulary if w in self.worddim]) lost = len(vocabulary)-len(active_vocabulary) - if lost > 0: #some terms are missing, so it will be replaced by UNK + if lost > 0: # some terms are missing, so it will be replaced by UNK print('warning: missing {} terms for lang {}'.format(lost, self.lang)) self.we = self.get_vectors(active_vocabulary) assert self.we.shape[0] == len(active_vocabulary) @@ -134,12 +132,12 @@ class WordEmbeddings: 'instances of {} expected'.format(WordEmbeddings.__name__) polywe = [] - worddim={} - offset=0 + worddim = {} + offset = 0 for we in we_list: polywe.append(we.we) worddim.update({'{}::{}'.format(we.lang, w):d+offset for w,d in we.worddim.items()}) - offset=len(worddim) + offset = len(worddim) polywe = np.vstack(polywe) return WordEmbeddings(lang='poly', we=polywe, worddim=worddim) @@ -191,7 +189,6 @@ class FastTextMUSE(PretrainedEmbeddings): print(f'Loading fastText pretrained vectors from {path}') assert os.path.exists(path), print(f'pre-trained vectors not found in {path}') self.embed = FastTextWikiNews(path, lang, max_vectors=limit) - # print('Done') def vocabulary(self): return set(self.embed.stoi.keys()) @@ -277,59 +274,3 @@ class StorageEmbeddings: for lang in docs.keys(): _r[lang] = docs[lang].dot(self.lang_U[lang]) return _r - - # @staticmethod - # def get_optimal_supervised_components(docs, labels): - # optimal_n = get_optimal_dim(docs, 'S') - # return optimal_n - # _idx = [] - # - # plt.figure(figsize=(15, 10)) - # plt.title(f'WCE Explained Variance') - # plt.xlabel('Number of Components') - # plt.ylabel('Variance (%)') - # - # for lang in docs.keys(): - # _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space=0).tolist() - # _r = np.cumsum(_r) - # plt.plot(_r, label=lang) - # for i in range(len(_r)-1, 1, -1): - # delta = _r[i] - _r[i-1] - # if delta > 0: - # _idx.append(i) - # break - # best_n = max(_idx) - # plt.axvline(best_n, color='r', label='optimal N') - # plt.legend() - # plt.show() - # return best_n - # - # def get_optimal_unsupervised_components(self, type): - # _idx = [] - # - # plt.figure(figsize=(15, 10)) - # 
plt.title(f'Unsupervised Embeddings {type} Explained Variance') - # plt.xlabel('Number of Components') - # plt.ylabel('Variance (%)') - # - # for lang in self.lang_U.keys(): - # pca = PCA(n_components=self.lang_U[lang].shape[1]) - # pca.fit(self.lang_U[lang]) - # _r = pca.explained_variance_ratio_ - # _r = np.cumsum(_r) - # plt.plot(_r, label=lang) - # for i in range(len(_r) - 1, 1, -1): - # delta = _r[i] - _r[i - 1] - # if delta > 0: - # _idx.append(i) - # break - # best_n = max(_idx) - # plt.axvline(best_n, color='r', label='optimal N') - # plt.legend() - # plt.show() - # - # for lang in self.lang_U.keys(): - # pca = PCA(n_components=best_n) - # self.lang_U[lang] = pca.fit_transform(self.lang_U[lang]) - # return - diff --git a/src/data/supervised.py b/src/data/supervised.py index bbd8c37..d2d7aab 100755 --- a/src/data/supervised.py +++ b/src/data/supervised.py @@ -1,7 +1,7 @@ from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square -from sklearn.decomposition import PCA -from sklearn.manifold import TSNE import numpy as np +# from sklearn.decomposition import PCA +# from sklearn.manifold import TSNE def zscores(x, axis=0): #scipy.stats.zscores does not avoid division by 0, which can indeed occur diff --git a/src/learning/learners.py b/src/learning/learners.py index c4c69fd..96e200c 100644 --- a/src/learning/learners.py +++ b/src/learning/learners.py @@ -8,7 +8,7 @@ from sklearn.model_selection import KFold from joblib import Parallel, delayed from sklearn.feature_extraction.text import TfidfVectorizer from transformers.StandardizeTransformer import StandardizeTransformer -from sklearn.decomposition import PCA +# from sklearn.decomposition import PCA def _sort_if_sparse(X):
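
Review note: after PATCH 10 the embedding pipeline sits behind a two-call API, StorageEmbeddings.fit/predict. A hedged end-to-end sketch of the call pattern (lXtr, lytr and vocs follow the inputs of AndreaCLF.fit; we_path and the config values are illustrative). Two caveats for this state of the code: _add_emebeddings_supervised still reads self.lang_S[lang].shape[1] before that entry is assigned, and get_optimal_dim's 'delta > 0' test holds almost everywhere on a monotone cumulative curve, so best_n tends to collapse to the full dimensionality; a small positive threshold on delta, or a variance target as sketched earlier, behaves better.

from data.embeddings import StorageEmbeddings

config = {'unsupervised': True, 'supervised': True, 'we_type': 'MUSE',
          'reduction': 'PCA', 'max_label_space': 300, 'dim_reduction_unsupervised': 300}
embedder = StorageEmbeddings(we_path).fit(config, lXtr, vocs, lytr)  # builds U and S per language
lE_tr = embedder.predict(config, lXtr)  # lang -> hstack(X.dot(U), X.dot(S)), tf-idf weighted sums
lE_te = embedder.predict(config, lXte)  # reuses the matrices fitted on the training vocabulary
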