baseline multilingual BERT

This commit is contained in:
andrea 2020-07-27 11:56:09 +02:00
parent 22b7ea7e66
commit d1fdad5f6e
37 changed files with 1212 additions and 1112 deletions

View File

@@ -1,10 +1,7 @@
import os
import pickle
from torchtext.vocab import Vectors
import torch
from abc import ABC, abstractmethod
from embeddings.supervised import get_supervised_embeddings
from util.decompositions import *
from util.SIF_embed import *
@@ -35,122 +32,10 @@ class PretrainedEmbeddings(ABC):
        return source_idx, target_idx
class WordEmbeddings:
def __init__(self, lang, we, worddim):
self.lang = lang
self.we = we
self.worddim = worddim
self.dimword = {v:k for k,v in self.worddim.items()}
@classmethod
def load(cls, basedir, lang, word_preprocessor=None, dopickle=True):
filename = 'wiki.multi.{}.vec'.format(lang)
we_path = os.path.join(basedir, filename)
if dopickle and os.path.exists(we_path + '.pkl'):
print('loading pkl in {}'.format(we_path + '.pkl'))
(worddim, we) = pickle.load(open(we_path + '.pkl', 'rb'))
else:
word_registry = set()
lines = open(we_path).readlines()
nwords, dims = [int(x) for x in lines[0].split()]
print('reading we of {} dimensions'.format(dims))
we = np.zeros((nwords, dims), dtype=float)
worddim = {}
index = 0
for i, line in enumerate(lines[1:]):
if (i + 1) % 100 == 0:
print('\r{}/{}'.format(i + 1, len(lines)), end='')
word, *vals = line.split()
wordp = word_preprocessor(word) if word_preprocessor is not None else word
if wordp:
wordp = wordp[0]
if wordp in word_registry:
print('warning: word <{}> generates a duplicate <{}> after preprocessing'.format(word,wordp))
elif len(vals) == dims:
worddim[wordp] = index
we[index, :] = np.array(vals).astype(float)
index += 1
# else:
# print('warning: word <{}> generates an empty string after preprocessing'.format(word))
we = we[:index]
print('load {} words'.format(index))
if dopickle:
print('saving...')
pickle.dump((worddim, we), open(we_path + '.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
return WordEmbeddings(lang, we, worddim)
def vocabulary(self):
return set(self.worddim.keys())
def __getitem__(self, key):
return self.we[self.worddim[key]]
def dim(self):
return self.we.shape[1]
def __contains__(self, key):
return key in self.worddim
def most_similar(self, word_vect, k):
if word_vect.ndim == 1:
word_vect = word_vect.reshape(1,-1)
assert word_vect.shape[1] == self.dim(), 'inconsistent dimensions'
sim = np.dot(word_vect,self.we.T)
order = np.argsort(-1*sim, axis=1)[:,:k]
similar_words = [[self.dimword[order[vi,ki]] for ki in range(k)] for vi in range(word_vect.shape[0])]
sim_scores = sim[:,order]
return similar_words, sim_scores
def get_vectors(self, wordlist):
indexes = np.array([self.worddim[w] for w in wordlist])
return self.we[indexes]
def restrict(self, vocabulary):
# vocabulary is a set of terms to be kept
active_vocabulary = sorted([w for w in vocabulary if w in self.worddim])
lost = len(vocabulary)-len(active_vocabulary)
if lost > 0: # some terms are missing, so they will be replaced by UNK
print('warning: missing {} terms for lang {}'.format(lost, self.lang))
self.we = self.get_vectors(active_vocabulary)
assert self.we.shape[0] == len(active_vocabulary)
self.dimword={i:w for i,w in enumerate(active_vocabulary)}
self.worddim={w:i for i,w in enumerate(active_vocabulary)}
return self
@classmethod
def load_poly(cls, basedir, langs, lang_vocabularies=None, word_preprocessor=None):
if lang_vocabularies is None:
return cls.merge([cls.load(basedir,lang, word_preprocessor) for lang in langs])
else:
# assert all([l in lang_vocabularies for l in langs]), 'missing vocabulary for some languages'
return cls.merge([cls.load(basedir, lang, word_preprocessor).restrict(lang_vocabularies[lang]) for lang in langs])
@classmethod
def merge(cls, we_list):
assert all([isinstance(we, WordEmbeddings) for we in we_list]), \
'instances of {} expected'.format(WordEmbeddings.__name__)
polywe = []
worddim = {}
offset = 0
for we in we_list:
polywe.append(we.we)
worddim.update({'{}::{}'.format(we.lang, w):d+offset for w,d in we.worddim.items()})
offset = len(worddim)
polywe = np.vstack(polywe)
return WordEmbeddings(lang='poly', we=polywe, worddim=worddim)
class FastTextWikiNews(Vectors):

    url_base = 'Cant auto-download MUSE embeddings'
    path = '/storage/andrea/FUNNELING/embeddings/wiki.multi.{}.vec'
    path = '../embeddings/wiki.multi.{}.vec'
    _name = '/wiki.multi.{}.vec'

    def __init__(self, cache, language="en", **kwargs):
@@ -159,42 +44,13 @@ class FastTextWikiNews(Vectors):
        super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)
class EmbeddingsAligned(Vectors):
def __init__(self, type, path, lang, voc):
# todo - rewrite as relative path
self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec'
self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT'
self.path = path + self.name.format(lang)
assert os.path.exists(path), f'pre-trained vectors not found in {path}'
super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path)
self.vectors = self.extract(voc)
def vocabulary(self):
return set(self.stoi.keys())
def extract(self, words):
source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi)
extraction = torch.zeros((len(words), self.dim))
extraction[source_idx] = self.vectors[target_idx]
return extraction
def reduce(self, dim):
pca = PCA(n_components=dim)
self.vectors = pca.fit_transform(self.vectors)
return
class FastTextMUSE(PretrainedEmbeddings):

    def __init__(self, path, lang, limit=None):
        super().__init__()
        print(f'Loading fastText pretrained vectors for language {lang} from {path}')
        assert os.path.exists(path), print(f'pre-trained vectors not found in {path}')
        self.embed = FastTextWikiNews(path, lang, max_vectors=limit)

    def vocabulary(self):
        return set(self.embed.stoi.keys())

@@ -204,114 +60,8 @@ class FastTextMUSE(PretrainedEmbeddings):
    def extract(self, words):
        source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
        extraction = torch.zeros((len(words), self.dim()))
        # extraction = torch.empty(len(words), self.dim()).normal_(0, 1)
        extraction[source_idx] = self.embed.vectors[target_idx]
        return extraction
class StorageEmbeddings:
def __init__(self, path):
self.path = path
self.lang_U = dict()
self.lang_S = dict()
def _add_embeddings_unsupervised(self, type, docs, vocs, max_label_space=300):
for lang in docs.keys():
print(f'# [unsupervised-matrix {type}] for {lang}')
voc = np.asarray(list(zip(*sorted(vocs[lang].items(), key=lambda x: x[1])))[0])
self.lang_U[lang] = EmbeddingsAligned(type, self.path, lang, voc).vectors
print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n')
nC = self.lang_U[lang].shape[1]
if max_label_space == 0:
print(f'Computing optimal number of PCA components along matrices U')
optimal_n = get_optimal_dim(self.lang_U, 'U')
self.lang_U = run_pca(optimal_n, self.lang_U)
elif max_label_space < nC:
print(f'Applying PCA to unsupervised matrix U')
self.lang_U = run_pca(max_label_space, self.lang_U)
return
def _add_embeddings_supervised(self, docs, labels, reduction, max_label_space, voc):
only_well_represented_C = False # TODO testing
if only_well_represented_C:
labels = labels.copy()
min_prevalence = 0
print(f'# REDUCING LABELS TO min_prevalence = {min_prevalence} in order to compute WCE Matrix ...')
langs = list(docs.keys())
well_repr_cats = np.logical_and.reduce([labels[lang].sum(axis=0)>min_prevalence for lang in langs])
for lang in langs:
labels[lang] = labels[lang][:, well_repr_cats]
print(f'Target number reduced to: {labels[lang].shape[1]}\n')
for lang in docs.keys(): # compute supervised matrices S - then apply PCA
print(f'# [supervised-matrix] for {lang}')
self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang],
reduction, max_label_space, voc[lang], lang)
nC = self.lang_S[lang].shape[1]
print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')
if max_label_space == 0: # looking for best n_components analyzing explained_variance_ratio
print(f'Computing optimal number of PCA components along matrices S')
optimal_n = get_optimal_dim(self.lang_S, 'S')
print(f'Applying PCA(n_components={optimal_n})')
self.lang_S = run_pca(optimal_n, self.lang_S)
elif max_label_space == -1: # applying pca to the verticals stacked matrix of WCE embeddings
print(f'Computing PCA on vertical stacked WCE embeddings')
languages = self.lang_S.keys()
_temp_stack = np.vstack([self.lang_S[lang] for lang in languages]) # stacking WCE vertically
stacked_pca = PCA(n_components=_temp_stack.shape[1])
stacked_pca.fit(_temp_stack)
best_n = None
_r = stacked_pca.explained_variance_ratio_
_r = np.cumsum(_r)
plt.plot(_r, label='Stacked Supervised')
for i in range(len(_r) - 1, 1, -1):
delta = _r[i] - _r[i - 1]
if delta > 0:
best_n = i
break
plt.show()
stacked_pca = PCA(n_components=best_n)
stacked_pca.fit(_temp_stack)
print(f'Applying PCA(n_components={best_n})')
for lang in languages:
self.lang_S[lang] = stacked_pca.transform(self.lang_S[lang])
elif max_label_space <= nC: # less or equal in order to reduce it to the same initial dimension
print(f'Computing PCA on Supervised Matrix PCA(n_components:{max_label_space})')
self.lang_S = run_pca(max_label_space, self.lang_S)
return
def SIF_embeddings(self):
print('todo') # TODO
def _concatenate_embeddings(self, docs):
_r = dict()
for lang in self.lang_U.keys():
_r[lang] = np.hstack((docs[lang].dot(self.lang_U[lang]), docs[lang].dot(self.lang_S[lang])))
return _r
def fit(self, config, docs, vocs, labels):
if config['unsupervised']:
self._add_embeddings_unsupervised(config['we_type'], docs, vocs, config['dim_reduction_unsupervised'])
if config['supervised']:
self._add_embeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs)
return self
def predict(self, config, docs):
if config['supervised'] and config['unsupervised']:
return self._concatenate_embeddings(docs)
# todo testing applying pca to hstack muse + wce
# _reduced = self._concatenate_embeddings(docs)
# return run_pca(300, _reduced)
elif config['supervised']:
_r = dict()
for lang in docs.keys():
_r[lang] = docs[lang].dot(self.lang_S[lang])
else:
_r = dict()
for lang in docs.keys():
_r[lang] = docs[lang].dot(self.lang_U[lang])
return _r
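The unsupervised matrix U built above is consumed as a simple linear map: each document becomes the tf-idf-weighted sum of its aligned MUSE word vectors, as in docs[lang].dot(self.lang_U[lang]) inside _concatenate_embeddings and predict. A minimal sketch of that single step follows; it is not part of this commit and the names are illustrative.

# Illustrative sketch (not part of the commit): documents as the tf-idf-weighted
# sum of row-aligned word embeddings, i.e. a doc-term / embedding-matrix product.
import numpy as np
from scipy.sparse import csr_matrix

def doc_embeddings(X_tfidf: csr_matrix, U: np.ndarray) -> np.ndarray:
    """X_tfidf: (n_docs, |V|) tf-idf doc-term matrix; U: (|V|, dim) word embeddings aligned to the vocabulary."""
    # each output row is the weighted sum of the document's word vectors
    return np.asarray(X_tfidf.dot(U))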

View File

@@ -1,103 +1,102 @@
from abc import ABC, abstractmethod
import torch, torchtext
# import gensim
# import os
import numpy as np


# class KeyedVectors:
#
#     def __init__(self, word2index, weights):
#         assert len(word2index)==weights.shape[0], 'wrong number of dimensions'
#         index2word = {i:w for w,i in word2index.items()}
#         assert len([i for i in range(len(index2word)) if i not in index2word])==0, 'gaps in indexing not allowed'
#         self.word2index = word2index
#         self.index2word = index2word
#         self.weights = weights
#
#     def extract(self, words):
#         dim = self.weights.shape[1]
#         v_size = len(words)
#
#         source_idx, target_idx = [], []
#         for i,word in enumerate(words):
#             if word not in self.word2index: continue
#             j = self.word2index[word]
#             source_idx.append(i)
#             target_idx.append(j)
#
#         extraction = np.zeros((v_size, dim))
#         extraction[np.asarray(source_idx)] = self.weights[np.asarray(target_idx)]
#
#         return extraction


# class PretrainedEmbeddings(ABC):
#
#     def __init__(self):
#         super().__init__()
#
#     @abstractmethod
#     def vocabulary(self): pass
#
#     @abstractmethod
#     def dim(self): pass
#
#     @classmethod
#     def reindex(cls, words, word2index):
#         source_idx, target_idx = [], []
#         for i, word in enumerate(words):
#             if word not in word2index: continue
#             j = word2index[word]
#             source_idx.append(i)
#             target_idx.append(j)
#         source_idx = np.asarray(source_idx)
#         target_idx = np.asarray(target_idx)
#         return source_idx, target_idx


# class GloVe(PretrainedEmbeddings):
#
#     def __init__(self, setname='840B', path='./vectors_cache', max_vectors=None):
#         super().__init__()
#         print(f'Loading GloVe pretrained vectors from torchtext')
#         self.embed = torchtext.vocab.GloVe(setname, cache=path, max_vectors=max_vectors)
#         print('Done')
#
#     def vocabulary(self):
#         return set(self.embed.stoi.keys())
#
#     def dim(self):
#         return self.embed.dim
#
#     def extract(self, words):
#         source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
#         extraction = torch.zeros((len(words), self.dim()))
#         extraction[source_idx] = self.embed.vectors[target_idx]
#         return extraction


# class Word2Vec(PretrainedEmbeddings):
#
#     def __init__(self, path, limit=None):
#         super().__init__()
#         print(f'Loading word2vec pretrained vectors from {path}')
#         assert os.path.exists(path), print(f'pre-trained keyed vectors not found in {path}')
#         self.embed = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True, limit=limit)
#         self.word2index={w:i for i,w in enumerate(self.embed.index2word)}
#         print('Done')
#
#     def vocabulary(self):
#         return set(self.word2index.keys())
#
#     def dim(self):
#         return self.embed.vector_size
#
#     def extract(self, words):
#         source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.word2index)
#         extraction = np.zeros((len(words), self.dim()))
#         extraction[source_idx] = self.embed.vectors[target_idx]
#         extraction = torch.from_numpy(extraction).float()
#         return extraction

View File

@@ -1,7 +1,5 @@
from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square
import numpy as np
# from sklearn.decomposition import PCA
# from sklearn.manifold import TSNE


def zscores(x, axis=0): #scipy.stats.zscores does not avoid division by 0, which can indeed occur
@@ -69,31 +67,6 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, la
    return F
# if nC >= max_label_space:
# if reduction == 'PCA':
# if max_label_space == 0:
# pca = PCA(n_components=Y.shape[1])
# pca = pca.fit(F)
# return pca.explained_variance_ratio_
#
# print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
# f'Applying PCA(n_components={max_label_space})')
# pca = PCA(n_components=max_label_space)
# F = pca.fit_transform(F)
# elif reduction == 'TSNE':
# print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
# f'Applying t-SNE(n_components={max_label_space})')
# tsne = TSNE(n_components=max_label_space)
# F = tsne.fit_transform(F)
# elif reduction == 'tSVD':
# print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
# f'Applying truncatedSVD(n_components={max_label_space})')
# tSVD = TruncatedSVD(n_components=max_label_space)
# F = tSVD.fit_transform(F)
#
# return F
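get_supervised_embeddings builds the word-class (WCE) matrix that later scripts select with -S. A minimal sketch of the underlying idea follows, assuming a simple conditional-probability association score rather than the repository's TSR functions (information gain, chi-square); the function and variable names are illustrative.

# Illustrative sketch (not the repo's get_supervised_embeddings): word-class embeddings
# as a |V| x |C| matrix of word/category association scores computed on the training set.
import numpy as np
from scipy.sparse import csr_matrix

def word_class_embeddings(X: csr_matrix, Y: np.ndarray) -> np.ndarray:
    """X: (n_docs, |V|) doc-term matrix; Y: (n_docs, |C|) binary label matrix."""
    Xb = (X > 0).astype(float)                          # word presence per document
    counts = np.asarray(Xb.T.dot(Y))                    # (|V|, |C|) word-category co-occurrences
    df = np.asarray(Xb.sum(axis=0)).ravel()             # document frequency per word
    cond = counts / np.maximum(df, 1)[:, None]          # ~ P(c | w)
    prior = Y.sum(axis=0, keepdims=True) / Y.shape[0]   # P(c)
    return cond - prior                                 # each row is a word's class-association vector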

View File

@@ -0,0 +1,11 @@
#!/usr/bin/env bash
dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
logfile=../log/log10run_dl_jrc.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
done

View File

@@ -0,0 +1,11 @@
#!/usr/bin/env bash
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
logfile=../log/log10run_dl_rcv.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
done

View File

@@ -0,0 +1,12 @@
dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
logfile=./results/10run_jrc_final_results.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
python main_multimodal_cls.py $dataset -o $logfile -P -z -c --l2
python main_multimodal_cls.py $dataset -o $logfile -S -z -c --l2
python main_multimodal_cls.py $dataset -o $logfile -U -z -c --l2
done

View File

@@ -0,0 +1,16 @@
dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
logfile=./results/funnelling_10run_jrc_CIKM.csv
runs='6 7 8 9' #0 1 2 3 4 5
for run in $runs
do
dataset=$dataset_path$run.pickle
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 Pr(views) concatenated (done up to run5)
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 views concatenated
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -a -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -a -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -S -c -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -c -P -U -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -c -P -U -S -r -z --l2
done
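In the runs above, -P, -U, and -S select the posterior, MUSE, and WCE views respectively (cf. the POSTERIORS / WCE supervised / MUSE unsupervised comments in the later scripts), and with --allprob each view is mapped to posterior probabilities before being combined. A minimal sketch of the concatenation step only, not main_multimodal_cls.py's actual code; names are illustrative.

# Illustrative sketch (assumption, not the script's code): concatenate the three
# per-language views into one joint representation for the meta-classifier.
import numpy as np

def concatenate_views(lP: dict, lU: dict, lS: dict) -> dict:
    """lP/lU/lS: {lang: (n_docs_lang, dim_view) ndarray} holding the three views per language."""
    return {lang: np.hstack([lP[lang], lU[lang], lS[lang]]) for lang in lP}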

View File

@@ -0,0 +1,15 @@
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
logfile=./results/10run_rcv_final_results.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
python main_multimodal_cls.py $dataset -o $logfile -P -z -c --l2
python main_multimodal_cls.py $dataset -o $logfile -S -z -c --l2
python main_multimodal_cls.py $dataset -o $logfile -U -z -c --l2
done

View File

@@ -0,0 +1,16 @@
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
logfile=./results/funnelling_10run_rcv_CIKM_allprob_concatenated.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 Pr(views) concatenated
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 views concatenated
#python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -a -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -a -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -S -c -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -c -P -U -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -c -P -U -S -r -z --l2
done

View File

@@ -0,0 +1,34 @@
#!/usr/bin/env bash
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
logfile=./results/final_combinations_jrc.csv
#A.2: ensembling feature sets (combinations of posteriors, wce, muse):
# - exploring different ways of putting different feature sets together: concatenation, FeatureSetToPosteriors, averaging, voting, etc...
# (none of them seems to improve over standard funnelling [the improved version after A.1] with posterior probabilities...)
# aggregation=concatenation
#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2
#
##FeatureSetToPosteriors (aggregation mean)
python main_multimodal_cls.py $dataset -o $logfile -P -U -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -P -S -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -U -S -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -a -z --l2 --allprob
##FeatureSetToPosteriors
#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2 --allprob
#MajorityVoting
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -r
#python main_majorityvoting_cls.py $dataset -o $logfile -P -S -r
#python main_majorityvoting_cls.py $dataset -o $logfile -U -S -r
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -S -r

View File

@@ -0,0 +1,31 @@
#!/usr/bin/env bash
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
logfile=./results/final_combinations_rcv.csv
#A.2: ensembling feature sets (combinations of posteriors, wce, muse):
# - exploring different ways of putting different feature sets together: concatenation, FeatureSetToPosteriors, averaging, voting, etc...
# (none of them seems to improve over standard funnelling [the improved version after A.1] with posterior probabilities...)
# aggregation=concatenation
#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2
#
##FeatureSetToPosteriors (aggregation mean)
python main_multimodal_cls.py $dataset -o $logfile -P -U -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -P -S -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -U -S -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -a -z --l2 --allprob
##FeatureSetToPosteriors
#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2 --allprob
#MajorityVoting
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -r
#python main_majorityvoting_cls.py $dataset -o $logfile -P -S -r
#python main_majorityvoting_cls.py $dataset -o $logfile -U -S -r
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -S -r

View File

@@ -0,0 +1,31 @@
#!/usr/bin/env bash
logfile=../log/log_pre_jrc.csv
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
python main_deep_learning.py $dataset --log-file $logfile --pretrained --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 256 --tunable --plotmode --test-each 20

View File

@@ -0,0 +1,30 @@
#!/usr/bin/env bash
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
python main_deep_learning.py $dataset --pretrained --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 256 --tunable --plotmode --test-each 20

View File

@@ -0,0 +1,16 @@
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle
seeds='5' #2 3 4 5 6 7 8 9 10'
for seed in $seeds
do
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce.csv --supervised --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_trainable.csv --supervised --tunable --seed $seed
python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_post_wce_muse_static.csv --posteriors --supervised --pretrained --seed $seed --force
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_muse.csv --pretrained --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_muse_trainable.csv --pretrained --tunable --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_muse.csv --supervised --pretrained --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_muse_trainable40000.csv --supervised --pretrained --tunable --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_post_wce_muse_trainable.csv --posteriors --supervised --pretrained --tunable --seed $seed --force
done

View File

@@ -0,0 +1,20 @@
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
seeds='1 ' #2 3 4 5' # 6 7 8 9 10'
for seed in $seeds
do
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce.csv --supervised --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_trainable.csv --supervised --tunable --seed $seed
python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_static_plotmode.csv --posteriors --supervised --pretrained --seed $seed --plotmode --test-each 200
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_muse.csv --pretrained --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_muse_trainable.csv --pretrained --tunable --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_muse.csv --supervised --pretrained --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_muse_trainable.csv --supervised --pretrained --tunable --seed $seed
# python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_static.csv --posteriors --supervised --pretrained --seed $seed
# python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_trainable_plotmode.csv --posteriors --supervised --pretrained --tunable --seed $seed --plotmode --test-each 200
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_trainable.csv --posteriors --supervised --pretrained --tunable --seed $seed
done

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env bash
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
######################################## POSTERIORS
# Posteriors
python main_multimodal_cls.py $dataset -P # + zscore
python main_multimodal_cls.py $dataset -P -z # +l2norm
python main_multimodal_cls.py $dataset -P -z --l2 # +feature weight
######################################### WCE
#WCE supervised
python main_multimodal_cls.py $dataset -S # + zscore
python main_multimodal_cls.py $dataset -S -z # +l2norm
python main_multimodal_cls.py $dataset -S -z --l2 # +feature weight
python main_multimodal_cls.py $dataset -S -z -r --l2 # + SIF - PCA
python main_multimodal_cls.py $dataset -S -z -p 250 --l2 # +feature weight + pca
python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 # + SIF
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight ig # -feature weight
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -S -z -p 250 --l2 --feat-weight ig # + pca
python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -p 250 --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 --feat-weight pmi
################################# MUSE
# MUSE unsupervised
python main_multimodal_cls.py $dataset -U # + zscore
python main_multimodal_cls.py $dataset -U -z # +l2norm
python main_multimodal_cls.py $dataset -U -z --l2 # +feature weight
python main_multimodal_cls.py $dataset -U -z -r --l2 # + SIF - PCA
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight ig # -feature weight + pca
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight pmi

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env bash
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
######################################## POSTERIORS
# Posteriors
python main_multimodal_cls.py $dataset -P # + zscore
python main_multimodal_cls.py $dataset -P -z # +l2norm
python main_multimodal_cls.py $dataset -P -z --l2 # +feature weight
######################################### WCE
#WCE supervised
python main_multimodal_cls.py $dataset -S # + zscore
python main_multimodal_cls.py $dataset -S -z # +l2norm
python main_multimodal_cls.py $dataset -S -z --l2 # +feature weight
python main_multimodal_cls.py $dataset -S -z -r --l2 # + SIF - PCA
python main_multimodal_cls.py $dataset -S -z -p 50 --l2 # +feature weight + pca
python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 # + SIF
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight ig # -feature weight
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -S -z -p 50 --l2 --feat-weight ig # + pca
python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -p 50 --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 --feat-weight pmi
################################# MUSE
# MUSE unsupervised
python main_multimodal_cls.py $dataset -U # + zscore
python main_multimodal_cls.py $dataset -U -z # +l2norm
python main_multimodal_cls.py $dataset -U -z --l2 # +feature weight
python main_multimodal_cls.py $dataset -U -z -r --l2 # + SIF - PCA
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight ig # -feature weight + pca
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight pmi

View File

@@ -0,0 +1,6 @@
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
seeds='1 2 3 4 5 6 7 8 9 10'
for seed in $seeds
do
python main_deep_learning.py $dataset --log-file ../log/time_GRU.csv --supervised --nepochs 50 --seed $seed
done

View File

@@ -1,15 +1,15 @@
import numpy as np
import time
# from embeddings.embeddings import WordEmbeddings, StorageEmbeddings
from scipy.sparse import issparse
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import KFold
from joblib import Parallel, delayed
# from sklearn.feature_extraction.text import TfidfVectorizer
# from util_transformers.StandardizeTransformer import StandardizeTransformer
# from sklearn.decomposition import PCA
# from models.cnn_class_bu import CNN_pdr


def _sort_if_sparse(X):
@@ -40,154 +40,154 @@ class TrivialRejector:
    def best_params(self): return {}
# class FunnellingPolylingualClassifier:
#     """
#     This classifier projects each document d into a language-independent feature space where each dimension fi is the
#     decision score phi_l(d,ci) of an auxiliary classifier phi_l trained on category ci for documents in language l;
#     then trains one single classifier for all documents in this space, irrespective of their originary language
#     """
#     def __init__(self, first_tier_learner, meta_learner, first_tier_parameters=None, meta_parameters=None, folded_projections=1,
#                  calmode='cal', n_jobs=-1):
#         """
#         :param first_tier_learner: the learner used in the first-tier level
#         :param meta_learner: the learner used in the second-tier level
#         :param first_tier_parameters: parameters for the learner in the doc_projector
#         :param meta_parameters: parameters for the learner in the z-space
#         :param folded_projections: if 1 then the model trains the auxiliar classifiers with all training data and
#         :param calmode: 'cal' to calibrate the base classifiers, 'nocal' to use the decision_function instead, or
#         :param n_jobs: number of parallel threads
#         'sigmoid' to use the sigmoid of the decision_function
#         projects the data before training the final classifier; if greater than one, the training set is split in as
#         many folds as indicated, and the projected space is composed by concatenating each fold prediction based on
#         models trained on the remaining folds. This should increase the generality of the space to unseen data.
#         """
#         assert folded_projections>0, "positive number of folds expected"
#         assert calmode in ['cal','nocal','sigmoid'], 'unknown calmode'
#         assert calmode!='cal' or first_tier_learner.probability, 'calmode=cal requires the learner to have probability=True'
#
#         self.fist_tier_learner = first_tier_learner
#         self.meta_learner = meta_learner
#         self.fist_tier_parameters=first_tier_parameters
#         self.meta_parameters = meta_parameters
#         self.doc_projector = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs)
#         self.doc_projector_bu = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs)
#         self.folded_projections = folded_projections
#         self.n_jobs = n_jobs
#         self.calmode = calmode
#
#     def _projection(self, doc_projector, lX):
#         """
#         Decides the projection function to be applied; predict_proba if the base classifiers are calibrated or
#         decision_function if otherwise
#         :param doc_projector: the document projector (a NaivePolylingualClassifier)
#         :param lX: {lang:matrix} to train
#         :return: the projection, applied with predict_proba or decision_function
#         """
#         if self.calmode=='cal':
#             return doc_projector.predict_proba(lX)
#         else:
#             l_decision_scores = doc_projector.decision_function(lX)
#             if self.calmode=='sigmoid':
#                 def sigmoid(x): return 1 / (1 + np.exp(-x))
#                 for lang in l_decision_scores.keys():
#                     l_decision_scores[lang] = sigmoid(l_decision_scores[lang])
#             return l_decision_scores
#
#     def _get_zspace(self, lXtr, lYtr, lXproj=None, lYproj=None):
#         """
#         Produces the vector space of posterior probabilities (in case the first-tier is calibrated) or of
#         decision scores (if otherwise). This space is here named zspace.
#         :param lXtr: {lang:matrix} to train
#         :param lYtr: {lang:labels} to train
#         :param lXproj: {lang:matrix} to project (if None, then projects the lXtr)
#         :param lYproj: {lang:labels} to stack in the same order (if None, then lYtr will be stacked)
#         :return: the projection of lXproj documents into the Z-space defined by the confidence scores of language-specific
#         models trained on lXtr, and the lYproj labels stacked consistently
#         """
#         repair_empty_folds = True
#         if lXproj is None and lYproj is None:
#             lXproj, lYproj = lXtr, lYtr
#             repair_empty_folds = False
#
#         print('fitting the projectors... {}'.format(lXtr.keys()))
#         self.doc_projector.fit(lXtr, lYtr)
#
#         print('projecting the documents')
#         langs = list(lXtr.keys())
#         lZ = self._projection(self.doc_projector, lXproj)
#
#         # if repair_empty_folds: #empty folds are replaced by the posterior probabilities generated by the non-folded version
#         empty_categories = self.doc_projector.empty_categories
#         lZ_bu = self._projection(self.doc_projector_bu, lXproj)
#
#         for lang in langs:
#             repair = empty_categories[lang]
#             lZ[lang][:,repair] = lZ_bu[lang][:,repair]
#
#         Z = np.vstack([lZ[lang] for lang in langs])  # Z is the language independent space
#         zy = np.vstack([lYproj[lang] for lang in langs])
#         return Z, zy
#
#     def _get_zspace_folds(self, lX, ly):
#         self.doc_projector_bu.fit(lX, ly)
#
#         print('split of {} folds'.format(self.folded_projections))
#         skf = KFold(n_splits=self.folded_projections, shuffle=True)
#
#         Z, zy = [], []
#         lfold = {lang: list(skf.split(lX[lang], ly[lang])) for lang in lX.keys()}
#         for fold in range(self.folded_projections):
#             print('fitting the projectors ({}/{})...'.format(fold + 1, self.folded_projections))
#             lfoldXtr, lfoldYtr = {}, {}
#             lfoldXte, lfoldYte = {}, {}
#             for lang in lX.keys():
#                 train, test = lfold[lang][fold]
#                 lfoldXtr[lang] = lX[lang][train]
#                 lfoldYtr[lang] = ly[lang][train]
#                 lfoldXte[lang] = lX[lang][test]
#                 lfoldYte[lang] = ly[lang][test]
#             Zfold, zYfold = self._get_zspace(lfoldXtr, lfoldYtr, lfoldXte, lfoldYte)
#             Z.append(Zfold)
#             zy.append(zYfold)
#         # compose the Z-space as the union of all folded predictions
#         Z = np.vstack(Z)
#         zy = np.vstack(zy)
#         # refit the document projector with all examples to have a more reliable projector for test data
#         self.doc_projector = self.doc_projector_bu
#         return Z, zy
#
#     def fit(self, lX, ly, lZ=None, lzy=None):
#         tinit = time.time()
#         Z, zy = self._get_zspace(lX, ly) if self.folded_projections == 1 else self._get_zspace_folds(lX, ly)
#
#         #experimental: adds the posterior probabilities (computed outside) to the meta-classifier
#         if lZ is not None and lzy is not None:
#             zlangs = list(lZ.keys())
#             Z = np.vstack((Z, *[lZ[l] for l in zlangs]))
#             zy = np.vstack((zy, *[lzy[l] for l in zlangs]))
#
#         print('fitting the Z-space of shape={}'.format(Z.shape))
#         self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters, n_jobs=self.n_jobs)
#         self.model.fit(Z, zy)
#         self.time = time.time() - tinit
#
#         return self
#
#     def predict(self, lX, lZ=None):
#         """
#         :param lX: a dictionary {language_label: X csr-matrix}
#         :param lZ: a dictionary {language_label: Z matrix}; if specified, concats this representation
#         :return: a dictionary of predictions
#         """
#         lZ_ = self._projection(self.doc_projector, lX)
#         if lZ is not None:
#             lZ_ = {**lZ_, **lZ}
#         return _joblib_transform_multiling(self.model.predict, lZ_, n_jobs=self.n_jobs)
#
#     def best_params(self):
#         params = self.doc_projector.best_params()
#         params['meta'] = self.model.best_params()
#         return params
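The commented-out class above is the funnelling architecture described in its docstring: first-tier, per-language classifiers map documents onto per-category posterior probabilities (or decision scores), and a single meta-classifier is trained on the stacked, language-independent Z-space. A minimal sketch of that flow follows, assuming plain scikit-learn one-vs-rest SVMs instead of the repository's NaivePolylingualClassifier/MonolingualClassifier wrappers; names are illustrative.

# Illustrative sketch (assumption, not this repo's implementation) of the funnelling flow.
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

def funnelling_fit(lX: dict, lY: dict):
    """lX: {lang: feature matrix}; lY: {lang: (n_docs, n_cats) binary label matrix}."""
    # first tier: one probabilistic classifier per language, producing per-category posteriors
    first_tier = {lang: OneVsRestClassifier(SVC(probability=True)).fit(lX[lang], lY[lang]) for lang in lX}
    # Z-space: stack every language's posteriors into one language-independent training set
    Z = np.vstack([first_tier[lang].predict_proba(lX[lang]) for lang in lX])
    zy = np.vstack([lY[lang] for lang in lX])
    meta = OneVsRestClassifier(SVC(probability=True)).fit(Z, zy)  # second tier (meta-classifier)
    return first_tier, meta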
class NaivePolylingualClassifier:
@@ -322,411 +322,4 @@ class MonolingualClassifier:
        return self.model.predict(X)

    def best_params(self):
        return self.best_params_
class FunnellingMultimodal(FunnellingPolylingualClassifier):
def __init__(self,
we_path,
config,
first_tier_learner,
meta_learner,
first_tier_parameters=None,
meta_parameters=None,
folded_projections=1,
calmode='cal',
n_jobs=-1):
super().__init__(first_tier_learner,
meta_learner,
first_tier_parameters,
meta_parameters,
folded_projections,
calmode,
n_jobs)
self.pca_independent_space = PCA(n_components=50)
self.we_path = we_path
self.config = config
self.lang_word2idx = dict()
self.languages = []
self.lang_tfidf = {}
self.embedding_space = None
self.model = None
self.time = None
self.best_components = 'not set' # if auto optimize pca, it will store the optimal number of components
def vectorize(self, lX, prediction=False):
langs = list(lX.keys())
print(f'# tfidf-vectorizing docs')
if prediction:
for lang in langs:
assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
tfidf_vectorizer = self.lang_tfidf[lang]
lX[lang] = tfidf_vectorizer.transform(lX[lang])
return self
for lang in langs:
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
self.languages.append(lang)
tfidf_vectorizer.fit(lX[lang])
lX[lang] = tfidf_vectorizer.transform(lX[lang])
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
self.lang_tfidf[lang] = tfidf_vectorizer
return self
def _get_zspace(self, lXtr, lYtr):
print('\nfitting the projectors... {}'.format(list(lXtr.keys())))
self.doc_projector.fit(lXtr, lYtr)
print('\nprojecting the documents')
lZ = self._projection(self.doc_projector, lXtr)
return lZ, lYtr
def fit(self, lX, ly):
tinit = time.time()
print('Vectorizing documents...')
self.vectorize(lX)
for lang in self.languages:
print(f'{lang}->{lX[lang].shape}')
Z, zy = self._get_zspace(lX, ly)
if self.config['supervised'] or self.config['unsupervised']:
self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly)
_embedding_space = self.embedding_space.transform(self.config, lX)
if self.config['max_label_space'] == 0:
_cum_dimension = _embedding_space[list(_embedding_space.keys())[0]].shape[1]
if _cum_dimension - 300 > 0:
_temp = _cum_dimension - 300
else:
_temp = _cum_dimension
self.best_components = _temp
# h_stacking posterior probabilities with (U) and/or (S) matrices
for lang in self.languages:
Z[lang] = np.hstack((Z[lang], _embedding_space[lang]))
# stacking Z space vertically
_vertical_Z = np.vstack([Z[lang] for lang in self.languages])
_vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
self.standardizer = StandardizeTransformer()
_vertical_Z = self.standardizer.fit_transform(_vertical_Z)
# todo testing ...
# if self.config['post_pca']:
# print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
# self.pca_independent_space.fit(_vertical_Z)
# _vertical_Z = self.pca_independent_space.transform(_vertical_Z)
print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
n_jobs=self.n_jobs)
self.model.fit(_vertical_Z, _vertical_Zy)
self.time = time.time() - tinit
print(f'\nTotal training time elapsed: {round((self.time/60), 2)} min')
def predict(self, lX, ly):
print('Vectorizing documents')
self.vectorize(lX, prediction=True)
lZ = self._projection(self.doc_projector, lX)
if self.config['supervised'] or self.config['unsupervised']:
_embedding_space = self.embedding_space.transform(self.config, lX)
for lang in lX.keys():
lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang]))
for lang in lZ.keys():
print(lZ[lang].shape)
# todo testing
lZ[lang] = self.standardizer.transform(lZ[lang])
# if self.config['post_pca']:
# print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
# lZ[lang] = self.pca_independent_space.transform(lZ[lang])
return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
class PolylingualEmbeddingsClassifier:
"""
This classifier creates document embeddings as a tfidf-weighted average of the polylingual embeddings from the article:
@article{conneau2017word,
title={Word translation without parallel data},
author={Conneau, Alexis and Lample, Guillaume and Ranzato, Marc'Aurelio and Denoyer, Ludovic and J{\'e}gou, Herv{\'e}},
journal={arXiv preprint arXiv:1710.04087},
year={2017}
}
url: https://github.com/facebookresearch/MUSE
"""
def __init__(self, wordembeddings_path, config, learner, c_parameters=None, n_jobs=-1):
"""
:param wordembeddings_path: the path to the directory containing the polylingual embeddings
:param config: dictionary of configuration flags (e.g., 'supervised', 'unsupervised') selecting which embedding matrices are used
:param learner: the learner
:param c_parameters: parameters for learner
:param n_jobs: the number of concurrent threads
"""
self.wordembeddings_path = wordembeddings_path
self.config = config
self.learner = learner
self.c_parameters=c_parameters
self.n_jobs = n_jobs
self.lang_tfidf = {}
self.model = None
self.languages = []
self.lang_word2idx = dict()
self.embedding_space = None
def fit_vectorizers(self, lX):
for lang in lX.keys():
if lang not in self.lang_tfidf:
tfidf = TfidfVectorizer(sublinear_tf=True, use_idf=True) # text is already processed
docs = lX[lang]
tfidf.fit(docs)
self.lang_tfidf[lang] = tfidf
def vectorize(self, lX, prediction=False):
langs = list(lX.keys())
print(f'# tfidf-vectorizing docs')
if prediction:
for lang in langs:
assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
tfidf_vectorizer = self.lang_tfidf[lang]
lX[lang] = tfidf_vectorizer.transform(lX[lang])
return self
for lang in langs:
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
self.languages.append(lang)
tfidf_vectorizer.fit(lX[lang])
lX[lang] = tfidf_vectorizer.transform(lX[lang])
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
self.lang_tfidf[lang] = tfidf_vectorizer
return self
def embed(self, docs, lang):
assert lang in self.lang_tfidf, 'unknown language'
tfidf_vectorizer = self.lang_tfidf[lang]
V = tfidf_vectorizer.vocabulary_
Xweights = tfidf_vectorizer.transform(docs)
print('loading word embeddings for ' + lang)
we = WordEmbeddings.load(self.wordembeddings_path, lang)
nD = len(docs)
doc_vecs = np.zeros((nD, we.dim()))
for i, doc in enumerate(docs):
print('\r\tcomplete {:.3f}%'.format(100 * (i + 1) / nD), end='')
# averaging with tfidf (summing each word only once, since the frequency is already controlled)
for w in set(doc.split()):
if w in we and w in V:
doc_vecs[i] += (we[w] * Xweights[i, V[w]])
# works much worse with idf; works much worse with document l2-normalization
print()
return doc_vecs
def fit(self, lX, ly):
"""
:param lX: a dictionary {language_label: [list of preprocessed documents]}
:param ly: a dictionary {language_label: ndarray of shape (ndocs, ncats) binary labels}
:return: self
"""
tinit = time.time()
langs = list(lX.keys())
WEtr, Ytr = [], []
# self.fit_vectorizers(lX) # if already fit, does nothing
self.vectorize(lX)
# config = {'unsupervised' : False, 'supervised': True}
self.embedding_space = StorageEmbeddings(self.wordembeddings_path).fit(self.config, lX, self.lang_word2idx, ly)
WEtr = self.embedding_space.transform(self.config, lX)
# for lang in langs:
# WEtr.append(self.embed(lX[lang], lang)) # todo embed with other matrices
# Ytr.append(ly[lang])
WEtr = np.vstack([WEtr[lang] for lang in langs])
Ytr = np.vstack([ly[lang] for lang in langs])
self.embed_time = time.time() - tinit
print('fitting the WE-space of shape={}'.format(WEtr.shape))
self.model = MonolingualClassifier(base_learner=self.learner, parameters=self.c_parameters, n_jobs=self.n_jobs)
self.model.fit(WEtr, Ytr)
self.time = time.time() - tinit
return self
def predict(self, lX, lY):
"""
:param lX: a dictionary {language_label: [list of preprocessed documents]}
"""
assert self.model is not None, 'predict called before fit'
self.vectorize(lX, prediction=True)
langs = list(lX.keys())
lWEte = self.embedding_space.transform(self.config, lX)
# lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory
return _joblib_transform_multiling(self.model.predict, lWEte, n_jobs=self.n_jobs)
def predict_proba(self, lX):
"""
:param lX: a dictionary {language_label: [list of preprocessed documents]}
"""
assert self.model is not None, 'predict_proba called before fit'
langs = list(lX.keys())
lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory
return _joblib_transform_multiling(self.model.predict_proba, lWEte, n_jobs=self.n_jobs)
def best_params(self):
return self.model.best_params()
class MonolingualNetSvm:
"""
Testing: funnelling with a NN handling word-embedding compositionality. An ensemble of n SVMs (n equal to the
number of training languages) is first fit on the data, generating the documents' projections into the Z-space.
Next, these projections are fed to a single NN together with their respective document embeddings. The documents
are projected into the embedding space while preserving its dimensionality (output dim is 300). These document
embeddings are horizontally concatenated with the respective posterior projections and passed through a fully
connected layer with sigmoid activation and output dim equal to the number of target classes (a hedged sketch of
this output head follows the class).
# TODO ATM testing with only 1 language
"""
def __init__(self, lX, ly, first_tier_learner, first_tier_parameters, n_jobs):
self.lX = lX
self.ly = ly
# SVM Attributes
self.doc_projector = NaivePolylingualClassifier(first_tier_learner, first_tier_parameters,
n_jobs=n_jobs)
self.calmode = 'cal'
self.languages = []
self.lang_word2idx = dict()
self.lang_tfidf = {}
self.base_learner = 'TODO'
self.parameters = 'TODO'
# NN Attributes
self.NN = 'TODO'
def load_preprocessed(self):
"""
In order to speed up the process, documents are first tokenized in the "main" script. Here, the tokenized docs,
word_index, and targets are loaded.
:return: dict[lang] = (word_index, tokenized_docs, targets)
"""
import pickle
with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'rb') as f:
return pickle.load(f)
def _build_embedding_matrix(self, lang, word_index):
"""
build embedding matrix by filtering out OOV embeddings
:param lang:
:param word_index:
:return: filtered embedding matrix
"""
from embeddings.embeddings import EmbeddingsAligned
type = 'MUSE'
path = '/home/andreapdr/CLESA/'
MUSE = EmbeddingsAligned(type, path, lang, word_index.keys())
return MUSE
def get_data_and_embed(self, data_dict):
from keras.preprocessing.sequence import pad_sequences
langs = data_dict.keys()
lang_embedding_matrix = dict()
nn_lXtr = dict()
nn_lytr = dict()
for lang in langs:
lang_embedding_matrix[lang] = self._build_embedding_matrix(lang, data_dict[lang][0])
nn_lXtr[lang] = pad_sequences(data_dict[lang][1], 100, padding='post')
nn_lytr[lang] = [data_dict[lang][2]]
return nn_lXtr, nn_lytr, lang_embedding_matrix
def svm_vectorize(self, lX, prediction=False):
langs = list(lX.keys())
print(f'# tfidf-vectorizing docs')
if prediction:
for lang in langs:
assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
tfidf_vectorizer = self.lang_tfidf[lang]
lX[lang] = tfidf_vectorizer.transform(lX[lang])
return self
for lang in langs:
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
self.languages.append(lang)
tfidf_vectorizer.fit(lX[lang])
lX[lang] = tfidf_vectorizer.transform(lX[lang])
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
self.lang_tfidf[lang] = tfidf_vectorizer
return lX
def _get_zspace(self, lXtr, lYtr):
print('\nfitting the projectors... {}'.format(list(lXtr.keys())))
self.doc_projector.fit(lXtr, lYtr)
print('\nprojecting the documents')
lZ = self._projection(self.doc_projector, lXtr)
return lZ, lYtr
def _projection(self, doc_projector, lX):
"""
Decides the projection function to be applied: predict_proba if the base classifiers are calibrated,
decision_function otherwise
:param doc_projector: the document projector (a NaivePolylingualClassifier)
:param lX: {lang:matrix} to train
:return: the projection, applied with predict_proba or decision_function
"""
if self.calmode=='cal':
return doc_projector.predict_proba(lX)
else:
l_decision_scores = doc_projector.decision_function(lX)
if self.calmode=='sigmoid':
def sigmoid(x): return 1 / (1 + np.exp(-x))
for lang in l_decision_scores.keys():
l_decision_scores[lang] = sigmoid(l_decision_scores[lang])
return l_decision_scores
def fit(self):
"""
# 1. Fit SVM to generate posterior probabilities:
# 1.1 Gather documents and vectorize them as in other SVM classifiers
# 2. Fit NN
# 2.1 Gather documents and build NN dataset by indexing wrt embedding matrix
# 2.2 Fit NN first-layer to generate compositional doc embedding
# 2.3 H-stack doc-embed and posterior P
# 2.4 Feed stacked vector to output layer (sigmoid act): output Nc
# 2.5 Train it...
"""
# load pre-processed data
data_dict = self.load_preprocessed()
# build embedding matrices and neural network document training set
nn_lXtr, nn_lytr, lang_embedding_matrix = self.get_data_and_embed(data_dict)
# TF-IDF vectorizing documents for the SVM classifier
svm_lX = self.svm_vectorize(self.lX)
# just testing on a smaller subset of data
test_svm_lX = dict()
test_svm_ly = dict()
test_svm_lX['it'] = svm_lX['it'][:10, :]
test_svm_ly['it'] = self.ly['it'][:10, :]
test_nn_data = nn_lXtr['it'][:10]
# projecting documents into the Z-space via the SVM ensemble
svm_Z, _ = self._get_zspace(test_svm_lX, test_svm_ly)
# initializing net and forward pass
net = CNN_pdr(73, 1, 300, len(lang_embedding_matrix['it'].vectors), 300, lang_embedding_matrix['it'].vectors)
out = net.forward(test_nn_data, svm_Z['it'])
print('TODO')
def net(self):
pass
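# Hedged sketch (illustrative only, not wired into MonolingualNetSvm above): the output head described in
# the class docstring, i.e. a document embedding horizontally concatenated with the SVM posterior
# probabilities and passed through a fully connected layer with sigmoid activation. The default dimensions
# below (300-dim embeddings, 73 classes) are assumptions taken from the surrounding test code.
import torch
import torch.nn as nn

class _PosteriorFusionHead(nn.Module):
    def __init__(self, embed_dim=300, n_posteriors=73, n_classes=73):
        super().__init__()
        self.fc = nn.Linear(embed_dim + n_posteriors, n_classes)

    def forward(self, doc_embed, posteriors):
        z = torch.cat([doc_embed, posteriors], dim=1)   # horizontal concatenation
        return torch.sigmoid(self.fc(z))                # per-class probabilities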

View File

@ -10,7 +10,7 @@ import time
from sklearn.decomposition import PCA from sklearn.decomposition import PCA
from joblib import Parallel, delayed from joblib import Parallel, delayed
from scipy.sparse import issparse, vstack, hstack from scipy.sparse import issparse, vstack, hstack
from transformers.StandardizeTransformer import StandardizeTransformer from util_transformers.StandardizeTransformer import StandardizeTransformer
from util.SIF_embed import remove_pc from util.SIF_embed import remove_pc
from sklearn.preprocessing import normalize from sklearn.preprocessing import normalize
from sklearn.svm import SVC from sklearn.svm import SVC
@ -127,22 +127,26 @@ class PosteriorProbabilitiesEmbedder:
print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} the documents') print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} the documents')
return self.doc_projector.predict_proba(lX) return self.doc_projector.predict_proba(lX)
def _get_output_dim(self):
return len(self.doc_projector.model['da'].model.classes_)
class MuseEmbedder: class MuseEmbedder:
def __init__(self, path, lV=None, l2=True, n_jobs=-1, featureweight=FeatureWeight()): def __init__(self, path, lV=None, l2=True, n_jobs=-1, featureweight=FeatureWeight(), sif=False):
self.path=path self.path=path
self.lV = lV self.lV = lV
self.l2 = l2 self.l2 = l2
self.n_jobs = n_jobs self.n_jobs = n_jobs
self.featureweight = featureweight self.featureweight = featureweight
self.sif = sif
def fit(self, lX, ly, lV=None): def fit(self, lX, ly, lV=None):
assert lV is not None or self.lV is not None, 'lV not specified' assert lV is not None or self.lV is not None, 'lV not specified'
self.langs = sorted(lX.keys()) self.langs = sorted(lX.keys())
self.MUSE = load_muse_embeddings(self.path, self.langs, self.n_jobs) self.MUSE = load_muse_embeddings(self.path, self.langs, self.n_jobs)
lWordList = {l:self._get_wordlist_from_word2index(lV[l]) for l in self.langs} lWordList = {l:self._get_wordlist_from_word2index(lV[l]) for l in self.langs}
self.MUSE = {l:Muse.extract(lWordList[l]).numpy() for l,Muse in self.MUSE} self.MUSE = {l:Muse.extract(lWordList[l]).numpy() for l,Muse in self.MUSE.items()}
self.featureweight.fit(lX, ly) self.featureweight.fit(lX, ly)
return self return self
@ -150,7 +154,7 @@ class MuseEmbedder:
MUSE = self.MUSE MUSE = self.MUSE
lX = self.featureweight.transform(lX) lX = self.featureweight.transform(lX)
XdotMUSE = Parallel(n_jobs=self.n_jobs)( XdotMUSE = Parallel(n_jobs=self.n_jobs)(
delayed(XdotM)(lX[lang], MUSE[lang]) for lang in self.langs delayed(XdotM)(lX[lang], MUSE[lang], self.sif) for lang in self.langs
) )
lMuse = {l: XdotMUSE[i] for i, l in enumerate(self.langs)} lMuse = {l: XdotMUSE[i] for i, l in enumerate(self.langs)}
lMuse = _normalize(lMuse, self.l2) lMuse = _normalize(lMuse, self.l2)
@ -162,14 +166,18 @@ class MuseEmbedder:
def _get_wordlist_from_word2index(self, word2index): def _get_wordlist_from_word2index(self, word2index):
return list(zip(*sorted(word2index.items(), key=lambda x: x[1])))[0] return list(zip(*sorted(word2index.items(), key=lambda x: x[1])))[0]
def _get_output_dim(self):
return self.MUSE['da'].shape[1]
class WordClassEmbedder: class WordClassEmbedder:
def __init__(self, l2=True, n_jobs=-1, max_label_space=300, featureweight=FeatureWeight()): def __init__(self, l2=True, n_jobs=-1, max_label_space=300, featureweight=FeatureWeight(), sif=False):
self.n_jobs = n_jobs self.n_jobs = n_jobs
self.l2 = l2 self.l2 = l2
self.max_label_space=max_label_space self.max_label_space=max_label_space
self.featureweight = featureweight self.featureweight = featureweight
self.sif = sif
def fit(self, lX, ly, lV=None): def fit(self, lX, ly, lV=None):
self.langs = sorted(lX.keys()) self.langs = sorted(lX.keys())
@ -184,7 +192,7 @@ class WordClassEmbedder:
lWCE = self.lWCE lWCE = self.lWCE
lX = self.featureweight.transform(lX) lX = self.featureweight.transform(lX)
XdotWCE = Parallel(n_jobs=self.n_jobs)( XdotWCE = Parallel(n_jobs=self.n_jobs)(
delayed(XdotM)(lX[lang], lWCE[lang])for lang in self.langs delayed(XdotM)(lX[lang], lWCE[lang], self.sif)for lang in self.langs
) )
lwce = {l: XdotWCE[i] for i, l in enumerate(self.langs)} lwce = {l: XdotWCE[i] for i, l in enumerate(self.langs)}
lwce = _normalize(lwce, self.l2) lwce = _normalize(lwce, self.l2)
@ -193,6 +201,9 @@ class WordClassEmbedder:
def fit_transform(self, lX, ly, lV=None): def fit_transform(self, lX, ly, lV=None):
return self.fit(lX, ly).transform(lX) return self.fit(lX, ly).transform(lX)
def _get_output_dim(self):
return 73
class DocEmbedderList: class DocEmbedderList:
@ -201,6 +212,7 @@ class DocEmbedderList:
if len(embedder_list)==0: embedder_list=[] if len(embedder_list)==0: embedder_list=[]
self.embedders = embedder_list self.embedders = embedder_list
self.aggregation = aggregation self.aggregation = aggregation
print(f'Aggregation mode: {self.aggregation}')
def fit(self, lX, ly, lV=None): def fit(self, lX, ly, lV=None):
for transformer in self.embedders: for transformer in self.embedders:
@ -238,16 +250,25 @@ class DocEmbedderList:
langs = sorted(lX.keys()) langs = sorted(lX.keys())
lZparts = {l: None for l in langs} lZparts = {l: None for l in langs}
# min_dim = min([transformer._get_output_dim() for transformer in self.embedders])
min_dim = 300
for transformer in self.embedders: for transformer in self.embedders:
lZ = transformer.transform(lX) lZ = transformer.transform(lX)
nC = min([lZ[lang].shape[1] for lang in langs])
for l in langs: for l in langs:
Z = lZ[l] Z = lZ[l]
if Z.shape[1] > min_dim:
print(f'Z-space matrix has more dimensions ({Z.shape[1]}) than the smallest representation ({min_dim}). '
f'Applying PCA(n_components={min_dim})')
pca = PCA(n_components=min_dim)
Z = pca.fit(Z).transform(Z)
if lZparts[l] is None: if lZparts[l] is None:
lZparts[l] = Z lZparts[l] = Z
else: else:
lZparts[l] += Z lZparts[l] += Z
n_transformers = len(self.embedders) n_transformers = len(self.embedders)
nC = min([lZparts[lang].shape[1] for lang in langs])
return {l:lZparts[l] / n_transformers for l in langs} return {l:lZparts[l] / n_transformers for l in langs}
@ -266,7 +287,7 @@ class FeatureSet2Posteriors:
self.transformer = transformer self.transformer = transformer
self.l2=l2 self.l2=l2
self.n_jobs = n_jobs self.n_jobs = n_jobs
self.prob_classifier = MetaClassifier(SVC(kernel='rbf', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs) self.prob_classifier = MetaClassifier(SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
def fit(self, lX, ly, lV=None): def fit(self, lX, ly, lV=None):
if lV is None and hasattr(self.transformer, 'lV'): if lV is None and hasattr(self.transformer, 'lV'):
@ -412,11 +433,13 @@ def word_class_embedding_matrix(X, Y, max_label_space=300):
return WCE return WCE
def XdotM(X,M): def XdotM(X,M, sif):
# return X.dot(M) # return X.dot(M)
# print(f'X={X.shape}, M={M.shape}') print(f'X={X.shape}, M={M.shape}')
E = X.dot(M) E = X.dot(M)
E = remove_pc(E, npc=1) if sif:
print("removing pc...")
E = remove_pc(E, npc=1)
return E return E

View File

@ -1,92 +0,0 @@
from optparse import OptionParser
from util.results import PolylingualClassificationResults
from dataset_builder import MultilingualDataset
from keras.preprocessing.text import Tokenizer
from learning.learners import MonolingualNetSvm
from sklearn.svm import SVC
import pickle
parser = OptionParser()
parser.add_option("-d", "--dataset", dest="dataset",
help="Path to the multilingual dataset processed and stored in .pickle format",
default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
help="Optimize hyperparameters", default=False)
parser.add_option("-s", "--set_c", dest="set_c",type=float,
help="Set the C parameter", default=1)
(op, args) = parser.parse_args()
###################################################################################################################
def get_learner(calibrate=False, kernel='linear'):
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto')
def get_params(dense=False):
if not op.optimc:
return None
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
kernel = 'rbf' if dense else 'linear'
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
# PREPROCESS TEXT AND SAVE IT ... both for SVM and NN
def preprocess_data(lXtr, lXte, lytr, lyte):
tokenized_tr = dict()
tokenized_te = dict()
for lang in lXtr.keys():
alltexts = ' '.join(lXtr[lang])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(alltexts.split(' '))
tokenizer.oov_token = len(tokenizer.word_index)+1
# dumping train set
sequences_tr = tokenizer.texts_to_sequences(lXtr[lang])
tokenized_tr[lang] = (tokenizer.word_index, sequences_tr, lytr[lang])
# dumping test set
sequences_te = tokenizer.texts_to_sequences(lXte[lang])
tokenized_te[lang] = (tokenizer.word_index, sequences_te, lyte[lang])
with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'wb') as f:
pickle.dump(tokenized_tr, f)
with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_test.pickle', 'wb') as f:
pickle.dump(tokenized_te, f)
print('Successfully dumped data')
# def load_preprocessed():
# with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'rb') as f:
# return pickle.load(f)
#
# def build_embedding_matrix(lang, word_index):
# type = 'MUSE'
# path = '/home/andreapdr/CLESA/'
# MUSE = EmbeddingsAligned(type, path, lang, word_index.keys())
# return MUSE
########## MAIN #################################################################################################
if __name__ == '__main__':
results = PolylingualClassificationResults('./results/NN_FPEC_results.csv')
data = MultilingualDataset.load(op.dataset)
lXtr, lytr = data.training()
lXte, lyte = data.test()
if op.set_c != -1:
meta_parameters = None
else:
meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}]
test_architecture = MonolingualNetSvm(lXtr,
lytr,
first_tier_learner=get_learner(calibrate=True),
first_tier_parameters=None,
n_jobs=1)
test_architecture.fit()

View File

@ -1,6 +1,6 @@
import argparse import argparse
import torch.nn as nn import torch.nn as nn
from torch.optim.lr_scheduler import StepLR from torch.optim.lr_scheduler import StepLR, MultiStepLR
from dataset_builder import MultilingualDataset from dataset_builder import MultilingualDataset
from learning.transformers import load_muse_embeddings from learning.transformers import load_muse_embeddings
from models.lstm_class import RNNMultilingualClassifier from models.lstm_class import RNNMultilingualClassifier
@ -9,8 +9,6 @@ from util.early_stop import EarlyStopping
from util.common import * from util.common import *
from util.file import create_if_not_exist from util.file import create_if_not_exist
from time import time from time import time
from embeddings.pretrained import *
from os.path import join
from tqdm import tqdm from tqdm import tqdm
from util.evaluation import evaluate from util.evaluation import evaluate
from util.file import get_file_name from util.file import get_file_name
@ -100,7 +98,7 @@ def main():
# Loading the dataset # Loading the dataset
data = MultilingualDataset.load(opt.dataset) data = MultilingualDataset.load(opt.dataset)
# data.set_view(languages=['de', 'fr', 'sv', 'da', 'es', 'it']) data.set_view(languages=['de', 'fr']) #, 'it', 'en']) # 'sv', 'da', 'es', 'it'])
data.show_dimensions() data.show_dimensions()
langs = data.langs() langs = data.langs()
l_devel_raw, l_devel_target = data.training(target_as_csr=True) l_devel_raw, l_devel_target = data.training(target_as_csr=True)
@ -108,6 +106,7 @@ def main():
# Loading the MUSE pretrained embeddings (only if requested) # Loading the MUSE pretrained embeddings (only if requested)
lpretrained, lpretrained_vocabulary = load_pretrained_embeddings(opt.we_path, langs) lpretrained, lpretrained_vocabulary = load_pretrained_embeddings(opt.we_path, langs)
# lpretrained_vocabulary = none_dict(langs) # do not keep track of words known in pretrained embeddings vocabulary that are also present in test set
# Data preparation: indexing / splitting / embedding matrices (pretrained + supervised) / posterior probs # Data preparation: indexing / splitting / embedding matrices (pretrained + supervised) / posterior probs
multilingual_index = MultilingualIndex() multilingual_index = MultilingualIndex()
@ -115,10 +114,26 @@ def main():
multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=opt.seed) multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=opt.seed)
multilingual_index.embedding_matrices(lpretrained, opt.supervised) multilingual_index.embedding_matrices(lpretrained, opt.supervised)
if opt.posteriors: if opt.posteriors:
lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=opt.svm_max_docs) lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=5000, store_posteriors=True) #stored_post=True) #opt.svm_max_docs)
else: else:
lPtr, lPva, lPte = None, None, None lPtr, lPva, lPte = None, None, None
# just_test = False
# if just_test:
#
# model = torch.load(
# '../checkpoint/rnn(H512)-Muse-WCE-Posteriors-(trainable)-jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle')
# criterion = torch.nn.BCEWithLogitsLoss().cuda()
#
# # batcher_train = Batch(opt.batch_size, batches_per_epoch=10, languages=langs, lpad=multilingual_index.l_pad())
#
# batcher_eval = Batch(opt.batch_size, batches_per_epoch=-1, languages=langs, lpad=multilingual_index.l_pad())
# l_test_index = multilingual_index.l_test_index()
# epoch = 1
# tinit = time()
# test(model, batcher_eval, l_test_index, lPte, l_test_target, tinit, epoch, logfile, criterion, 'te')
# exit('Loaded')
# Model initialization # Model initialization
model = init_Net(data.num_categories(), multilingual_index) model = init_Net(data.num_categories(), multilingual_index)
@ -130,7 +145,7 @@ def main():
tinit = time() tinit = time()
create_if_not_exist(opt.checkpoint_dir) create_if_not_exist(opt.checkpoint_dir)
early_stop = EarlyStopping(model, patience=opt.patience, checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}') early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience, checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}')
l_train_index, l_train_target = multilingual_index.l_train() l_train_index, l_train_target = multilingual_index.l_train()
l_val_index, l_val_target = multilingual_index.l_val() l_val_index, l_val_target = multilingual_index.l_val()
@ -155,7 +170,6 @@ def main():
break break
# training is over # training is over
# restores the best model according to the Mf1 of the validation set (only when plotmode==False) # restores the best model according to the Mf1 of the validation set (only when plotmode==False)
# stoptime = early_stop.stop_time - tinit # stoptime = early_stop.stop_time - tinit
# stopepoch = early_stop.best_epoch # stopepoch = early_stop.best_epoch
@ -164,6 +178,8 @@ def main():
if opt.plotmode==False: if opt.plotmode==False:
print('-' * 80) print('-' * 80)
print('Training over. Performing final evaluation') print('Training over. Performing final evaluation')
# torch.cuda.empty_cache()
model = early_stop.restore_checkpoint() model = early_stop.restore_checkpoint()
if opt.val_epochs>0: if opt.val_epochs>0:
@ -183,10 +199,14 @@ def get_lr(optimizer):
def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile, criterion, optim, epoch, method_name): def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile, criterion, optim, epoch, method_name):
_dataset_path = opt.dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1]
loss_history = [] loss_history = []
model.train() model.train()
for idx, (batch, post, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, lytr)): for idx, (batch, post, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, lytr)):
optim.zero_grad() optim.zero_grad()
_out = model(batch, post, lang)
loss = criterion(model(batch, post, lang), target) loss = criterion(_out, target)
loss.backward() loss.backward()
clip_gradient(model) clip_gradient(model)
@ -195,7 +215,7 @@ def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile,
if idx % opt.log_interval == 0: if idx % opt.log_interval == 0:
interval_loss = np.mean(loss_history[-opt.log_interval:]) interval_loss = np.mean(loss_history[-opt.log_interval:])
print(f'{opt.dataset} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}') print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
mean_loss = np.mean(interval_loss) mean_loss = np.mean(interval_loss)
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit) logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
@ -203,6 +223,8 @@ def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile,
def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logfile, criterion, measure_prefix): def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logfile, criterion, measure_prefix):
loss_history = []
model.eval() model.eval()
langs = sorted(ltest_index.keys()) langs = sorted(ltest_index.keys())
predictions = {l:[] for l in langs} predictions = {l:[] for l in langs}
@ -214,6 +236,7 @@ def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logf
prediction = predict(logits) prediction = predict(logits)
predictions[lang].append(prediction) predictions[lang].append(prediction)
yte_stacked[lang].append(target.detach().cpu().numpy()) yte_stacked[lang].append(target.detach().cpu().numpy())
loss_history.append(loss)
ly = {l:np.vstack(yte_stacked[l]) for l in langs} ly = {l:np.vstack(yte_stacked[l]) for l in langs}
ly_ = {l:np.vstack(predictions[l]) for l in langs} ly_ = {l:np.vstack(predictions[l]) for l in langs}
@ -224,17 +247,15 @@ def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logf
metrics.append([macrof1, microf1, macrok, microk]) metrics.append([macrof1, microf1, macrok, microk])
if measure_prefix=='te': if measure_prefix=='te':
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
# results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
# (config['max_label_space'], classifier.best_components),
# config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
# lang, macrof1, microf1, macrok, microk, '')
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0) Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]') print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=tend) mean_loss = np.mean(loss_history)
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mf1, timelapse=tend) logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-accuracy', value=acc, timelapse=tend) logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit)
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=loss, timelapse=tend) logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit)
return Mf1 return Mf1

View File

@ -1,7 +1,7 @@
import os import os
from dataset_builder import MultilingualDataset from dataset_builder import MultilingualDataset
# from learning.learners import * # from learning.learners import *
from learning.learners import FunnellingMultimodal # from learning.learners import FunnellingMultimodal
from learning.transformers import Funnelling, PosteriorProbabilitiesEmbedder, MetaClassifier, \ from learning.transformers import Funnelling, PosteriorProbabilitiesEmbedder, MetaClassifier, \
TfidfVectorizerMultilingual, DocEmbedderList, WordClassEmbedder, MuseEmbedder, FeatureSet2Posteriors, Voting TfidfVectorizerMultilingual, DocEmbedderList, WordClassEmbedder, MuseEmbedder, FeatureSet2Posteriors, Voting
from util.evaluation import * from util.evaluation import *
@ -14,14 +14,14 @@ from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
parser = OptionParser() parser = OptionParser()
parser.add_option("-d", "--dataset", dest="dataset", # parser.add_option("-d", "--dataset", dest="dataset",
help="Path to the multilingual dataset processed and stored in .pickle format", # help="Path to the multilingual dataset processed and stored in .pickle format",
default="../rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle") # default="../rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
parser.add_option("-o", "--output", dest="output", parser.add_option("-o", "--output", dest="output",
help="Result file", type=str, default='./results/results.csv') help="Result file", type=str, default='./results/results.csv')
parser.add_option("-P", "--probs", dest="probs", action='store_true', parser.add_option("-P", "--probs", dest="posteriors", action='store_true',
help="Add posterior probabilities to the document embedding representation", default=False) help="Add posterior probabilities to the document embedding representation", default=False)
parser.add_option("-S", "--supervised", dest="supervised", action='store_true', parser.add_option("-S", "--supervised", dest="supervised", action='store_true',
@ -46,6 +46,9 @@ parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
help="If smaller than number of target classes, PCA will be applied to supervised matrix. ", help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
default=300) default=300)
parser.add_option("-r", "--remove-pc", dest="sif", action='store_true',
help="Remove common component when computing dot product of word embedding matrices", default=False)
# parser.add_option("-u", "--upca", dest="max_labels_U", type=int, # parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
# help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix." # help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix."
# " If set to 0 it will automatically search for the best number of components", default=300) # " If set to 0 it will automatically search for the best number of components", default=300)
@ -72,15 +75,18 @@ def get_params(dense=False):
if __name__ == '__main__': if __name__ == '__main__':
(op, args) = parser.parse_args() (op, args) = parser.parse_args()
assert exists(op.dataset), 'Unable to find file '+str(op.dataset) assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)'
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option' dataset = args[0]
assert op.probs or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
dataset_file = os.path.basename(op.dataset) assert exists(dataset), 'Unable to find file '+str(dataset)
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
dataset_file = os.path.basename(dataset)
results = PolylingualClassificationResults(op.output) results = PolylingualClassificationResults(op.output)
data = MultilingualDataset.load(op.dataset) data = MultilingualDataset.load(dataset)
data.show_dimensions() data.show_dimensions()
lXtr, lytr = data.training() lXtr, lytr = data.training()
@ -88,8 +94,9 @@ if __name__ == '__main__':
meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}] meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
result_id = f'{dataset_file}_Prob{op.probs}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}' # result_id = f'{dataset_file}_Prob{op.posteriors}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}'
result_id = f'{dataset_file}_ProbPost={op.posteriors}_WCE={op.supervised}(PCA={op.max_labels_S})_' \
f'MUSE={op.pretrained}_weight={"todo"}_l2={"todo"}_zscore={"todo"}{"_optimC" if op.optimc else ""}'
print(f'{result_id}') print(f'{result_id}')
# text preprocessing # text preprocessing
@ -100,7 +107,7 @@ if __name__ == '__main__':
lV = tfidfvectorizer.vocabulary() lV = tfidfvectorizer.vocabulary()
classifiers = [] classifiers = []
if op.probs: if op.posteriors:
classifiers.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None)) classifiers.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None))
if op.supervised: if op.supervised:
classifiers.append(FeatureSet2Posteriors(WordClassEmbedder(max_label_space=op.max_labels_S))) classifiers.append(FeatureSet2Posteriors(WordClassEmbedder(max_label_space=op.max_labels_S)))
@ -115,13 +122,37 @@ if __name__ == '__main__':
print('\n# Evaluating ...') print('\n# Evaluating ...')
l_eval = evaluate_method(classifier, lXte, lyte) l_eval = evaluate_method(classifier, lXte, lyte)
# building a short configuration identifier to be printed in the log
_id = ''
_id_conf = [op.posteriors, op.supervised, op.pretrained]
_id_name = ['+P', '+W', '+M']
for i, conf in enumerate(_id_conf):
if conf:
_id += _id_name[i]
_id = _id.lstrip('+')
_dataset_path = dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1]
metrics = [] metrics = []
for lang in lXte.keys(): for lang in lXte.keys():
macrof1, microf1, macrok, microk = l_eval[lang] macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk]) metrics.append([macrof1, microf1, macrok, microk])
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
# results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'], results.add_row(method='Voting',
# (config['max_label_space'], classifier.best_components), learner='svm',
# config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time, optimp=op.optimc,
# lang, macrof1, microf1, macrok, microk, '') sif=op.sif,
zscore='todo',
l2='todo',
wescaler='todo',
pca=op.max_labels_S,
id=_id,
dataset=dataset_id,
time='todo',
lang=lang,
macrof1=macrof1,
microf1=microf1,
macrok=macrok,
microk=microk,
notes='')
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0)) print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))

View File

@ -11,7 +11,7 @@ from sklearn.svm import SVC
parser = OptionParser(usage="usage: %prog datapath [options]") parser = OptionParser(usage="usage: %prog datapath [options]")
parser.add_option("-o", "--output", dest="output", parser.add_option("-o", "--output", dest="output",
help="Result file", type=str, default='./results/results.csv') help="Result file", type=str, default='multiModal_log.csv')
parser.add_option("-P", "--posteriors", dest="posteriors", action='store_true', parser.add_option("-P", "--posteriors", dest="posteriors", action='store_true',
help="Add posterior probabilities to the document embedding representation", default=False) help="Add posterior probabilities to the document embedding representation", default=False)
@ -22,8 +22,8 @@ parser.add_option("-S", "--supervised", dest="supervised", action='store_true',
parser.add_option("-U", "--pretrained", dest="pretrained", action='store_true', parser.add_option("-U", "--pretrained", dest="pretrained", action='store_true',
help="Add pretrained MUSE embeddings to the document embedding representation", default=False) help="Add pretrained MUSE embeddings to the document embedding representation", default=False)
parser.add_option("--nol2", dest="nol2", action='store_true', parser.add_option("--l2", dest="l2", action='store_true',
help="Deactivates l2 normalization as a post-processing for the document embedding views", default=False) help="Activates l2 normalization as a post-processing for the document embedding views", default=False)
parser.add_option("--allprob", dest="allprob", action='store_true', parser.add_option("--allprob", dest="allprob", action='store_true',
help="All views are generated as posterior probabilities. This affects the supervised and pretrained " help="All views are generated as posterior probabilities. This affects the supervised and pretrained "
@ -48,11 +48,28 @@ parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
help="If smaller than number of target classes, PCA will be applied to supervised matrix. ", help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
default=300) default=300)
parser.add_option("-r", "--remove-pc", dest="sif", action='store_true',
help="Remove common component when computing dot product of word embedding matrices", default=False)
parser.add_option("-z", "--zscore", dest="zscore", action='store_true',
help="Z-score normalize matrices (WCE and MUSE)", default=False)
parser.add_option("-a", "--agg", dest="agg", action='store_true',
help="Set aggregation function of the common Z-space to average (Default: concatenation)", default=False)
def get_learner(calibrate=False, kernel='linear'): def get_learner(calibrate=False, kernel='linear'):
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, gamma='auto') return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, gamma='auto')
def get_params():
if not op.optimc:
return None
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
kernel = 'rbf'
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
####################################################################################################################### #######################################################################################################################
@ -64,17 +81,23 @@ if __name__ == '__main__':
assert exists(dataset), 'Unable to find file '+str(dataset) assert exists(dataset), 'Unable to find file '+str(dataset)
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option' assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed' assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
l2=(op.nol2==False) l2=op.l2
dataset_file = os.path.basename(dataset) dataset_file = os.path.basename(dataset)
results = PolylingualClassificationResults(op.output) results = PolylingualClassificationResults('../log/' + op.output)
allprob='Prob' if op.allprob else '' allprob='Prob' if op.allprob else ''
result_id = f'{dataset_file}_ProbPost={op.posteriors}_{allprob}WCE={op.supervised}(PCA={op.max_labels_S})_{allprob}' \ result_id = f'{dataset_file}_ProbPost={op.posteriors}_{allprob}WCE={op.supervised}(PCA={op.max_labels_S})_{allprob}' \
f'MUSE={op.pretrained}_weight={op.feat_weight}_l2={l2}{"_optimC" if op.optimc else ""}' f'MUSE={op.pretrained}_weight={op.feat_weight}_l2={l2}_zscore={op.zscore}{"_optimC" if op.optimc else ""}'
print(f'{result_id}') print(f'{result_id}')
# set z-score range - with slice(0,0) the mean will be 0 and the std 1, so standardization has no effect
standardize_range = slice(0,0)
if op.zscore:
standardize_range = None
data = MultilingualDataset.load(dataset) data = MultilingualDataset.load(dataset)
# data.set_view(languages=['fr', 'it'])
data.show_dimensions() data.show_dimensions()
lXtr, lytr = data.training() lXtr, lytr = data.training()
lXte, lyte = data.test() lXte, lyte = data.test()
@ -86,23 +109,23 @@ if __name__ == '__main__':
feat_weighting = FeatureWeight(op.feat_weight, agg='mean') feat_weighting = FeatureWeight(op.feat_weight, agg='mean')
# # document embedding modules # # document embedding modules
doc_embedder = DocEmbedderList(aggregation='concat') doc_embedder = DocEmbedderList(aggregation='mean' if op.agg else 'concat')
if op.posteriors: if op.posteriors:
doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True, kernel='linear'), l2=l2)) doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True, kernel='linear'), l2=l2))
if op.supervised: if op.supervised:
wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting) wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, sif=op.sif)
if op.allprob: if op.allprob:
wce = FeatureSet2Posteriors(wce, l2=l2) wce = FeatureSet2Posteriors(wce, l2=l2)
doc_embedder.append(wce) doc_embedder.append(wce)
if op.pretrained: if op.pretrained:
muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting) muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif)
if op.allprob: if op.allprob:
muse = FeatureSet2Posteriors(muse, l2=l2) muse = FeatureSet2Posteriors(muse, l2=l2)
doc_embedder.append(muse) doc_embedder.append(muse)
# metaclassifier # metaclassifier
meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}] meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'), meta_parameters=meta_parameters) meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'), meta_parameters=get_params(), standardize_range=standardize_range)
# ensembling the modules # ensembling the modules
classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta) classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)
@ -113,13 +136,40 @@ if __name__ == '__main__':
print('\n# Evaluating ...') print('\n# Evaluating ...')
l_eval = evaluate_method(classifier, lXte, lyte) l_eval = evaluate_method(classifier, lXte, lyte)
# building a short configuration identifier to be printed in the log
_id = ''
_id_conf = [op.posteriors, op.supervised, op.pretrained]
_id_name = ['+P', '+W', '+M']
for i, conf in enumerate(_id_conf):
if conf:
_id += _id_name[i]
_id = _id.lstrip('+')
_id = _id if not op.agg else _id + '_mean'
_id = _id if not op.allprob else _id + '_allprob'
_dataset_path = dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1]
metrics = [] metrics = []
for lang in lXte.keys(): for lang in lXte.keys():
macrof1, microf1, macrok, microk = l_eval[lang] macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk]) metrics.append([macrof1, microf1, macrok, microk])
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
# results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'], results.add_row(method='MultiModal',
# (config['max_label_space'], classifier.best_components), learner='svm',
# config['dim_reduction_unsupervised'], op.optimc, dataset.split('/')[-1], classifier.time, optimp=op.optimc,
# lang, macrof1, microf1, macrok, microk, '') sif= op.sif,
zscore=op.zscore,
l2= op.l2,
wescaler= op.feat_weight,
pca=op.max_labels_S,
id=_id,
dataset=dataset_id,
time='todo',
lang=lang,
macrof1=macrof1,
microf1=microf1,
macrok=macrok,
microk=microk,
notes='')
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0)) print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))

View File

@ -27,7 +27,7 @@ class RNNMultilingualClassifier(nn.Module):
self.n_layers = 1 self.n_layers = 1
self.n_directions = 1 self.n_directions = 1
self.dropout = nn.Dropout(0.2) self.dropout = nn.Dropout(0.6)
lstm_out = 256 lstm_out = 256
ff1 = 512 ff1 = 512
@ -45,7 +45,7 @@ class RNNMultilingualClassifier(nn.Module):
llearnable_embeddings[l] = learnable_embeddings llearnable_embeddings[l] = learnable_embeddings
self.embedding_length = embedding_length self.embedding_length = embedding_length
# self.rnn = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2)) # self.lstm = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2))
self.rnn = nn.GRU(self.embedding_length, hidden_size) self.rnn = nn.GRU(self.embedding_length, hidden_size)
self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out) self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out)
self.lpretrained_embeddings.update(lpretrained_embeddings) self.lpretrained_embeddings.update(lpretrained_embeddings)

355
src/new_mbert.py Normal file
View File

@ -0,0 +1,355 @@
"""
Test with a smaller subset of languages.
1. Load doc (RCV1/2)
2. Tokenize texts via BertTokenizer (I should already have these dumps)
3. Construct better Dataloader/Datasets. NB: I need to keep track of the languages only for
the testing phase (but who cares actually? If I have to do it for the testing phase, I think
it is better to deploy it also in the training phase...)
4. ...
5. I have to understand whether the pooled hidden state of the last layer is much worse than its averaged
version (however, in BertForSequenceClassification I guess the pooled version is passed through the output
linear layer in order to get the prediction scores?). See the mean-pooling sketch after the imports below.
6. At the same time, I also have to build an end-to-end model in order to fine-tune it. The previous step
would be useful when deploying mBert as a View Generator. (Refactor gFun code with view generators?)
7. ...
8. Profits
"""
from dataset_builder import MultilingualDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch
from util.common import clip_gradient, predict
from time import time
from util.csv_log import CSVLog
from util.evaluation import evaluate
from util.early_stop import EarlyStopping
from torch.optim.lr_scheduler import StepLR
from sklearn.model_selection import train_test_split
import argparse
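# Hedged illustration for item 5 of the notes above (not used below): a masked average of the last hidden
# states, to be contrasted with the pooled [CLS] representation that BertForSequenceClassification feeds to
# its classification head. Tensor shapes are assumptions: (batch, seq_len, hidden) and (batch, seq_len).
def _mean_pool_last_hidden(last_hidden_state, attention_mask):
    mask = attention_mask.unsqueeze(-1).float()       # (batch, seq_len, 1)
    summed = (last_hidden_state * mask).sum(dim=1)    # sum over non-padding positions
    counts = mask.sum(dim=1).clamp(min=1e-9)          # number of real tokens per document
    return summed / counts                            # (batch, hidden)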
def get_model(n_out):
print('# Initializing model ...')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=n_out)
return model
def set_method_name():
return 'mBERT'
def init_optimizer(model, lr):
# return AdamW(model.parameters(), lr=lr, weight_decay=opt.weight_decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in model.named_parameters()
if not any(nd in n for nd in no_decay)],
'weight_decay': opt.weight_decay},
{'params': [p for n, p in model.named_parameters()
if any(nd in n for nd in no_decay)],
'weight_decay': 0.0}  # no weight decay for bias and LayerNorm parameters
]
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
return optimizer
def init_logfile(method_name, opt):
logfile = CSVLog(opt.log_file, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse'])
logfile.set_default('dataset', opt.dataset)
logfile.set_default('run', opt.seed)
logfile.set_default('method', method_name)
assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} and run {opt.seed} already calculated'
return logfile
def get_lr(optimizer):
for param_group in optimizer.param_groups:
return param_group['lr']
def get_dataset_name(datapath):
possible_splits = [str(i) for i in range(10)]
splitted = datapath.split('_')
id_split = splitted[-1].split('.')[0][-1]
if id_split in possible_splits:
dataset_name = splitted[0].split('/')[-1]
return f'{dataset_name}_run{id_split}'
def load_datasets(datapath):
data = MultilingualDataset.load(datapath)
data.set_view(languages=['nl'])  # Testing with just one language
data.show_dimensions()
l_devel_raw, l_devel_target = data.training(target_as_csr=False)
l_test_raw, l_test_target = data.test(target_as_csr=False)
return l_devel_raw, l_devel_target, l_test_raw, l_test_target
def do_tokenization(l_dataset, max_len=512):
print('# Starting Tokenization ...')
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
langs = l_dataset.keys()
l_tokenized = {}
for lang in langs:
l_tokenized[lang] = tokenizer(l_dataset[lang],
truncation=True,
max_length=max_len,
add_special_tokens=True,
padding='max_length')
return l_tokenized
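# Note: the tokenizer returns a dict-like object per language with keys such as 'input_ids' and
# 'attention_mask'; only 'input_ids' is consumed by TrainingDataset below.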
class TrainingDataset(Dataset):
"""
data: dict of lang-specific tokenized data
labels: dict of lang-specific targets
(a usage sketch follows the class)
"""
def __init__(self, data, labels):
self.langs = data.keys()
self.lang_ids = {lang:identifier for identifier, lang in enumerate(self.langs)}
for i, lang in enumerate(self.langs):
# print(lang)
_data = data[lang]['input_ids']
_data = np.array(_data)
_labels = labels[lang]
_lang_value = np.full(len(_data), self.lang_ids[lang])
if i == 0:
self.data = _data
self.labels = _labels
self.lang_index = _lang_value
else:
self.data = np.vstack((self.data, _data))
self.labels = np.vstack((self.labels, _labels))
self.lang_index = np.concatenate((self.lang_index, _lang_value))
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
x = self.data[idx]
y = self.labels[idx]
lang = self.lang_index[idx]
return x, torch.tensor(y, dtype=torch.float), lang
# return x, y, lang
def get_lang_ids(self):
return self.lang_ids
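# Usage sketch (illustrative inputs, mirroring main() below):
#   data   = {'en': {'input_ids': [[101, ...], ...]}, 'it': {'input_ids': [[101, ...], ...]}}
#   labels = {'en': np.ndarray of shape (n_docs, n_classes), 'it': ...}
#   loader = DataLoader(TrainingDataset(data, labels), batch_size=4, shuffle=True)
#   for input_ids, target, lang_idx in loader:
#       ...   # lang_idx maps back to language names via get_lang_ids()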
def freeze_encoder(model):
for param in model.base_model.parameters():
param.requires_grad = False
return model
def check_param_grad_status(model):
print('#'*50)
print('Model parameter status')
for name, child in model.named_children():
trainable = False
for param in child.parameters():
if param.requires_grad:
trainable = True
if not trainable:
print(f'{name} is frozen')
else:
print(f'{name} is not frozen')
print('#'*50)
def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile):
_dataset_path = opt.dataset.split('/')[-1].split('_')
# dataset_id = 'RCV1/2_run0_newBert'
dataset_id = _dataset_path[0] + _dataset_path[-1]
loss_history = []
model.train()
for idx, (batch, target, lang_idx) in enumerate(train_dataloader):
optim.zero_grad()  # reset gradients at each step
out = model(batch.cuda())
loss = criterion(out[0], target.cuda())
loss.backward()
clip_gradient(model)
optim.step()
loss_history.append(loss.item())
if idx % opt.log_interval == 0:
interval_loss = np.mean(loss_history[-opt.log_interval:])
print(
f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
mean_loss = np.mean(interval_loss)
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
return mean_loss
def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, measure_prefix):
print('# Validating model ...')
loss_history = []
model.eval()
langs = lang_ids.keys()
id_2_lang = {v:k for k,v in lang_ids.items()}
predictions = {l: [] for l in langs}
yte_stacked = {l: [] for l in langs}
for batch, target, lang_idx in test_dataloader:
out = model(batch.cuda())
logits = out[0]
loss = criterion(logits, target.cuda()).item()
prediction = predict(logits)
loss_history.append(loss)
# Assigning each prediction to the language-specific dicts (predictions and yte_stacked) according to lang_idx
for i, pred in enumerate(prediction):
lang_pred = id_2_lang[lang_idx.numpy()[i]]
predictions[lang_pred].append(pred)
yte_stacked[lang_pred].append(target[i].detach().cpu().numpy())
ly = {l: np.vstack(yte_stacked[l]) for l in langs}
ly_ = {l: np.vstack(predictions[l]) for l in langs}
l_eval = evaluate(ly, ly_)
metrics = []
for lang in langs:
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
if measure_prefix == 'te':
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
mean_loss = np.mean(loss_history)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit)
return Mf1
def get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop, max_val, seed):
# use independent per-language dicts for the two splits, so that the tuple assignment below
# does not overwrite the training input_ids with the validation ones
l_split_tr = {l: dict(l_tokenized_tr[l]) for l in l_tokenized_tr.keys()}
l_split_tr_target = {l: [] for l in l_tokenized_tr.keys()}
l_split_va = {l: dict(l_tokenized_tr[l]) for l in l_tokenized_tr.keys()}
l_split_val_target = {l: [] for l in l_tokenized_tr.keys()}
for lang in l_tokenized_tr.keys():
val_size = int(min(len(l_tokenized_tr[lang]['input_ids']) * val_prop, max_val))
l_split_tr[lang]['input_ids'], l_split_va[lang]['input_ids'], l_split_tr_target[lang], l_split_val_target[lang] = \
train_test_split(l_tokenized_tr[lang]['input_ids'], l_devel_target[lang], test_size=val_size, random_state=seed, shuffle=True)
return l_split_tr, l_split_tr_target, l_split_va, l_split_val_target

def main():
    print('Running main ...')

    DATAPATH = opt.dataset
    method_name = set_method_name()
    logfile = init_logfile(method_name, opt)

    l_devel_raw, l_devel_target, l_test_raw, l_test_target = load_datasets(DATAPATH)
    l_tokenized_tr = do_tokenization(l_devel_raw, max_len=512)
    l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = \
        get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop=0.2, max_val=2000, seed=opt.seed)
    l_tokenized_te = do_tokenization(l_test_raw, max_len=512)

    tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target)
    va_dataset = TrainingDataset(l_split_va, l_split_val_target)
    te_dataset = TrainingDataset(l_tokenized_te, l_test_target)

    tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True)
    va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=False)
    te_dataloader = DataLoader(te_dataset, batch_size=2, shuffle=False)

    # Initializing model (73 is presumably the number of target classes of this dataset)
    model = get_model(73)
    model = model.cuda()
    criterion = torch.nn.BCEWithLogitsLoss().cuda()
    optim = init_optimizer(model, lr=opt.lr)
    # lr_scheduler = StepLR(optim, step_size=25, gamma=0.5)
    early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience,
                               checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_dataset_name(opt.dataset)}')
    # lr_scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optim, num_warmup_steps= , num_training_steps=)
    # print(model)
    # Freezing encoder
    # model = freeze_encoder(model)
    check_param_grad_status(model)

    # Training loop
    tinit = time()
    lang_ids = va_dataset.lang_ids
    for epoch in range(1, opt.nepochs + 1):
        print('# Start Training ...')
        train(model, tr_dataloader, epoch, criterion, optim, 'TestingBert', tinit, logfile)
        # lr_scheduler.step(epoch=None)  # reduces the learning rate

        # validation
        macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va')
        early_stop(macrof1, epoch)

        if opt.test_each > 0:
            if (opt.plotmode and (epoch == 1 or epoch % opt.test_each == 0)) or \
                    (not opt.plotmode and epoch % opt.test_each == 0 and epoch < opt.nepochs):
                test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te')

        if early_stop.STOP:
            print('[early-stop] STOP')
            if not opt.plotmode:
                break

    if not opt.plotmode:
        print('-' * 80)
        print('Training over. Performing final evaluation')
        model = early_stop.restore_checkpoint()

        if opt.val_epochs > 0:
            print(f'running last {opt.val_epochs} training epochs on the validation set')
            for val_epoch in range(1, opt.val_epochs + 1):
                train(model, va_dataloader, epoch + val_epoch, criterion, optim, 'TestingBert', tinit, logfile)

        # final test
        print('Training complete: testing')
        test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te')

    exit('Code Executed!')

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Neural text classification with Word-Class Embeddings - mBert model')
    parser.add_argument('--dataset', type=str,
                        default='/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle',
                        metavar='datasetpath', help='path to the pickled dataset')
    parser.add_argument('--nepochs', type=int, default=200, metavar='int',
                        help='number of epochs (default: 200)')
    parser.add_argument('--lr', type=float, default=2e-5, metavar='float',
                        help='learning rate (default: 2e-5)')
    parser.add_argument('--weight_decay', type=float, default=0, metavar='float',
                        help='weight decay (default: 0)')
    parser.add_argument('--patience', type=int, default=10, metavar='int',
                        help='patience for early-stop (default: 10)')
    parser.add_argument('--log-interval', type=int, default=20, metavar='int',
                        help='how many batches to wait before printing training status')
    parser.add_argument('--log-file', type=str, default='../log/log_mBert.csv', metavar='str',
                        help='path to the log csv file')
    parser.add_argument('--seed', type=int, default=1, metavar='int',
                        help='random seed (default: 1)')
    parser.add_argument('--force', action='store_true', default=False,
                        help='do not check if this experiment has already been run')
    parser.add_argument('--checkpoint-dir', type=str, default='../checkpoint', metavar='str',
                        help='path to the directory containing checkpoints')
    parser.add_argument('--plotmode', action='store_true', default=False,
                        help='in plot mode, executes a long run in order to generate enough data to produce '
                             'trend plots (test-each should be > 0). This mode is used to produce plots and '
                             'does not perform an evaluation on the test set.')
    parser.add_argument('--test-each', type=int, default=0, metavar='int',
                        help='how many epochs to wait before invoking test (default: 0, only at the end)')
    parser.add_argument('--val-epochs', type=int, default=1, metavar='int',
                        help='number of training epochs to perform on the validation set once training is over (default 1)')
    opt = parser.parse_args()

    # Testing different parameters: these overrides take precedence over the CLI values above
    opt.weight_decay = 0.01
    opt.patience = 5

    main()

    # TODO: refactor .cuda() -> .to(device) in order to check if the process is faster on CPU given the bigger batch size
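
A minimal sketch of the device refactor hinted at by the TODO above (hypothetical; the actual script would need every .cuda() call in train(), test() and main() replaced in the same way):

    import torch

    # pick the GPU when available, otherwise fall back to the CPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = get_model(73).to(device)
    criterion = torch.nn.BCEWithLogitsLoss().to(device)

    # inside the training/validation loops, batches and targets move the same way
    logits = model(batch.to(device))[0]
    loss = criterion(logits, target.to(device))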

View File

@ -1,7 +1,11 @@
 import pandas as pd
 import numpy as np
-df = pd.read_csv("/home/andreapdr/funneling_pdr/src/results/results.csv", delimiter='\t')
-pivot = pd.pivot_table(df, values=['time', 'macrof1', 'microf1', 'macrok', 'microk'], index=['method', 'embed'], aggfunc=[np.mean, np.std])
-print(pivot)
+# df = pd.read_csv("/home/andreapdr/funneling_pdr/src/results/final_results.csv", delimiter='\t')
+df = pd.read_csv("10run_rcv_final_results.csv", delimiter='\t')
+pivot = pd.pivot_table(df, values=['macrof1', 'microf1', 'macrok', 'microk'], index=['method', 'id', 'optimp', 'zscore', 'l2', 'wescaler', 'pca', 'sif'], aggfunc=[np.mean, np.std])
+with pd.option_context('display.max_rows', None):
+    print(pivot.round(3))
 print('Finished ...')

src/run_mbert_rcv.sh Normal file
View File

@ -0,0 +1,11 @@
#!/usr/bin/env bash
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
logfile=../log/log_Mbert_rcv.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
  dataset=$dataset_path$run.pickle
  python new_mbert.py --dataset $dataset --log-file $logfile --test-each 20
done

View File

@ -17,7 +17,7 @@ def get_weighted_average(We, x, w):
 def compute_pc(X,npc=1):
     """
-    Compute the principal components. DO NOT MAKE THE DATA ZERO MEAN!
+    Compute the principal components.
     :param X: X[i,:] is a data point
     :param npc: number of principal components to remove
     :return: component_[i,:] is the i-th pc

View File

@ -1,4 +1,5 @@
 import warnings
+import time
 from sklearn.svm import SVC
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.model_selection import train_test_split
@ -143,6 +144,15 @@ class Index:
             embedding_parts.append(F)

+        make_dumps = False
+        if make_dumps:
+            print(f'Dumping Embedding Matrices ...')
+            import pickle
+            with open(f'../dumps/dump_{self.lang}_rcv.pkl', 'wb') as outfile:
+                pickle.dump((self.lang, embedding_parts, self.word2index), outfile)
+            with open(f'../dumps/corpus_{self.lang}_rcv.pkl', 'wb') as outfile2:
+                pickle.dump((self.lang, self.devel_raw, self.devel_target), outfile2)
+
         self.embedding_matrix = torch.cat(embedding_parts, dim=1)
         print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]')
@ -155,6 +165,7 @@ class MultilingualIndex:
     def __init__(self):  #, add_language_trace=False):
         self.l_index = {}
         self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
+        # self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True, max_features=25000)
         # self.add_language_trace=add_language_trace

     def index(self, l_devel_raw, l_devel_target, l_test_raw, l_pretrained_vocabulary):
@ -189,30 +200,42 @@ class MultilingualIndex:
         # pretrained_embeddings = torch.cat([pretrained_embeddings, lang_trace], dim=1)

-    def posterior_probabilities(self, max_training_docs_by_lang=5000):
+    def posterior_probabilities(self, max_training_docs_by_lang=5000, store_posteriors=False, stored_post=False):
         # choose a maximum of "max_training_docs_by_lang" for training the calibrated SVMs
+        timeit = time.time()
         lXtr = {l:Xtr for l,Xtr in self.get_lXtr().items()}
         lYtr = {l:Ytr for l,Ytr in self.l_train_target().items()}
-        for l in self.langs:
-            n_elements = lXtr[l].shape[0]
-            if n_elements > max_training_docs_by_lang:
-                choice = np.random.permutation(n_elements)[:max_training_docs_by_lang]
-                lXtr[l] = lXtr[l][choice]
-                lYtr[l] = lYtr[l][choice]
+        if not stored_post:
+            for l in self.langs:
+                n_elements = lXtr[l].shape[0]
+                if n_elements > max_training_docs_by_lang:
+                    choice = np.random.permutation(n_elements)[:max_training_docs_by_lang]
+                    lXtr[l] = lXtr[l][choice]
+                    lYtr[l] = lYtr[l][choice]

         # train the posterior probabilities embedder
         print('[posteriors] training a calibrated SVM')
         learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto')
         prob_embedder = PosteriorProbabilitiesEmbedder(learner, l2=False)
         prob_embedder.fit(lXtr, lYtr)

         # transforms the training, validation, and test sets into posterior probabilities
         print('[posteriors] generating posterior probabilities')
         lPtr = prob_embedder.transform(self.get_lXtr())
         lPva = prob_embedder.transform(self.get_lXva())
         lPte = prob_embedder.transform(self.get_lXte())
+        # NB: Check splits indices !

-        print('[posteriors] done')
+        if store_posteriors:
+            import pickle
+            with open('../dumps/posteriors_fulljrc.pkl', 'wb') as outfile:
+                pickle.dump([lPtr, lPva, lPte], outfile)
+                print(f'Successfully dumped posteriors!')
+        else:
+            import pickle
+            with open('../dumps/posteriors_fulljrc.pkl', 'rb') as infile:
+                lPtr, lPva, lPte = pickle.load(infile)
+                print(f'Successfully loaded stored posteriors!')
+        print(f'[posteriors] done in {time.time() - timeit}')
         return lPtr, lPva, lPte

     def get_lXtr(self):
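
For reference, a minimal usage sketch of the new caching flags (hypothetical call site; mindex stands for an already fitted MultilingualIndex instance):

    # first run: compute the posteriors and dump them to ../dumps/posteriors_fulljrc.pkl
    lPtr, lPva, lPte = mindex.posterior_probabilities(store_posteriors=True)

    # later runs: reload the stored posteriors instead of recomputing them
    lPtr, lPva, lPte = mindex.posterior_probabilities(stored_post=True)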

View File

@ -6,7 +6,7 @@ from util.file import create_if_not_exist
 class EarlyStopping:
-    def __init__(self, model, patience=20, verbose=True, checkpoint='./checkpoint.pt'):
+    def __init__(self, model, optimizer, patience=20, verbose=True, checkpoint='./checkpoint.pt'):
         # set patience to 0 or -1 to avoid stopping, but still keeping track of the best value and model parameters
         self.patience_limit = patience
         self.patience = patience
@ -16,9 +16,10 @@ class EarlyStopping:
         self.stop_time = None
         self.checkpoint = checkpoint
         self.model = model
+        self.optimizer = optimizer
         self.STOP = False

-    def __call__(self, watch_score, epoch):
+    def __call__(self, watch_score, epoch):  #model
         if self.STOP: return  #done
@ -29,6 +30,9 @@ class EarlyStopping:
             if self.checkpoint:
                 self.print(f'[early-stop] improved, saving model in {self.checkpoint}')
                 torch.save(self.model, self.checkpoint)
+                # with open(self.checkpoint)
+                # torch.save({'state_dict': self.model.state_dict(),
+                #             'optimizer_state_dict': self.optimizer.state_dict()}, self.checkpoint)
             else:
                 self.print(f'[early-stop] improved')
             self.patience = self.patience_limit
@ -46,6 +50,7 @@ class EarlyStopping:
             self.patience=self.patience_limit

     def restore_checkpoint(self):
+        print(f'restoring best model from epoch {self.best_epoch}...')
         return torch.load(self.checkpoint)

     def print(self, msg):
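
The commented-out lines above hint at a state_dict-based checkpoint. A minimal sketch of that variant, assuming the model and optimizer objects are re-created before restoring (the class currently saves and reloads the whole model object instead):

    # save both model and optimizer state so training can be resumed
    torch.save({'state_dict': self.model.state_dict(),
                'optimizer_state_dict': self.optimizer.state_dict()}, self.checkpoint)

    # restore: load the states back into freshly built objects
    ckpt = torch.load(self.checkpoint)
    self.model.load_state_dict(ckpt['state_dict'])
    self.optimizer.load_state_dict(ckpt['optimizer_state_dict'])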

View File

@ -5,8 +5,23 @@ import numpy as np
 class PolylingualClassificationResults:
     def __init__(self, file, autoflush=True, verbose=False):
         self.file = file
-        self.columns = ['method', 'learner', 'embed', 'embed_type', 'pca_s', 'pca_u', 'optimp', 'dataset', 'time',
-                        'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
+        self.columns = ['method',
+                        'learner',
+                        'optimp',
+                        'sif',
+                        'zscore',
+                        'l2',
+                        'wescaler',
+                        'pca',
+                        'id',
+                        'dataset',
+                        'time',
+                        'lang',
+                        'macrof1',
+                        'microf1',
+                        'macrok',
+                        'microk',
+                        'notes']
         self.autoflush = autoflush
         self.verbose = verbose
         if os.path.exists(file):
@ -21,8 +36,8 @@ class PolylingualClassificationResults:
     def already_calculated(self, id):
         return (self.df['id'] == id).any()

-    def add_row(self, method, learner, embed, embed_type, pca_s, pca_u, optimp, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
-        s = pd.Series([method, learner, embed, embed_type, pca_s, pca_u, optimp, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
+    def add_row(self, method, learner, optimp, sif, zscore, l2, wescaler, pca, id, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
+        s = pd.Series([method, learner, optimp, sif, zscore, l2, wescaler, pca, id, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
         self.df = self.df.append(s, ignore_index=True)
         if self.autoflush: self.flush()
         self.tell(s.to_string())