baseline multilingual BERT

This commit is contained in:
andrea 2020-07-27 11:56:09 +02:00
parent 22b7ea7e66
commit d1fdad5f6e
37 changed files with 1212 additions and 1112 deletions

View File

@@ -1,10 +1,7 @@
import os
import pickle
from torchtext.vocab import Vectors
import torch
from abc import ABC, abstractmethod
from embeddings.supervised import get_supervised_embeddings
from util.decompositions import *
from util.SIF_embed import *
@@ -35,122 +32,10 @@ class PretrainedEmbeddings(ABC):
return source_idx, target_idx
class WordEmbeddings:
def __init__(self, lang, we, worddim):
self.lang = lang
self.we = we
self.worddim = worddim
self.dimword = {v:k for k,v in self.worddim.items()}
@classmethod
def load(cls, basedir, lang, word_preprocessor=None, dopickle=True):
filename = 'wiki.multi.{}.vec'.format(lang)
we_path = os.path.join(basedir, filename)
if dopickle and os.path.exists(we_path + '.pkl'):
print('loading pkl in {}'.format(we_path + '.pkl'))
(worddim, we) = pickle.load(open(we_path + '.pkl', 'rb'))
else:
word_registry = set()
lines = open(we_path).readlines()
nwords, dims = [int(x) for x in lines[0].split()]
print('reading we of {} dimensions'.format(dims))
we = np.zeros((nwords, dims), dtype=float)
worddim = {}
index = 0
for i, line in enumerate(lines[1:]):
if (i + 1) % 100 == 0:
print('\r{}/{}'.format(i + 1, len(lines)), end='')
word, *vals = line.split()
wordp = word_preprocessor(word) if word_preprocessor is not None else word
if wordp:
wordp = wordp[0]
if wordp in word_registry:
print('warning: word <{}> generates a duplicate <{}> after preprocessing'.format(word,wordp))
elif len(vals) == dims:
worddim[wordp] = index
we[index, :] = np.array(vals).astype(float)
index += 1
word_registry.add(wordp)  # register the word so the duplicate check above can detect repeats
# else:
# print('warning: word <{}> generates an empty string after preprocessing'.format(word))
we = we[:index]
print('load {} words'.format(index))
if dopickle:
print('saving...')
pickle.dump((worddim, we), open(we_path + '.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
return WordEmbeddings(lang, we, worddim)
def vocabulary(self):
return set(self.worddim.keys())
def __getitem__(self, key):
return self.we[self.worddim[key]]
def dim(self):
return self.we.shape[1]
def __contains__(self, key):
return key in self.worddim
def most_similar(self, word_vect, k):
if word_vect.ndim == 1:
word_vect = word_vect.reshape(1,-1)
assert word_vect.shape[1] == self.dim(), 'inconsistent dimensions'
sim = np.dot(word_vect,self.we.T)
order = np.argsort(-1*sim, axis=1)[:,:k]
similar_words = [[self.dimword[order[vi,ki]] for ki in range(k)] for vi in range(word_vect.shape[0])]
sim_scores = np.take_along_axis(sim, order, axis=1)  # top-k scores aligned with similar_words
return similar_words, sim_scores
def get_vectors(self, wordlist):
indexes = np.array([self.worddim[w] for w in wordlist])
return self.we[indexes]
def restrict(self, vocabulary):
# vocabulary is a set of terms to be kept
active_vocabulary = sorted([w for w in vocabulary if w in self.worddim])
lost = len(vocabulary)-len(active_vocabulary)
if lost > 0: # some terms are missing, so they will be replaced by UNK
print('warning: missing {} terms for lang {}'.format(lost, self.lang))
self.we = self.get_vectors(active_vocabulary)
assert self.we.shape[0] == len(active_vocabulary)
self.dimword={i:w for i,w in enumerate(active_vocabulary)}
self.worddim={w:i for i,w in enumerate(active_vocabulary)}
return self
@classmethod
def load_poly(cls, basedir, langs, lang_vocabularies=None, word_preprocessor=None):
if lang_vocabularies is None:
return cls.merge([cls.load(basedir,lang, word_preprocessor) for lang in langs])
else:
# assert all([l in lang_vocabularies for l in langs]), 'missing vocabulary for some languages'
return cls.merge([cls.load(basedir, lang, word_preprocessor).restrict(lang_vocabularies[lang]) for lang in langs])
@classmethod
def merge(cls, we_list):
assert all([isinstance(we, WordEmbeddings) for we in we_list]), \
'instances of {} expected'.format(WordEmbeddings.__name__)
polywe = []
worddim = {}
offset = 0
for we in we_list:
polywe.append(we.we)
worddim.update({'{}::{}'.format(we.lang, w):d+offset for w,d in we.worddim.items()})
offset = len(worddim)
polywe = np.vstack(polywe)
return WordEmbeddings(lang='poly', we=polywe, worddim=worddim)
class FastTextWikiNews(Vectors):
url_base = "Can't auto-download MUSE embeddings"
# path = '/storage/andrea/FUNNELING/embeddings/wiki.multi.{}.vec'  # absolute path, overridden by the relative one below
path = '../embeddings/wiki.multi.{}.vec'
_name = '/wiki.multi.{}.vec'
def __init__(self, cache, language="en", **kwargs):
@@ -159,42 +44,13 @@ class FastTextWikiNews(Vectors):
super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)
class EmbeddingsAligned(Vectors):
def __init__(self, type, path, lang, voc):
# todo - rewrite as relative path
self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec'
self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT'
self.path = path + self.name.format(lang)
assert os.path.exists(path), f'pre-trained vectors not found in {path}'
super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path)
self.vectors = self.extract(voc)
def vocabulary(self):
return set(self.stoi.keys())
def extract(self, words):
source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi)
extraction = torch.zeros((len(words), self.dim))
extraction[source_idx] = self.vectors[target_idx]
return extraction
def reduce(self, dim):
pca = PCA(n_components=dim)
self.vectors = pca.fit_transform(self.vectors)
return
class FastTextMUSE(PretrainedEmbeddings):
def __init__(self, path, lang, limit=None):
super().__init__()
print(f'Loading fastText pretrained vectors for language {lang} from {path}')
assert os.path.exists(path), f'pre-trained vectors not found in {path}'
self.embed = FastTextWikiNews(path, lang, max_vectors=limit)
def vocabulary(self):
return set(self.embed.stoi.keys())
@@ -204,114 +60,8 @@ class FastTextMUSE(PretrainedEmbeddings):
def extract(self, words):
source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
extraction = torch.zeros((len(words), self.dim()))
# extraction = torch.empty(len(words), self.dim()).normal_(0, 1)
extraction[source_idx] = self.embed.vectors[target_idx]
return extraction
class StorageEmbeddings:
def __init__(self, path):
self.path = path
self.lang_U = dict()
self.lang_S = dict()
def _add_embeddings_unsupervised(self, type, docs, vocs, max_label_space=300):
for lang in docs.keys():
print(f'# [unsupervised-matrix {type}] for {lang}')
voc = np.asarray(list(zip(*sorted(vocs[lang].items(), key=lambda x: x[1])))[0])
self.lang_U[lang] = EmbeddingsAligned(type, self.path, lang, voc).vectors
print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n')
nC = self.lang_U[lang].shape[1]
if max_label_space == 0:
print(f'Computing optimal number of PCA components along matrices U')
optimal_n = get_optimal_dim(self.lang_U, 'U')
self.lang_U = run_pca(optimal_n, self.lang_U)
elif max_label_space < nC:
print(f'Applying PCA to unsupervised matrix U')
self.lang_U = run_pca(max_label_space, self.lang_U)
return
def _add_embeddings_supervised(self, docs, labels, reduction, max_label_space, voc):
only_well_represented_C = False # TODO testing
if only_well_represented_C:
labels = labels.copy()
min_prevalence = 0
print(f'# REDUCING LABELS TO min_prevalence = {min_prevalence} in order to compute WCE Matrix ...')
langs = list(docs.keys())
well_repr_cats = np.logical_and.reduce([labels[lang].sum(axis=0)>min_prevalence for lang in langs])
for lang in langs:
labels[lang] = labels[lang][:, well_repr_cats]
print(f'Target number reduced to: {labels[lang].shape[1]}\n')
for lang in docs.keys(): # compute supervised matrices S - then apply PCA
print(f'# [supervised-matrix] for {lang}')
self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang],
reduction, max_label_space, voc[lang], lang)
nC = self.lang_S[lang].shape[1]
print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')
if max_label_space == 0: # looking for best n_components analyzing explained_variance_ratio
print(f'Computing optimal number of PCA components along matrices S')
optimal_n = get_optimal_dim(self.lang_S, 'S')
print(f'Applying PCA(n_components={optimal_n})')
self.lang_S = run_pca(optimal_n, self.lang_S)
elif max_label_space == -1: # applying pca to the verticals stacked matrix of WCE embeddings
print(f'Computing PCA on vertical stacked WCE embeddings')
languages = self.lang_S.keys()
_temp_stack = np.vstack([self.lang_S[lang] for lang in languages]) # stacking WCE vertically
stacked_pca = PCA(n_components=_temp_stack.shape[1])
stacked_pca.fit(_temp_stack)
best_n = None
_r = stacked_pca.explained_variance_ratio_
_r = np.cumsum(_r)
plt.plot(_r, label='Stacked Supervised')
for i in range(len(_r) - 1, 1, -1):
delta = _r[i] - _r[i - 1]
if delta > 0:
best_n = i
break
plt.show()
stacked_pca = PCA(n_components=best_n)
stacked_pca.fit(_temp_stack)
print(f'Applying PCA(n_components={best_n})')
for lang in languages:
self.lang_S[lang] = stacked_pca.transform(self.lang_S[lang])
elif max_label_space <= nC: # less or equal in order to reduce it to the same initial dimension
print(f'Computing PCA on Supervised Matrix PCA(n_components:{max_label_space})')
self.lang_S = run_pca(max_label_space, self.lang_S)
return
def SIF_embeddings(self):
print('todo') # TODO
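# SIF (smooth inverse frequency; Arora et al., 2017) is the weighting scheme that the
# SIF_embed import above presumably provides (an assumption). A minimal sketch of the
# idea, with hypothetical names, for illustration only:
#
#   a = 1e-3
#   sif_weight = {w: a / (a + p_w[w]) for w in p_w}   # p_w: unigram probability of w
#   doc_vec = np.mean([sif_weight[w] * we[w] for w in doc_tokens if w in we], axis=0)
#   # Arora et al. additionally remove the projection of doc_vec onto the first
#   # principal component of all document vectors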
def _concatenate_embeddings(self, docs):
_r = dict()
for lang in self.lang_U.keys():
_r[lang] = np.hstack((docs[lang].dot(self.lang_U[lang]), docs[lang].dot(self.lang_S[lang])))
return _r
def fit(self, config, docs, vocs, labels):
if config['unsupervised']:
self._add_embeddings_unsupervised(config['we_type'], docs, vocs, config['dim_reduction_unsupervised'])
if config['supervised']:
self._add_embeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs)
return self
def predict(self, config, docs):
if config['supervised'] and config['unsupervised']:
return self._concatenate_embeddings(docs)
# todo testing applying pca to hstack muse + wce
# _reduced = self._concatenate_embeddings(docs)
# return run_pca(300, _reduced)
elif config['supervised']:
_r = dict()
for lang in docs.keys():
_r[lang] = docs[lang].dot(self.lang_S[lang])
else:
_r = dict()
for lang in docs.keys():
_r[lang] = docs[lang].dot(self.lang_U[lang])
return _r
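# A hedged usage sketch of StorageEmbeddings (the values below are illustrative; the
# config keys are the ones read by fit() and predict() above, and docs/vocs/labels are
# the per-language dictionaries passed in by the callers):
#
#   config = {'we_type': 'MUSE', 'unsupervised': True, 'supervised': True,
#             'dim_reduction_unsupervised': 300, 'reduction': 'PCA', 'max_label_space': 300}
#   storage = StorageEmbeddings(we_path).fit(config, lX, lVocab, ly)
#   lZ = storage.predict(config, lX)   # {lang: hstack(X.dot(U), X.dot(S))}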

View File

@@ -1,103 +1,102 @@
from abc import ABC, abstractmethod
import torch, torchtext
import gensim
import os
# import gensim
# import os
import numpy as np
class KeyedVectors:
def __init__(self, word2index, weights):
assert len(word2index)==weights.shape[0], 'wrong number of dimensions'
index2word = {i:w for w,i in word2index.items()}
assert len([i for i in range(len(index2word)) if i not in index2word])==0, 'gaps in indexing not allowed'
self.word2index = word2index
self.index2word = index2word
self.weights = weights
def extract(self, words):
dim = self.weights.shape[1]
v_size = len(words)
source_idx, target_idx = [], []
for i,word in enumerate(words):
if word not in self.word2index: continue
j = self.word2index[word]
source_idx.append(i)
target_idx.append(j)
extraction = np.zeros((v_size, dim))
extraction[np.asarray(source_idx)] = self.weights[np.asarray(target_idx)]
return extraction
# class KeyedVectors:
#
# def __init__(self, word2index, weights):
# assert len(word2index)==weights.shape[0], 'wrong number of dimensions'
# index2word = {i:w for w,i in word2index.items()}
# assert len([i for i in range(len(index2word)) if i not in index2word])==0, 'gaps in indexing not allowed'
# self.word2index = word2index
# self.index2word = index2word
# self.weights = weights
#
# def extract(self, words):
# dim = self.weights.shape[1]
# v_size = len(words)
#
# source_idx, target_idx = [], []
# for i,word in enumerate(words):
# if word not in self.word2index: continue
# j = self.word2index[word]
# source_idx.append(i)
# target_idx.append(j)
#
# extraction = np.zeros((v_size, dim))
# extraction[np.asarray(source_idx)] = self.weights[np.asarray(target_idx)]
#
# return extraction
class PretrainedEmbeddings(ABC):
def __init__(self):
super().__init__()
@abstractmethod
def vocabulary(self): pass
@abstractmethod
def dim(self): pass
@classmethod
def reindex(cls, words, word2index):
source_idx, target_idx = [], []
for i, word in enumerate(words):
if word not in word2index: continue
j = word2index[word]
source_idx.append(i)
target_idx.append(j)
source_idx = np.asarray(source_idx)
target_idx = np.asarray(target_idx)
return source_idx, target_idx
# class PretrainedEmbeddings(ABC):
#
# def __init__(self):
# super().__init__()
#
# @abstractmethod
# def vocabulary(self): pass
#
# @abstractmethod
# def dim(self): pass
#
# @classmethod
# def reindex(cls, words, word2index):
# source_idx, target_idx = [], []
# for i, word in enumerate(words):
# if word not in word2index: continue
# j = word2index[word]
# source_idx.append(i)
# target_idx.append(j)
# source_idx = np.asarray(source_idx)
# target_idx = np.asarray(target_idx)
# return source_idx, target_idx
class GloVe(PretrainedEmbeddings):
def __init__(self, setname='840B', path='./vectors_cache', max_vectors=None):
super().__init__()
print(f'Loading GloVe pretrained vectors from torchtext')
self.embed = torchtext.vocab.GloVe(setname, cache=path, max_vectors=max_vectors)
print('Done')
def vocabulary(self):
return set(self.embed.stoi.keys())
def dim(self):
return self.embed.dim
def extract(self, words):
source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
extraction = torch.zeros((len(words), self.dim()))
extraction[source_idx] = self.embed.vectors[target_idx]
return extraction
# class GloVe(PretrainedEmbeddings):
#
# def __init__(self, setname='840B', path='./vectors_cache', max_vectors=None):
# super().__init__()
# print(f'Loading GloVe pretrained vectors from torchtext')
# self.embed = torchtext.vocab.GloVe(setname, cache=path, max_vectors=max_vectors)
# print('Done')
#
# def vocabulary(self):
# return set(self.embed.stoi.keys())
#
# def dim(self):
# return self.embed.dim
#
# def extract(self, words):
# source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
# extraction = torch.zeros((len(words), self.dim()))
# extraction[source_idx] = self.embed.vectors[target_idx]
# return extraction
class Word2Vec(PretrainedEmbeddings):
def __init__(self, path, limit=None):
super().__init__()
print(f'Loading word2vec pretrained vectors from {path}')
assert os.path.exists(path), f'pre-trained keyed vectors not found in {path}'
self.embed = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True, limit=limit)
self.word2index={w:i for i,w in enumerate(self.embed.index2word)}
print('Done')
def vocabulary(self):
return set(self.word2index.keys())
def dim(self):
return self.embed.vector_size
def extract(self, words):
source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.word2index)
extraction = np.zeros((len(words), self.dim()))
extraction[source_idx] = self.embed.vectors[target_idx]
extraction = torch.from_numpy(extraction).float()
return extraction
# class Word2Vec(PretrainedEmbeddings):
#
# def __init__(self, path, limit=None):
# super().__init__()
# print(f'Loading word2vec pretrained vectors from {path}')
# assert os.path.exists(path), print(f'pre-trained keyed vectors not found in {path}')
# self.embed = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True, limit=limit)
# self.word2index={w:i for i,w in enumerate(self.embed.index2word)}
# print('Done')
#
# def vocabulary(self):
# return set(self.word2index.keys())
#
# def dim(self):
# return self.embed.vector_size
#
# def extract(self, words):
# source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.word2index)
# extraction = np.zeros((len(words), self.dim()))
# extraction[source_idx] = self.embed.vectors[target_idx]
# extraction = torch.from_numpy(extraction).float()
# return extraction

View File

@@ -1,7 +1,5 @@
from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square
import numpy as np
# from sklearn.decomposition import PCA
# from sklearn.manifold import TSNE
def zscores(x, axis=0): #scipy.stats.zscore does not avoid division by 0, which can indeed occur
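# (The body of zscores is not shown in this hunk; the following is only a sketch of a
# division-safe z-scoring, i.e. an assumption about its intent, not the actual code:)
#
#   mean = np.mean(x, axis=axis, keepdims=True)
#   std = np.clip(np.std(x, ddof=1, axis=axis, keepdims=True), 1e-8, None)  # avoid division by 0
#   return (x - mean) / std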
@@ -69,31 +67,6 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, la
return F
# if nC >= max_label_space:
# if reduction == 'PCA':
# if max_label_space == 0:
# pca = PCA(n_components=Y.shape[1])
# pca = pca.fit(F)
# return pca.explained_variance_ratio_
#
# print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
# f'Applying PCA(n_components={max_label_space})')
# pca = PCA(n_components=max_label_space)
# F = pca.fit_transform(F)
# elif reduction == 'TSNE':
# print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
# f'Applying t-SNE(n_components={max_label_space})')
# tsne = TSNE(n_components=max_label_space)
# F = tsne.fit_transform(F)
# elif reduction == 'tSVD':
# print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
# f'Applying truncatedSVD(n_components={max_label_space})')
# tSVD = TruncatedSVD(n_components=max_label_space)
# F = tSVD.fit_transform(F)
#
# return F

View File

@@ -0,0 +1,11 @@
#!/usr/bin/env bash
dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
logfile=../log/log10run_dl_jrc.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
done

View File

@@ -0,0 +1,11 @@
#!/usr/bin/env bash
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
logfile=../log/log10run_dl_rcv.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
done

View File

@@ -0,0 +1,12 @@
dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
logfile=./results/10run_jrc_final_results.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
python main_multimodal_cls.py $dataset -o $logfile -P -z -c --l2
python main_multimodal_cls.py $dataset -o $logfile -S -z -c --l2
python main_multimodal_cls.py $dataset -o $logfile -U -z -c --l2
done

View File

@@ -0,0 +1,16 @@
dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
logfile=./results/funnelling_10run_jrc_CIKM.csv
runs='6 7 8 9' #0 1 2 3 4 5
for run in $runs
do
dataset=$dataset_path$run.pickle
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 Pr(views) concatenated (done up to run5)
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 views concatenated
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -a -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -a -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -S -c -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -c -P -U -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -c -P -U -S -r -z --l2
done

View File

@@ -0,0 +1,15 @@
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
logfile=./results/10run_rcv_final_results.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
python main_multimodal_cls.py $dataset -o $logfile -P -z -c --l2
python main_multimodal_cls.py $dataset -o $logfile -S -z -c --l2
python main_multimodal_cls.py $dataset -o $logfile -U -z -c --l2
done

View File

@@ -0,0 +1,16 @@
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
logfile=./results/funnelling_10run_rcv_CIKM_allprob_concatenated.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 Pr(views) concatenated
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 views concatenated
#python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -a -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -a -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -S -c -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -c -P -U -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -c -P -U -S -r -z --l2
done

View File

@@ -0,0 +1,34 @@
#!/usr/bin/env bash
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
logfile=./results/final_combinations_jrc.csv
#A.2: ensembling feature sets (combinations of posteriors, wce, muse):
# - exploring different ways of putting different feature sets together: concatenation, FeatureSetToPosteriors, averaging, voting, etc...
# (none seems to improve over standard funnelling [the improved version after A.1] with posterior probabilities...)
# aggregation=concatenation
#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2
#
##FeatureSetToPosteriors (aggregation mean)
python main_multimodal_cls.py $dataset -o $logfile -P -U -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -P -S -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -U -S -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -a -z --l2 --allprob
##FeatureSetToPosteriors
#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2 --allprob
#MajorityVoting
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -r
#python main_majorityvoting_cls.py $dataset -o $logfile -P -S -r
#python main_majorityvoting_cls.py $dataset -o $logfile -U -S -r
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -S -r

View File

@@ -0,0 +1,31 @@
#!/usr/bin/env bash
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
logfile=./results/final_combinations_rcv.csv
#A.2: ensembling feature sets (combinations of posteriors, wce, muse):
# - exploring different ways of putting different feature sets together: concatenation, FeatureSetToPosteriors, averaging, voting, etc...
# (none seems to improve over standard funnelling [the improved version after A.1] with posterior probabilities...)
# aggregation=concatenation
#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2
#
##FeatureSetToPosteriors (aggregation mean)
python main_multimodal_cls.py $dataset -o $logfile -P -U -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -P -S -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -U -S -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -a -z --l2 --allprob
##FeatureSetToPosteriors
#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2 --allprob
#MajorityVoting
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -r
#python main_majorityvoting_cls.py $dataset -o $logfile -P -S -r
#python main_majorityvoting_cls.py $dataset -o $logfile -U -S -r
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -S -r

View File

@@ -0,0 +1,31 @@
#!/usr/bin/env bash
logfile=../log/log_pre_jrc.csv
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
python main_deep_learning.py $dataset --log-file $logfile --pretrained --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 256 --tunable --plotmode --test-each 20

View File

@@ -0,0 +1,30 @@
#!/usr/bin/env bash
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
python main_deep_learning.py $dataset --pretrained --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 256 --tunable --plotmode --test-each 20

View File

@@ -0,0 +1,16 @@
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle
seeds='5' #2 3 4 5 6 7 8 9 10'
for seed in $seeds
do
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce.csv --supervised --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_trainable.csv --supervised --tunable --seed $seed
python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_post_wce_muse_static.csv --posteriors --supervised --pretrained --seed $seed --force
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_muse.csv --pretrained --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_muse_trainable.csv --pretrained --tunable --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_muse.csv --supervised --pretrained --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_muse_trainable40000.csv --supervised --pretrained --tunable --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_post_wce_muse_trainable.csv --posteriors --supervised --pretrained --tunable --seed $seed --force
done

View File

@@ -0,0 +1,20 @@
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
seeds='1 ' #2 3 4 5' # 6 7 8 9 10'
for seed in $seeds
do
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce.csv --supervised --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_trainable.csv --supervised --tunable --seed $seed
python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_static_plotmode.csv --posteriors --supervised --pretrained --seed $seed --plotmode --test-each 200
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_muse.csv --pretrained --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_muse_trainable.csv --pretrained --tunable --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_muse.csv --supervised --pretrained --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_muse_trainable.csv --supervised --pretrained --tunable --seed $seed
# python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_static.csv --posteriors --supervised --pretrained --seed $seed
# python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_trainable_plotmode.csv --posteriors --supervised --pretrained --tunable --seed $seed --plotmode --test-each 200
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_trainable.csv --posteriors --supervised --pretrained --tunable --seed $seed
done

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env bash
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
######################################## POSTERIORS
# Posteriors
python main_multimodal_cls.py $dataset -P # + zscore
python main_multimodal_cls.py $dataset -P -z # +l2norm
python main_multimodal_cls.py $dataset -P -z --l2 # +feature weight
######################################### WCE
#WCE supervised
python main_multimodal_cls.py $dataset -S # + zscore
python main_multimodal_cls.py $dataset -S -z # +l2norm
python main_multimodal_cls.py $dataset -S -z --l2 # +feature weight
python main_multimodal_cls.py $dataset -S -z -r --l2 # + SIF - PCA
python main_multimodal_cls.py $dataset -S -z -p 250 --l2 # +feature weight + pca
python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 # + SIF
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight ig # -feature weight
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -S -z -p 250 --l2 --feat-weight ig # + pca
python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -p 250 --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 --feat-weight pmi
################################# MUSE
# MUSE unsupervised
python main_multimodal_cls.py $dataset -U # + zscore
python main_multimodal_cls.py $dataset -U -z # +l2norm
python main_multimodal_cls.py $dataset -U -z --l2 # +feature weight
python main_multimodal_cls.py $dataset -U -z -r --l2 # + SIF - PCA
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight ig # -feature weight + pca
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight pmi

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env bash
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
######################################## POSTERIORS
# Posteriors
python main_multimodal_cls.py $dataset -P # + zscore
python main_multimodal_cls.py $dataset -P -z # +l2norm
python main_multimodal_cls.py $dataset -P -z --l2 # +feature weight
######################################### WCE
#WCE supervised
python main_multimodal_cls.py $dataset -S # + zscore
python main_multimodal_cls.py $dataset -S -z # +l2norm
python main_multimodal_cls.py $dataset -S -z --l2 # +feature weight
python main_multimodal_cls.py $dataset -S -z -r --l2 # + SIF - PCA
python main_multimodal_cls.py $dataset -S -z -p 50 --l2 # +feature weight + pca
python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 # + SIF
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight ig # -feature weight
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -S -z -p 50 --l2 --feat-weight ig # + pca
python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -p 50 --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 --feat-weight pmi
################################# MUSE
# MUSE unsupervised
python main_multimodal_cls.py $dataset -U # + zscore
python main_multimodal_cls.py $dataset -U -z # +l2norm
python main_multimodal_cls.py $dataset -U -z --l2 # +feature weight
python main_multimodal_cls.py $dataset -U -z -r --l2 # + SIF - PCA
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight ig # -feature weight + pca
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight pmi

View File

@@ -0,0 +1,6 @@
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
seeds='1 2 3 4 5 6 7 8 9 10'
for seed in $seeds
do
python main_deep_learning.py $dataset --log-file ../log/time_GRU.csv --supervised --nepochs 50 --seed $seed
done

View File

@@ -1,15 +1,15 @@
import numpy as np
import time
from embeddings.embeddings import WordEmbeddings, StorageEmbeddings
# from embeddings.embeddings import WordEmbeddings, StorageEmbeddings
from scipy.sparse import issparse
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
# from sklearn.model_selection import KFold
from joblib import Parallel, delayed
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers.StandardizeTransformer import StandardizeTransformer
from sklearn.decomposition import PCA
from models.cnn_class_bu import CNN_pdr
# from sklearn.feature_extraction.text import TfidfVectorizer
# from util_transformers.StandardizeTransformer import StandardizeTransformer
# from sklearn.decomposition import PCA
# from models.cnn_class_bu import CNN_pdr
def _sort_if_sparse(X):
@@ -40,154 +40,154 @@
def best_params(self): return {}
class FunnellingPolylingualClassifier:
"""
This classifier projects each document d into a language-independent feature space where each dimension fi is the
decision score phi_l(d,ci) of an auxiliary classifier phi_l trained on category ci for documents in language l;
then trains one single classifier for all documents in this space, irrespective of their original language
"""
def __init__(self, first_tier_learner, meta_learner, first_tier_parameters=None, meta_parameters=None, folded_projections=1,
calmode='cal', n_jobs=-1):
"""
:param first_tier_learner: the learner used in the first-tier level
:param meta_learner: the learner used in the second-tier level
:param first_tier_parameters: parameters for the learner in the doc_projector
:param meta_parameters: parameters for the learner in the z-space
:param folded_projections: if 1, the model trains the auxiliary classifiers with all training data and
projects the data before training the final classifier; if greater than one, the training set is split in as
many folds as indicated, and the projected space is composed by concatenating each fold's predictions based on
models trained on the remaining folds. This should increase the generality of the space to unseen data.
:param calmode: 'cal' to calibrate the base classifiers, 'nocal' to use the decision_function instead, or
'sigmoid' to use the sigmoid of the decision_function
:param n_jobs: number of parallel threads
"""
assert folded_projections>0, "positive number of folds expected"
assert calmode in ['cal','nocal','sigmoid'], 'unknown calmode'
assert calmode!='cal' or first_tier_learner.probability, 'calmode=cal requires the learner to have probability=True'
self.fist_tier_learner = first_tier_learner
self.meta_learner = meta_learner
self.fist_tier_parameters=first_tier_parameters
self.meta_parameters = meta_parameters
self.doc_projector = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs)
self.doc_projector_bu = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs)
self.folded_projections = folded_projections
self.n_jobs = n_jobs
self.calmode = calmode
def _projection(self, doc_projector, lX):
"""
Decides the projection function to be applied; predict_proba if the base classifiers are calibrated or
decision_function if otherwise
:param doc_projector: the document projector (a NaivePolylingualClassifier)
:param lX: {lang:matrix} to train
:return: the projection, applied with predict_proba or decision_function
"""
if self.calmode=='cal':
return doc_projector.predict_proba(lX)
else:
l_decision_scores = doc_projector.decision_function(lX)
if self.calmode=='sigmoid':
def sigmoid(x): return 1 / (1 + np.exp(-x))
for lang in l_decision_scores.keys():
l_decision_scores[lang] = sigmoid(l_decision_scores[lang])
return l_decision_scores
def _get_zspace(self, lXtr, lYtr, lXproj=None, lYproj=None):
"""
Produces the vector space of posterior probabilities (in case the first-tier is calibrated) or of
decision scores (if otherwise). This space is here named zspace.
:param lXtr: {lang:matrix} to train
:param lYtr: {lang:labels} to train
:param lXproj: {lang:matrix} to project (if None, then projects the lXtr)
:param lYproj: {lang:labels} to stack in the same order (if None, then lYtr will be stacked)
:return: the projection of lXproj documents into the Z-space defined by the confidence scores of language-specific
models trained on lXtr, and the lYproj labels stacked consistently
"""
repair_empty_folds = True
if lXproj is None and lYproj is None:
lXproj, lYproj = lXtr, lYtr
repair_empty_folds = False
print('fitting the projectors... {}'.format(lXtr.keys()))
self.doc_projector.fit(lXtr, lYtr)
print('projecting the documents')
langs = list(lXtr.keys())
lZ = self._projection(self.doc_projector, lXproj)
# if repair_empty_folds: #empty folds are replaced by the posterior probabilities generated by the non-folded version
empty_categories = self.doc_projector.empty_categories
lZ_bu = self._projection(self.doc_projector_bu, lXproj)
for lang in langs:
repair = empty_categories[lang]
lZ[lang][:,repair] = lZ_bu[lang][:,repair]
Z = np.vstack([lZ[lang] for lang in langs]) # Z is the language independent space
zy = np.vstack([lYproj[lang] for lang in langs])
return Z, zy
def _get_zspace_folds(self, lX, ly):
self.doc_projector_bu.fit(lX, ly)
print('split of {} folds'.format(self.folded_projections))
skf = KFold(n_splits=self.folded_projections, shuffle=True)
Z, zy = [], []
lfold = {lang: list(skf.split(lX[lang], ly[lang])) for lang in lX.keys()}
for fold in range(self.folded_projections):
print('fitting the projectors ({}/{})...'.format(fold + 1, self.folded_projections))
lfoldXtr, lfoldYtr = {}, {}
lfoldXte, lfoldYte = {}, {}
for lang in lX.keys():
train, test = lfold[lang][fold]
lfoldXtr[lang] = lX[lang][train]
lfoldYtr[lang] = ly[lang][train]
lfoldXte[lang] = lX[lang][test]
lfoldYte[lang] = ly[lang][test]
Zfold, zYfold = self._get_zspace(lfoldXtr, lfoldYtr, lfoldXte, lfoldYte)
Z.append(Zfold)
zy.append(zYfold)
# compose the Z-space as the union of all folded predictions
Z = np.vstack(Z)
zy = np.vstack(zy)
# refit the document projector with all examples to have a more reliable projector for test data
self.doc_projector = self.doc_projector_bu
return Z, zy
def fit(self, lX, ly, lZ=None, lzy=None):
tinit = time.time()
Z, zy = self._get_zspace(lX, ly) if self.folded_projections == 1 else self._get_zspace_folds(lX, ly)
#experimental: adds the posterior probabilities (computed outside) to the meta-classifier
if lZ is not None and lzy is not None:
zlangs = list(lZ.keys())
Z = np.vstack((Z, *[lZ[l] for l in zlangs]))
zy = np.vstack((zy, *[lzy[l] for l in zlangs]))
print('fitting the Z-space of shape={}'.format(Z.shape))
self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters, n_jobs=self.n_jobs)
self.model.fit(Z, zy)
self.time = time.time() - tinit
return self
def predict(self, lX, lZ=None):
"""
:param lX: a dictionary {language_label: X csr-matrix}
:param lZ: a dictionary {language_label: Z matrix}; if specified, concats this representation
:return: a dictionary of predictions
"""
lZ_ = self._projection(self.doc_projector, lX)
if lZ is not None:
lZ_ = {**lZ_, **lZ}
return _joblib_transform_multiling(self.model.predict, lZ_, n_jobs=self.n_jobs)
def best_params(self):
params = self.doc_projector.best_params()
params['meta'] = self.model.best_params()
return params
# class FunnellingPolylingualClassifier:
# """
# This classifier projects each document d into a language-independent feature space where each dimension fi is the
# decision score phi_l(d,ci) of an auxiliary classifier phi_l trained on category ci for documents in language l;
# then trains one single classifier for all documents in this space, irrespective of their originary language
# """
# def __init__(self, first_tier_learner, meta_learner, first_tier_parameters=None, meta_parameters=None, folded_projections=1,
# calmode='cal', n_jobs=-1):
# """
# :param first_tier_learner: the learner used in the first-tier level
# :param meta_learner: the learner used in the second-tier level
# :param first_tier_parameters: parameters for the learner in the doc_projector
# :param meta_parameters: parameters for the learner in the z-space
# :param folded_projections: if 1 then the model trains the auxiliar classifiers with all training data and
# :param calmode: 'cal' to calibrate the base classifiers, 'nocal' to use the decision_function instead, or
# :param n_jobs: number of parallel threads
# 'sigmoid' to use the sigmoid of the decision_function
# projects the data before training the final classifier; if greater than one, the training set is split in as
# many folds as indicated, and the projected space is composed by concatenating each fold prediction based on
# models trained on the remaining folds. This should increase the generality of the space to unseen data.
# """
# assert folded_projections>0, "positive number of folds expected"
# assert calmode in ['cal','nocal','sigmoid'], 'unknown calmode'
# assert calmode!='cal' or first_tier_learner.probability, 'calmode=cal requires the learner to have probability=True'
#
# self.fist_tier_learner = first_tier_learner
# self.meta_learner = meta_learner
# self.fist_tier_parameters=first_tier_parameters
# self.meta_parameters = meta_parameters
# self.doc_projector = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs)
# self.doc_projector_bu = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs)
# self.folded_projections = folded_projections
# self.n_jobs = n_jobs
# self.calmode = calmode
#
# def _projection(self, doc_projector, lX):
# """
# Decides the projection function to be applied; predict_proba if the base classifiers are calibrated or
# decision_function if otherwise
# :param doc_projector: the document projector (a NaivePolylingualClassifier)
# :param lX: {lang:matrix} to train
# :return: the projection, applied with predict_proba or decision_function
# """
# if self.calmode=='cal':
# return doc_projector.predict_proba(lX)
# else:
# l_decision_scores = doc_projector.decision_function(lX)
# if self.calmode=='sigmoid':
# def sigmoid(x): return 1 / (1 + np.exp(-x))
# for lang in l_decision_scores.keys():
# l_decision_scores[lang] = sigmoid(l_decision_scores[lang])
# return l_decision_scores
#
# def _get_zspace(self, lXtr, lYtr, lXproj=None, lYproj=None):
# """
# Produces the vector space of posterior probabilities (in case the first-tier is calibrated) or of
# decision scores (if otherwise). This space is here named zspace.
# :param lXtr: {lang:matrix} to train
# :param lYtr: {lang:labels} to train
# :param lXproj: {lang:matrix} to project (if None, then projects the lXtr)
# :param lYproj: {lang:labels} to stack in the same order (if None, then lYtr will be stacked)
# :return: the projection of lXproj documents into the Z-space defined by the confidence scores of language-specific
# models trained on lXtr, and the lYproj labels stacked consistently
# """
# repair_empty_folds = True
# if lXproj is None and lYproj is None:
# lXproj, lYproj = lXtr, lYtr
# repair_empty_folds = False
#
# print('fitting the projectors... {}'.format(lXtr.keys()))
# self.doc_projector.fit(lXtr, lYtr)
#
# print('projecting the documents')
# langs = list(lXtr.keys())
# lZ = self._projection(self.doc_projector, lXproj)
#
# # if repair_empty_folds: #empty folds are replaced by the posterior probabilities generated by the non-folded version
# empty_categories = self.doc_projector.empty_categories
# lZ_bu = self._projection(self.doc_projector_bu, lXproj)
#
# for lang in langs:
# repair = empty_categories[lang]
# lZ[lang][:,repair] = lZ_bu[lang][:,repair]
#
# Z = np.vstack([lZ[lang] for lang in langs]) # Z is the language independent space
# zy = np.vstack([lYproj[lang] for lang in langs])
# return Z, zy
#
# def _get_zspace_folds(self, lX, ly):
# self.doc_projector_bu.fit(lX, ly)
#
# print('split of {} folds'.format(self.folded_projections))
# skf = KFold(n_splits=self.folded_projections, shuffle=True)
#
# Z, zy = [], []
# lfold = {lang: list(skf.split(lX[lang], ly[lang])) for lang in lX.keys()}
# for fold in range(self.folded_projections):
# print('fitting the projectors ({}/{})...'.format(fold + 1, self.folded_projections))
# lfoldXtr, lfoldYtr = {}, {}
# lfoldXte, lfoldYte = {}, {}
# for lang in lX.keys():
# train, test = lfold[lang][fold]
# lfoldXtr[lang] = lX[lang][train]
# lfoldYtr[lang] = ly[lang][train]
# lfoldXte[lang] = lX[lang][test]
# lfoldYte[lang] = ly[lang][test]
# Zfold, zYfold = self._get_zspace(lfoldXtr, lfoldYtr, lfoldXte, lfoldYte)
# Z.append(Zfold)
# zy.append(zYfold)
# # compose the Z-space as the union of all folded predictions
# Z = np.vstack(Z)
# zy = np.vstack(zy)
# # refit the document projector with all examples to have a more reliable projector for test data
# self.doc_projector = self.doc_projector_bu
# return Z, zy
#
# def fit(self, lX, ly, lZ=None, lzy=None):
# tinit = time.time()
# Z, zy = self._get_zspace(lX, ly) if self.folded_projections == 1 else self._get_zspace_folds(lX, ly)
#
# #experimental: adds the posterior probabilities (computed outside) to the meta-classifier
# if lZ is not None and lzy is not None:
# zlangs = list(lZ.keys())
# Z = np.vstack((Z, *[lZ[l] for l in zlangs]))
# zy = np.vstack((zy, *[lzy[l] for l in zlangs]))
#
# print('fitting the Z-space of shape={}'.format(Z.shape))
# self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters, n_jobs=self.n_jobs)
# self.model.fit(Z, zy)
# self.time = time.time() - tinit
#
# return self
#
# def predict(self, lX, lZ=None):
# """
# :param lX: a dictionary {language_label: X csr-matrix}
# :param lZ: a dictionary {language_label: Z matrix}; if specified, concats this representation
# :return: a dictionary of predictions
# """
# lZ_ = self._projection(self.doc_projector, lX)
# if lZ is not None:
# lZ_ = {**lZ_, **lZ}
# return _joblib_transform_multiling(self.model.predict, lZ_, n_jobs=self.n_jobs)
#
# def best_params(self):
# params = self.doc_projector.best_params()
# params['meta'] = self.model.best_params()
# return params
class NaivePolylingualClassifier:
@@ -322,411 +322,4 @@ class MonolingualClassifier:
return self.model.predict(X)
def best_params(self):
return self.best_params_
class FunnellingMultimodal(FunnellingPolylingualClassifier):
def __init__(self,
we_path,
config,
first_tier_learner,
meta_learner,
first_tier_parameters=None,
meta_parameters=None,
folded_projections=1,
calmode='cal',
n_jobs=-1):
super().__init__(first_tier_learner,
meta_learner,
first_tier_parameters,
meta_parameters,
folded_projections,
calmode,
n_jobs)
self.pca_independent_space = PCA(n_components=50)
self.we_path = we_path
self.config = config
self.lang_word2idx = dict()
self.languages = []
self.lang_tfidf = {}
self.embedding_space = None
self.model = None
self.time = None
self.best_components = 'not set' # if auto optimize pca, it will store the optimal number of components
def vectorize(self, lX, prediction=False):
langs = list(lX.keys())
print(f'# tfidf-vectorizing docs')
if prediction:
for lang in langs:
assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
tfidf_vectorizer = self.lang_tfidf[lang]
lX[lang] = tfidf_vectorizer.transform(lX[lang])
return self
for lang in langs:
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
self.languages.append(lang)
tfidf_vectorizer.fit(lX[lang])
lX[lang] = tfidf_vectorizer.transform(lX[lang])
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
self.lang_tfidf[lang] = tfidf_vectorizer
return self
def _get_zspace(self, lXtr, lYtr):
print('\nfitting the projectors... {}'.format(list(lXtr.keys())))
self.doc_projector.fit(lXtr, lYtr)
print('\nprojecting the documents')
lZ = self._projection(self.doc_projector, lXtr)
return lZ, lYtr
def fit(self, lX, ly):
tinit = time.time()
print('Vectorizing documents...')
self.vectorize(lX)
for lang in self.languages:
print(f'{lang}->{lX[lang].shape}')
Z, zy = self._get_zspace(lX, ly)
if self.config['supervised'] or self.config['unsupervised']:
self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly)
_embedding_space = self.embedding_space.transform(self.config, lX)
if self.config['max_label_space'] == 0:
_cum_dimension = _embedding_space[list(_embedding_space.keys())[0]].shape[1]
if _cum_dimension - 300 > 0:
_temp = _cum_dimension - 300
else:
_temp = _cum_dimension
self.best_components = _temp
# h_stacking posterior probabilities with (U) and/or (S) matrices
for lang in self.languages:
Z[lang] = np.hstack((Z[lang], _embedding_space[lang]))
# stacking Z space vertically
_vertical_Z = np.vstack([Z[lang] for lang in self.languages])
_vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
self.standardizer = StandardizeTransformer()
_vertical_Z = self.standardizer.fit_transform(_vertical_Z)
# todo testing ...
# if self.config['post_pca']:
# print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
# self.pca_independent_space.fit(_vertical_Z)
# _vertical_Z = self.pca_independent_space.transform(_vertical_Z)
print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
n_jobs=self.n_jobs)
self.model.fit(_vertical_Z, _vertical_Zy)
self.time = time.time() - tinit
print(f'\nTotal training time elapsed: {round((self.time/60), 2)} min')
def predict(self, lX, ly):
print('Vectorizing documents')
self.vectorize(lX, prediction=True)
lZ = self._projection(self.doc_projector, lX)
if self.config['supervised'] or self.config['unsupervised']:
_embedding_space = self.embedding_space.transform(self.config, lX)
for lang in lX.keys():
lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang]))
for lang in lZ.keys():
print(lZ[lang].shape)
# todo testing
lZ[lang] = self.standardizer.transform(lZ[lang])
# if self.config['post_pca']:
# print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
# lZ[lang] = self.pca_independent_space.transform(lZ[lang])
return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
class PolylingualEmbeddingsClassifier:
"""
This classifier creates document embeddings by a tfidf weighted average of polylingual embeddings from the article
@article{conneau2017word,
title={Word translation without parallel data},
author={Conneau, Alexis and Lample, Guillaume and Ranzato, Marc'Aurelio and Denoyer, Ludovic and J{\'e}gou, Herv{\'e}},
journal={arXiv preprint arXiv:1710.04087},
year={2017}
}
url: https://github.com/facebookresearch/MUSE
"""
def __init__(self, wordembeddings_path, config, learner, c_parameters=None, n_jobs=-1):
"""
:param wordembeddings_path: the path to the directory containing the polylingual embeddings
:param learner: the learner
:param c_parameters: parameters for learner
:param n_jobs: the number of concurrent threads
"""
self.wordembeddings_path = wordembeddings_path
self.config = config
self.learner = learner
self.c_parameters=c_parameters
self.n_jobs = n_jobs
self.lang_tfidf = {}
self.model = None
self.languages = []
self.lang_word2idx = dict()
self.embedding_space = None
def fit_vectorizers(self, lX):
for lang in lX.keys():
if lang not in self.lang_tfidf:
tfidf = TfidfVectorizer(sublinear_tf=True, use_idf=True) # text is already processed
docs = lX[lang]
tfidf.fit(docs)
self.lang_tfidf[lang] = tfidf
def vectorize(self, lX, prediction=False):
langs = list(lX.keys())
print(f'# tfidf-vectorizing docs')
if prediction:
for lang in langs:
assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
tfidf_vectorizer = self.lang_tfidf[lang]
lX[lang] = tfidf_vectorizer.transform(lX[lang])
return self
for lang in langs:
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
self.languages.append(lang)
tfidf_vectorizer.fit(lX[lang])
lX[lang] = tfidf_vectorizer.transform(lX[lang])
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
self.lang_tfidf[lang] = tfidf_vectorizer
return self
def embed(self, docs, lang):
assert lang in self.lang_tfidf, 'unknown language'
tfidf_vectorizer = self.lang_tfidf[lang]
V = tfidf_vectorizer.vocabulary_
Xweights = tfidf_vectorizer.transform(docs)
print('loading word embeddings for ' + lang)
we = WordEmbeddings.load(self.wordembeddings_path, lang)
nD = len(docs)
doc_vecs = np.zeros((nD, we.dim()))
for i, doc in enumerate(docs):
print('\r\tcomplete {:.3f}%'.format(100 * (i + 1) / nD), end='')
# averaging with tfidf (summing each word only once, since the frequency is already controlled)
for w in set(doc.split()):
if w in we and w in V:
doc_vecs[i] += (we[w] * Xweights[i, V[w]])
# works much worse with idf; works much worse with document l2-normalization
print()
return doc_vecs
def fit(self, lX, ly):
"""
:param lX: a dictionary {language_label: [list of preprocessed documents]}
:param ly: a dictionary {language_label: ndarray of shape (ndocs, ncats) binary labels}
:return: self
"""
tinit = time.time()
langs = list(lX.keys())
WEtr, Ytr = [], []
# self.fit_vectorizers(lX) # if already fit, does nothing
self.vectorize(lX)
# config = {'unsupervised' : False, 'supervised': True}
self.embedding_space = StorageEmbeddings(self.wordembeddings_path).fit(self.config, lX, self.lang_word2idx, ly)
WEtr = self.embedding_space.transform(self.config, lX)
# for lang in langs:
# WEtr.append(self.embed(lX[lang], lang)) # todo embed with other matrices
# Ytr.append(ly[lang])
WEtr = np.vstack([WEtr[lang] for lang in langs])
Ytr = np.vstack([ly[lang] for lang in langs])
self.embed_time = time.time() - tinit
print('fitting the WE-space of shape={}'.format(WEtr.shape))
self.model = MonolingualClassifier(base_learner=self.learner, parameters=self.c_parameters, n_jobs=self.n_jobs)
self.model.fit(WEtr, Ytr)
self.time = time.time() - tinit
return self
def predict(self, lX, lY):
"""
:param lX: a dictionary {language_label: [list of preprocessed documents]}
"""
assert self.model is not None, 'predict called before fit'
self.vectorize(lX, prediction=True)
langs = list(lX.keys())
lWEte = self.embedding_space.transform(self.config, lX)
# lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory
return _joblib_transform_multiling(self.model.predict, lWEte, n_jobs=self.n_jobs)
def predict_proba(self, lX):
"""
:param lX: a dictionary {language_label: [list of preprocessed documents]}
"""
assert self.model is not None, 'predict called before fit'
langs = list(lX.keys())
lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory
return _joblib_transform_multiling(self.model.predict_proba, lWEte, n_jobs=self.n_jobs)
def best_params(self):
return self.model.best_params()
class MonolingualNetSvm:
"""
testing: funnelling with a NN managing word-embedding compositionality. An ensemble of n SVMs (n equal to the
number of training languages) is first fit on the data, generating the document projections in the Z-space. Next,
these projections are fed to a single NN together with their respective document embeddings. The documents are
projected into the embedding space while preserving its dimensionality (output dim is 300). These projections are
horizontally concatenated with the respective posterior-probability projections and passed through an FC layer with
sigmoid activation and output dim equal to the number of target classes.
# TODO ATM testing with only 1 language
"""
def __init__(self, lX, ly, first_tier_learner, first_tier_parameters, n_jobs):
self.lX = lX
self.ly = ly
# SVM Attributes
self.doc_projector = NaivePolylingualClassifier(first_tier_learner, first_tier_parameters,
n_jobs=n_jobs)
self.calmode = 'cal'
self.languages = []
self.lang_word2idx = dict()
self.lang_tfidf = {}
self.base_learner = 'TODO'
self.parameters = 'TODO'
# NN Attributes
self.NN = 'TODO'
def load_preprocessed(self):
"""
In order to speed up the process, documents are tokenized beforehand in the "main" script. Here, the tokenized
docs, word_index, and targets are loaded.
:return: dict[lang] = (word_index, tokenized_docs, targets)
"""
import pickle
with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'rb') as f:
return pickle.load(f)
def _build_embedding_matrix(self, lang, word_index):
"""
build embedding matrix by filtering out OOV embeddings
:param lang: language code of the embeddings to be loaded
:param word_index: word -> index mapping of the dataset vocabulary
:return: filtered embedding matrix
"""
from embeddings.embeddings import EmbeddingsAligned
type = 'MUSE'
path = '/home/andreapdr/CLESA/'
MUSE = EmbeddingsAligned(type, path, lang, word_index.keys())
return MUSE
def get_data_and_embed(self, data_dict):
from keras.preprocessing.sequence import pad_sequences
langs = data_dict.keys()
lang_embedding_matrix = dict()
nn_lXtr = dict()
nn_lytr = dict()
for lang in langs:
lang_embedding_matrix[lang] = self._build_embedding_matrix(lang, data_dict[lang][0])
nn_lXtr[lang] = pad_sequences(data_dict[lang][1], 100, padding='post')
nn_lytr[lang] = [data_dict[lang][2]]
return nn_lXtr, nn_lytr, lang_embedding_matrix
def svm_vectorize(self, lX, prediction=False):
langs = list(lX.keys())
print(f'# tfidf-vectorizing docs')
if prediction:
for lang in langs:
assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
tfidf_vectorizer = self.lang_tfidf[lang]
lX[lang] = tfidf_vectorizer.transform(lX[lang])
return self
for lang in langs:
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
self.languages.append(lang)
tfidf_vectorizer.fit(lX[lang])
lX[lang] = tfidf_vectorizer.transform(lX[lang])
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
self.lang_tfidf[lang] = tfidf_vectorizer
return lX
def _get_zspace(self, lXtr, lYtr):
print('\nfitting the projectors... {}'.format(list(lXtr.keys())))
self.doc_projector.fit(lXtr, lYtr)
print('\nprojecting the documents')
lZ = self._projection(self.doc_projector, lXtr)
return lZ, lYtr
def _projection(self, doc_projector, lX):
"""
Decides the projection function to be applied: predict_proba if the base classifiers are calibrated,
decision_function otherwise
:param doc_projector: the document projector (a NaivePolylingualClassifier)
:param lX: {lang: matrix} of documents to be projected
:return: the projection, applied with predict_proba or decision_function
"""
if self.calmode=='cal':
return doc_projector.predict_proba(lX)
else:
l_decision_scores = doc_projector.decision_function(lX)
if self.calmode=='sigmoid':
def sigmoid(x): return 1 / (1 + np.exp(-x))
for lang in l_decision_scores.keys():
l_decision_scores[lang] = sigmoid(l_decision_scores[lang])
return l_decision_scores
def fit(self):
"""
# 1. Fit SVM to generate posterior probabilities:
# 1.1 Gather documents and vectorize them as in other SVM classifiers
# 2. Fit NN
# 2.1 Gather documents and build NN dataset by indexing wrt embedding matrix
# 2.2 Fit NN first-layer to generate compositional doc embedding
# 2.3 H-stack doc-embed and posterior P
# 2.4 Feed stacked vector to output layer (sigmoid act): output Nc
# 2.5 Train it...
"""
# load pre-processed data
data_dict = self.load_preprocessed()
# build embedding matrices and neural network document training set
nn_lXtr, nn_lytr, lang_embedding_matrix = self.get_data_and_embed(data_dict)
# TF-IDF vectorzing documents for SVM classifier
svm_lX = self.svm_vectorize(self.lX)
# just testing on a smaller subset of data
test_svm_lX = dict()
test_svm_ly = dict()
test_svm_lX['it'] = svm_lX['it'][:10, :]
test_svm_ly['it'] = self.ly['it'][:10, :]
test_nn_data = nn_lXtr['it'][:10]
# projecting document into Z space by SVM
svm_Z, _ = self._get_zspace(test_svm_lX, test_svm_ly)
# initializing net and forward pass
net = CNN_pdr(73, 1, 300, len(lang_embedding_matrix['it'].vectors), 300, lang_embedding_matrix['it'].vectors)
out = net.forward(test_nn_data, svm_Z['it'])
print('TODO')
def net(self):
pass
return self.best_params_

View File

@ -10,7 +10,7 @@ import time
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
from scipy.sparse import issparse, vstack, hstack
from transformers.StandardizeTransformer import StandardizeTransformer
from util_transformers.StandardizeTransformer import StandardizeTransformer
from util.SIF_embed import remove_pc
from sklearn.preprocessing import normalize
from sklearn.svm import SVC
@ -127,22 +127,26 @@ class PosteriorProbabilitiesEmbedder:
print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} documents')
return self.doc_projector.predict_proba(lX)
def _get_output_dim(self):
return len(self.doc_projector.model['da'].model.classes_)
class MuseEmbedder:
def __init__(self, path, lV=None, l2=True, n_jobs=-1, featureweight=FeatureWeight()):
def __init__(self, path, lV=None, l2=True, n_jobs=-1, featureweight=FeatureWeight(), sif=False):
self.path=path
self.lV = lV
self.l2 = l2
self.n_jobs = n_jobs
self.featureweight = featureweight
self.sif = sif
def fit(self, lX, ly, lV=None):
assert lV is not None or self.lV is not None, 'lV not specified'
self.langs = sorted(lX.keys())
self.MUSE = load_muse_embeddings(self.path, self.langs, self.n_jobs)
lWordList = {l:self._get_wordlist_from_word2index(lV[l]) for l in self.langs}
self.MUSE = {l:Muse.extract(lWordList[l]).numpy() for l,Muse in self.MUSE}
self.MUSE = {l:Muse.extract(lWordList[l]).numpy() for l,Muse in self.MUSE.items()}
self.featureweight.fit(lX, ly)
return self
@ -150,7 +154,7 @@ class MuseEmbedder:
MUSE = self.MUSE
lX = self.featureweight.transform(lX)
XdotMUSE = Parallel(n_jobs=self.n_jobs)(
delayed(XdotM)(lX[lang], MUSE[lang]) for lang in self.langs
delayed(XdotM)(lX[lang], MUSE[lang], self.sif) for lang in self.langs
)
lMuse = {l: XdotMUSE[i] for i, l in enumerate(self.langs)}
lMuse = _normalize(lMuse, self.l2)
@ -162,14 +166,18 @@ class MuseEmbedder:
def _get_wordlist_from_word2index(self, word2index):
return list(zip(*sorted(word2index.items(), key=lambda x: x[1])))[0]
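# e.g. {'dog': 0, 'cat': 1, 'bird': 2} -> ('dog', 'cat', 'bird'): the vocabulary words sorted by their index,
# so that row i of the extracted MUSE matrix corresponds to the word with tf-idf index i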
def _get_output_dim(self):
return self.MUSE['da'].shape[1]
class WordClassEmbedder:
def __init__(self, l2=True, n_jobs=-1, max_label_space=300, featureweight=FeatureWeight()):
def __init__(self, l2=True, n_jobs=-1, max_label_space=300, featureweight=FeatureWeight(), sif=False):
self.n_jobs = n_jobs
self.l2 = l2
self.max_label_space=max_label_space
self.featureweight = featureweight
self.sif = sif
def fit(self, lX, ly, lV=None):
self.langs = sorted(lX.keys())
@ -184,7 +192,7 @@ class WordClassEmbedder:
lWCE = self.lWCE
lX = self.featureweight.transform(lX)
XdotWCE = Parallel(n_jobs=self.n_jobs)(
delayed(XdotM)(lX[lang], lWCE[lang])for lang in self.langs
delayed(XdotM)(lX[lang], lWCE[lang], self.sif)for lang in self.langs
)
lwce = {l: XdotWCE[i] for i, l in enumerate(self.langs)}
lwce = _normalize(lwce, self.l2)
@ -193,6 +201,9 @@ class WordClassEmbedder:
def fit_transform(self, lX, ly, lV=None):
return self.fit(lX, ly).transform(lX)
def _get_output_dim(self):
return 73
class DocEmbedderList:
@ -201,6 +212,7 @@ class DocEmbedderList:
if len(embedder_list)==0: embedder_list=[]
self.embedders = embedder_list
self.aggregation = aggregation
print(f'Aggregation mode: {self.aggregation}')
def fit(self, lX, ly, lV=None):
for transformer in self.embedders:
@ -238,16 +250,25 @@ class DocEmbedderList:
langs = sorted(lX.keys())
lZparts = {l: None for l in langs}
# min_dim = min([transformer._get_output_dim() for transformer in self.embedders])
min_dim = 300
for transformer in self.embedders:
lZ = transformer.transform(lX)
nC = min([lZ[lang].shape[1] for lang in langs])
for l in langs:
Z = lZ[l]
if Z.shape[1] > min_dim:
print(f'Space Z matrix has more dimensions ({Z.shape[1]}) than the smallest representation {min_dim}.'
f' Applying PCA(n_components={min_dim})')
pca = PCA(n_components=min_dim)
Z = pca.fit(Z).transform(Z)
if lZparts[l] is None:
lZparts[l] = Z
else:
lZparts[l] += Z
n_transformers = len(self.embedders)
nC = min([lZparts[lang].shape[1] for lang in langs])
return {l:lZparts[l] / n_transformers for l in langs}
@ -266,7 +287,7 @@ class FeatureSet2Posteriors:
self.transformer = transformer
self.l2=l2
self.n_jobs = n_jobs
self.prob_classifier = MetaClassifier(SVC(kernel='rbf', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
self.prob_classifier = MetaClassifier(SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
def fit(self, lX, ly, lV=None):
if lV is None and hasattr(self.transformer, 'lV'):
@ -412,11 +433,13 @@ def word_class_embedding_matrix(X, Y, max_label_space=300):
return WCE
def XdotM(X,M):
def XdotM(X,M, sif):
# return X.dot(M)
# print(f'X={X.shape}, M={M.shape}')
print(f'X={X.shape}, M={M.shape}')
E = X.dot(M)
E = remove_pc(E, npc=1)
if sif:
print("removing pc...")
E = remove_pc(E, npc=1)
return E
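# Background (assumption, not stated in this commit): removing the first principal component is the
# post-processing step of the SIF scheme (Arora et al., 2017). A rough sketch of what remove_pc(E, npc=1)
# is expected to compute:
#   from sklearn.decomposition import TruncatedSVD
#   pc = TruncatedSVD(n_components=1, n_iter=7).fit(E).components_   # (1, dim) first principal component
#   E = E - E.dot(pc.T).dot(pc)                                      # project it out of each row of E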

View File

@ -1,92 +0,0 @@
from optparse import OptionParser
from util.results import PolylingualClassificationResults
from dataset_builder import MultilingualDataset
from keras.preprocessing.text import Tokenizer
from learning.learners import MonolingualNetSvm
from sklearn.svm import SVC
import pickle
parser = OptionParser()
parser.add_option("-d", "--dataset", dest="dataset",
help="Path to the multilingual dataset processed and stored in .pickle format",
default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
help="Optimize hyperparameters", default=False)
parser.add_option("-s", "--set_c", dest="set_c",type=float,
help="Set the C parameter", default=1)
(op, args) = parser.parse_args()
###################################################################################################################
def get_learner(calibrate=False, kernel='linear'):
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto')
def get_params(dense=False):
if not op.optimc:
return None
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
kernel = 'rbf' if dense else 'linear'
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
# PREPROCESS TEXT AND SAVE IT ... both for SVM and NN
def preprocess_data(lXtr, lXte, lytr, lyte):
tokenized_tr = dict()
tokenized_te = dict()
for lang in lXtr.keys():
alltexts = ' '.join(lXtr[lang])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(alltexts.split(' '))
tokenizer.oov_token = len(tokenizer.word_index)+1
# dumping train set
sequences_tr = tokenizer.texts_to_sequences(lXtr[lang])
tokenized_tr[lang] = (tokenizer.word_index, sequences_tr, lytr[lang])
# dumping test set
sequences_te = tokenizer.texts_to_sequences(lXte[lang])
tokenized_te[lang] = (tokenizer.word_index, sequences_te, lyte[lang])
with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'wb') as f:
pickle.dump(tokenized_tr, f)
with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_test.pickle', 'wb') as f:
pickle.dump(tokenized_tr, f)
print('Successfully dumped data')
# def load_preprocessed():
# with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'rb') as f:
# return pickle.load(f)
#
# def build_embedding_matrix(lang, word_index):
# type = 'MUSE'
# path = '/home/andreapdr/CLESA/'
# MUSE = EmbeddingsAligned(type, path, lang, word_index.keys())
# return MUSE
########## MAIN #################################################################################################
if __name__ == '__main__':
results = PolylingualClassificationResults('./results/NN_FPEC_results.csv')
data = MultilingualDataset.load(op.dataset)
lXtr, lytr = data.training()
lXte, lyte = data.test()
if op.set_c != -1:
meta_parameters = None
else:
meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}]
test_architecture = MonolingualNetSvm(lXtr,
lytr,
first_tier_learner=get_learner(calibrate=True),
first_tier_parameters=None,
n_jobs=1)
test_architecture.fit()

View File

@ -1,6 +1,6 @@
import argparse
import torch.nn as nn
from torch.optim.lr_scheduler import StepLR
from torch.optim.lr_scheduler import StepLR, MultiStepLR
from dataset_builder import MultilingualDataset
from learning.transformers import load_muse_embeddings
from models.lstm_class import RNNMultilingualClassifier
@ -9,8 +9,6 @@ from util.early_stop import EarlyStopping
from util.common import *
from util.file import create_if_not_exist
from time import time
from embeddings.pretrained import *
from os.path import join
from tqdm import tqdm
from util.evaluation import evaluate
from util.file import get_file_name
@ -100,7 +98,7 @@ def main():
# Loading the dataset
data = MultilingualDataset.load(opt.dataset)
# data.set_view(languages=['de', 'fr', 'sv', 'da', 'es', 'it'])
data.set_view(languages=['de', 'fr']) #, 'it', 'en']) # 'sv', 'da', 'es', 'it'])
data.show_dimensions()
langs = data.langs()
l_devel_raw, l_devel_target = data.training(target_as_csr=True)
@ -108,6 +106,7 @@ def main():
# Loading the MUSE pretrained embeddings (only if requested)
lpretrained, lpretrained_vocabulary = load_pretrained_embeddings(opt.we_path, langs)
# lpretrained_vocabulary = none_dict(langs) # do not keep track of words known in pretrained embeddings vocabulary that are also present in test set
# Data preparation: indexing / splitting / embedding matrices (pretrained + supervised) / posterior probs
multilingual_index = MultilingualIndex()
@ -115,10 +114,26 @@ def main():
multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=opt.seed)
multilingual_index.embedding_matrices(lpretrained, opt.supervised)
if opt.posteriors:
lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=opt.svm_max_docs)
lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=5000, store_posteriors=True) #stored_post=True) #opt.svm_max_docs)
else:
lPtr, lPva, lPte = None, None, None
# just_test = False
# if just_test:
#
# model = torch.load(
# '../checkpoint/rnn(H512)-Muse-WCE-Posteriors-(trainable)-jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle')
# criterion = torch.nn.BCEWithLogitsLoss().cuda()
#
# # batcher_train = Batch(opt.batch_size, batches_per_epoch=10, languages=langs, lpad=multilingual_index.l_pad())
#
# batcher_eval = Batch(opt.batch_size, batches_per_epoch=-1, languages=langs, lpad=multilingual_index.l_pad())
# l_test_index = multilingual_index.l_test_index()
# epoch = 1
# tinit = time()
# test(model, batcher_eval, l_test_index, lPte, l_test_target, tinit, epoch, logfile, criterion, 'te')
# exit('Loaded')
# Model initialization
model = init_Net(data.num_categories(), multilingual_index)
@ -130,7 +145,7 @@ def main():
tinit = time()
create_if_not_exist(opt.checkpoint_dir)
early_stop = EarlyStopping(model, patience=opt.patience, checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}')
early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience, checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}')
l_train_index, l_train_target = multilingual_index.l_train()
l_val_index, l_val_target = multilingual_index.l_val()
@ -155,7 +170,6 @@ def main():
break
# training is over
# restores the best model according to the Mf1 of the validation set (only when plotmode==False)
# stoptime = early_stop.stop_time - tinit
# stopepoch = early_stop.best_epoch
@ -164,6 +178,8 @@ def main():
if opt.plotmode==False:
print('-' * 80)
print('Training over. Performing final evaluation')
# torch.cuda.empty_cache()
model = early_stop.restore_checkpoint()
if opt.val_epochs>0:
@ -183,10 +199,14 @@ def get_lr(optimizer):
def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile, criterion, optim, epoch, method_name):
_dataset_path = opt.dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1]
loss_history = []
model.train()
for idx, (batch, post, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, lytr)):
optim.zero_grad()
_out = model(batch,post, lang)
loss = criterion(model(batch, post, lang), target)
loss.backward()
clip_gradient(model)
@ -195,7 +215,7 @@ def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile,
if idx % opt.log_interval == 0:
interval_loss = np.mean(loss_history[-opt.log_interval:])
print(f'{opt.dataset} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
mean_loss = np.mean(interval_loss)
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
@ -203,6 +223,8 @@ def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile,
def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logfile, criterion, measure_prefix):
loss_history = []
model.eval()
langs = sorted(ltest_index.keys())
predictions = {l:[] for l in langs}
@ -214,6 +236,7 @@ def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logf
prediction = predict(logits)
predictions[lang].append(prediction)
yte_stacked[lang].append(target.detach().cpu().numpy())
loss_history.append(loss)
ly = {l:np.vstack(yte_stacked[l]) for l in langs}
ly_ = {l:np.vstack(predictions[l]) for l in langs}
@ -224,17 +247,15 @@ def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logf
metrics.append([macrof1, microf1, macrok, microk])
if measure_prefix=='te':
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
# results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
# (config['max_label_space'], classifier.best_components),
# config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
# lang, macrof1, microf1, macrok, microk, '')
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=tend)
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mf1, timelapse=tend)
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-accuracy', value=acc, timelapse=tend)
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=loss, timelapse=tend)
mean_loss = np.mean(loss_history)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit)
return Mf1

View File

@ -1,7 +1,7 @@
import os
from dataset_builder import MultilingualDataset
# from learning.learners import *
from learning.learners import FunnellingMultimodal
# from learning.learners import FunnellingMultimodal
from learning.transformers import Funnelling, PosteriorProbabilitiesEmbedder, MetaClassifier, \
TfidfVectorizerMultilingual, DocEmbedderList, WordClassEmbedder, MuseEmbedder, FeatureSet2Posteriors, Voting
from util.evaluation import *
@ -14,14 +14,14 @@ from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
parser = OptionParser()
parser.add_option("-d", "--dataset", dest="dataset",
help="Path to the multilingual dataset processed and stored in .pickle format",
default="../rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
# parser.add_option("-d", "--dataset", dest="dataset",
# help="Path to the multilingual dataset processed and stored in .pickle format",
# default="../rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
parser.add_option("-o", "--output", dest="output",
help="Result file", type=str, default='./results/results.csv')
parser.add_option("-P", "--probs", dest="probs", action='store_true',
parser.add_option("-P", "--probs", dest="posteriors", action='store_true',
help="Add posterior probabilities to the document embedding representation", default=False)
parser.add_option("-S", "--supervised", dest="supervised", action='store_true',
@ -46,6 +46,9 @@ parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
default=300)
parser.add_option("-r", "--remove-pc", dest="sif", action='store_true',
help="Remove common component when computing dot product of word embedding matrices", default=False)
# parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
# help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix."
# " If set to 0 it will automatically search for the best number of components", default=300)
@ -72,15 +75,18 @@ def get_params(dense=False):
if __name__ == '__main__':
(op, args) = parser.parse_args()
assert exists(op.dataset), 'Unable to find file '+str(op.dataset)
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
assert op.probs or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)'
dataset = args[0]
dataset_file = os.path.basename(op.dataset)
assert exists(dataset), 'Unable to find file '+str(dataset)
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
dataset_file = os.path.basename(dataset)
results = PolylingualClassificationResults(op.output)
data = MultilingualDataset.load(op.dataset)
data = MultilingualDataset.load(dataset)
data.show_dimensions()
lXtr, lytr = data.training()
@ -88,8 +94,9 @@ if __name__ == '__main__':
meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
result_id = f'{dataset_file}_Prob{op.probs}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}'
# result_id = f'{dataset_file}_Prob{op.posteriors}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}'
result_id = f'{dataset_file}_ProbPost={op.posteriors}_WCE={op.supervised}(PCA={op.max_labels_S})_' \
f'MUSE={op.pretrained}_weight={"todo"}_l2={"todo"}_zscore={"todo"}{"_optimC" if op.optimc else ""}'
print(f'{result_id}')
# text preprocessing
@ -100,7 +107,7 @@ if __name__ == '__main__':
lV = tfidfvectorizer.vocabulary()
classifiers = []
if op.probs:
if op.posteriors:
classifiers.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None))
if op.supervised:
classifiers.append(FeatureSet2Posteriors(WordClassEmbedder(max_label_space=op.max_labels_S)))
@ -115,13 +122,37 @@ if __name__ == '__main__':
print('\n# Evaluating ...')
l_eval = evaluate_method(classifier, lXte, lyte)
# renaming arguments to be printed on log
_id = ''
_id_conf = [op.posteriors, op.supervised, op.pretrained]
_id_name = ['+P', '+W', '+M']
for i, conf in enumerate(_id_conf):
if conf:
_id += _id_name[i]
_id = _id.lstrip('+')
_dataset_path = dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1]
metrics = []
for lang in lXte.keys():
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
# results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
# (config['max_label_space'], classifier.best_components),
# config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
# lang, macrof1, microf1, macrok, microk, '')
results.add_row(method='Voting',
learner='svm',
optimp=op.optimc,
sif=op.sif,
zscore='todo',
l2='todo',
wescaler='todo',
pca=op.max_labels_S,
id=_id,
dataset=dataset_id,
time='todo',
lang=lang,
macrof1=macrof1,
microf1=microf1,
macrok=macrok,
microk=microk,
notes='')
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))

View File

@ -11,7 +11,7 @@ from sklearn.svm import SVC
parser = OptionParser(usage="usage: %prog datapath [options]")
parser.add_option("-o", "--output", dest="output",
help="Result file", type=str, default='./results/results.csv')
help="Result file", type=str, default='multiModal_log.csv')
parser.add_option("-P", "--posteriors", dest="posteriors", action='store_true',
help="Add posterior probabilities to the document embedding representation", default=False)
@ -22,8 +22,8 @@ parser.add_option("-S", "--supervised", dest="supervised", action='store_true',
parser.add_option("-U", "--pretrained", dest="pretrained", action='store_true',
help="Add pretrained MUSE embeddings to the document embedding representation", default=False)
parser.add_option("--nol2", dest="nol2", action='store_true',
help="Deactivates l2 normalization as a post-processing for the document embedding views", default=False)
parser.add_option("--l2", dest="l2", action='store_true',
help="Activates l2 normalization as a post-processing for the document embedding views", default=False)
parser.add_option("--allprob", dest="allprob", action='store_true',
help="All views are generated as posterior probabilities. This affects the supervised and pretrained "
@ -48,11 +48,28 @@ parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
default=300)
parser.add_option("-r", "--remove-pc", dest="sif", action='store_true',
help="Remove common component when computing dot product of word embedding matrices", default=False)
parser.add_option("-z", "--zscore", dest="zscore", action='store_true',
help="Z-score normalize matrices (WCE and MUSE)", default=False)
parser.add_option("-a", "--agg", dest="agg", action='store_true',
help="Set aggregation function of the common Z-space to average (Default: concatenation)", default=False)
def get_learner(calibrate=False, kernel='linear'):
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, gamma='auto')
def get_params():
if not op.optimc:
return None
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
kernel = 'rbf'
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
#######################################################################################################################
@ -64,17 +81,23 @@ if __name__ == '__main__':
assert exists(dataset), 'Unable to find file '+str(dataset)
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
l2=(op.nol2==False)
l2=op.l2
dataset_file = os.path.basename(dataset)
results = PolylingualClassificationResults(op.output)
results = PolylingualClassificationResults('../log/' + op.output)
allprob='Prob' if op.allprob else ''
result_id = f'{dataset_file}_ProbPost={op.posteriors}_{allprob}WCE={op.supervised}(PCA={op.max_labels_S})_{allprob}' \
f'MUSE={op.pretrained}_weight={op.feat_weight}_l2={l2}{"_optimC" if op.optimc else ""}'
f'MUSE={op.pretrained}_weight={op.feat_weight}_l2={l2}_zscore={op.zscore}{"_optimC" if op.optimc else ""}'
print(f'{result_id}')
# set z-score range - with slice(0, 0) the mean is set to 0 and the std to 1, so standardization has no effect
standardize_range = slice(0,0)
if op.zscore:
standardize_range = None
data = MultilingualDataset.load(dataset)
# data.set_view(languages=['fr', 'it'])
data.show_dimensions()
lXtr, lytr = data.training()
lXte, lyte = data.test()
@ -86,23 +109,23 @@ if __name__ == '__main__':
feat_weighting = FeatureWeight(op.feat_weight, agg='mean')
# # document embedding modules
doc_embedder = DocEmbedderList(aggregation='concat')
doc_embedder = DocEmbedderList(aggregation='mean' if op.agg else 'concat')
if op.posteriors:
doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True, kernel='linear'), l2=l2))
if op.supervised:
wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting)
wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, sif=op.sif)
if op.allprob:
wce = FeatureSet2Posteriors(wce, l2=l2)
doc_embedder.append(wce)
if op.pretrained:
muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting)
muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif)
if op.allprob:
muse = FeatureSet2Posteriors(muse, l2=l2)
doc_embedder.append(muse)
# metaclassifier
meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'), meta_parameters=meta_parameters)
meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'), meta_parameters=get_params(), standardize_range=standardize_range)
# ensembling the modules
classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)
@ -113,13 +136,40 @@ if __name__ == '__main__':
print('\n# Evaluating ...')
l_eval = evaluate_method(classifier, lXte, lyte)
# renaming arguments to be printed on log
_id = ''
_id_conf = [op.posteriors, op.supervised, op.pretrained]
_id_name = ['+P', '+W', '+M']
for i, conf in enumerate(_id_conf):
if conf:
_id += _id_name[i]
_id = _id.lstrip('+')
_id = _id if not op.agg else _id + '_mean'
_id = _id if not op.allprob else _id + '_allprob'
_dataset_path = dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1]
metrics = []
for lang in lXte.keys():
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
# results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
# (config['max_label_space'], classifier.best_components),
# config['dim_reduction_unsupervised'], op.optimc, dataset.split('/')[-1], classifier.time,
# lang, macrof1, microf1, macrok, microk, '')
results.add_row(method='MultiModal',
learner='svm',
optimp=op.optimc,
sif=op.sif,
zscore=op.zscore,
l2=op.l2,
wescaler=op.feat_weight,
pca=op.max_labels_S,
id=_id,
dataset=dataset_id,
time='todo',
lang=lang,
macrof1=macrof1,
microf1=microf1,
macrok=macrok,
microk=microk,
notes='')
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))

View File

@ -27,7 +27,7 @@ class RNNMultilingualClassifier(nn.Module):
self.n_layers = 1
self.n_directions = 1
self.dropout = nn.Dropout(0.2)
self.dropout = nn.Dropout(0.6)
lstm_out = 256
ff1 = 512
@ -45,7 +45,7 @@ class RNNMultilingualClassifier(nn.Module):
llearnable_embeddings[l] = learnable_embeddings
self.embedding_length = embedding_length
# self.rnn = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2))
# self.lstm = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2))
self.rnn = nn.GRU(self.embedding_length, hidden_size)
self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out)
self.lpretrained_embeddings.update(lpretrained_embeddings)

355
src/new_mbert.py Normal file
View File

@ -0,0 +1,355 @@
"""
Test with smaller subset of languages.
1. Load doc (RCV1/2)
2. Tokenize texts via bertTokenizer (I should already have these dumps)
3. Construct better Dataloader/Datasets. NB: I need to keep track of the languages only for
the testing phase (but who cares actually? If I have to do it for the testing phase, I think
it is better to deploy it also in the training phase...)
4. ...
5. I have to understand if the pooled hidden state of the last layer is way worse than its averaged
version (However, in BertForSeqClassification I guess that the pooled version is passed through
the output linear layer in order to get the prediction scores?)
6. At the same time, I have to build also an end-to-end model in order to fine-tune it. The previous step
would be useful when deploying mBert as a View Generator. (Refactor gFun code with view generators?)
7. ...
8. Profits
"""
from dataset_builder import MultilingualDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch
from util.common import clip_gradient, predict
from time import time
from util.csv_log import CSVLog
from util.evaluation import evaluate
from util.early_stop import EarlyStopping
from torch.optim.lr_scheduler import StepLR
from sklearn.model_selection import train_test_split
import argparse
def get_model(n_out):
print('# Initializing model ...')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=n_out)
return model
def set_method_name():
return 'mBERT'
def init_optimizer(model, lr):
# return AdamW(model.parameters(), lr=lr, weight_decay=opt.weight_decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in model.named_parameters()
if not any(nd in n for nd in no_decay)],
'weight_decay': opt.weight_decay},
{'params': [p for n, p in model.named_parameters()
if any(nd in n for nd in no_decay)],
'weight_decay': 0.0}  # parameters matching no_decay are excluded from weight decay
]
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
return optimizer
def init_logfile(method_name, opt):
logfile = CSVLog(opt.log_file, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse'])
logfile.set_default('dataset', opt.dataset)
logfile.set_default('run', opt.seed)
logfile.set_default('method', method_name)
assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} and run {opt.seed} already calculated'
return logfile
def get_lr(optimizer):
for param_group in optimizer.param_groups:
return param_group['lr']
def get_dataset_name(datapath):
possible_splits = [str(i) for i in range(10)]
splitted = datapath.split('_')
id_split = splitted[-1].split('.')[0][-1]
if id_split in possible_splits:
dataset_name = splitted[0].split('/')[-1]
return f'{dataset_name}_run{id_split}'
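# e.g. '/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle' -> 'rcv1-2_run0'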
def load_datasets(datapath):
data = MultilingualDataset.load(datapath)
data.set_view(languages=['nl'])  # Testing with just one language
data.show_dimensions()
l_devel_raw, l_devel_target = data.training(target_as_csr=False)
l_test_raw, l_test_target = data.test(target_as_csr=False)
return l_devel_raw, l_devel_target, l_test_raw, l_test_target
def do_tokenization(l_dataset, max_len=512):
print('# Starting Tokenization ...')
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
langs = l_dataset.keys()
l_tokenized = {}
for lang in langs:
l_tokenized[lang] = tokenizer(l_dataset[lang],
truncation=True,
max_length=max_len,
add_special_tokens=True,
padding='max_length')
return l_tokenized
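# For each language, l_tokenized[lang] is a transformers BatchEncoding exposing (at least) 'input_ids',
# 'token_type_ids' and 'attention_mask'; only 'input_ids' is used downstream in TrainingDataset, so note
# that no attention_mask reaches the model and padded positions are attended to.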
class TrainingDataset(Dataset):
"""
data: dict of lang specific tokenized data
labels: dict of lang specific targets
"""
def __init__(self, data, labels):
self.langs = data.keys()
self.lang_ids = {lang:identifier for identifier, lang in enumerate(self.langs)}
for i, lang in enumerate(self.langs):
# print(lang)
_data = data[lang]['input_ids']
_data = np.array(_data)
_labels = labels[lang]
_lang_value = np.full(len(_data), self.lang_ids[lang])
if i == 0:
self.data = _data
self.labels = _labels
self.lang_index = _lang_value
else:
self.data = np.vstack((self.data, _data))
self.labels = np.vstack((self.labels, _labels))
self.lang_index = np.concatenate((self.lang_index, _lang_value))
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
x = self.data[idx]
y = self.labels[idx]
lang = self.lang_index[idx]
return x, torch.tensor(y, dtype=torch.float), lang
# return x, y, lang
def get_lang_ids(self):
return self.lang_ids
def freeze_encoder(model):
for param in model.base_model.parameters():
param.requires_grad = False
return model
def check_param_grad_status(model):
print('#'*50)
print('Model parameter status')
for name, child in model.named_children():
trainable = False
for param in child.parameters():
if param.requires_grad:
trainable = True
if not trainable:
print(f'{name} is frozen')
else:
print(f'{name} is not frozen')
print('#'*50)
def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile):
_dataset_path = opt.dataset.split('/')[-1].split('_')
# dataset_id = 'RCV1/2_run0_newBert'
dataset_id = _dataset_path[0] + _dataset_path[-1]
loss_history = []
model.train()
for idx, (batch, target, lang_idx) in enumerate(train_dataloader):
optim.zero_grad()  # reset gradients at each step (otherwise they would accumulate across batches)
out = model(batch.cuda())
loss = criterion(out[0], target.cuda())
loss.backward()
clip_gradient(model)
optim.step()
loss_history.append(loss.item())
if idx % opt.log_interval == 0:
interval_loss = np.mean(loss_history[-opt.log_interval:])
print(
f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
mean_loss = np.mean(interval_loss)
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
return mean_loss
def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, measure_prefix):
print('# Validating model ...')
loss_history = []
model.eval()
langs = lang_ids.keys()
id_2_lang = {v:k for k,v in lang_ids.items()}
predictions = {l: [] for l in langs}
yte_stacked = {l: [] for l in langs}
for batch, target, lang_idx in test_dataloader:
out = model(batch.cuda())
logits = out[0]
loss = criterion(logits, target.cuda()).item()
prediction = predict(logits)
loss_history.append(loss)
# Assigning predictions to the language-specific entries of predictions and yte_stacked according to lang_idx
for i, pred in enumerate(prediction):
lang_pred = id_2_lang[lang_idx.numpy()[i]]
predictions[lang_pred].append(pred)
yte_stacked[lang_pred].append(target[i].detach().cpu().numpy())
ly = {l: np.vstack(yte_stacked[l]) for l in langs}
ly_ = {l: np.vstack(predictions[l]) for l in langs}
l_eval = evaluate(ly, ly_)
metrics = []
for lang in langs:
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
if measure_prefix == 'te':
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
mean_loss = np.mean(loss_history)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit)
return Mf1
def get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop, max_val, seed):
# NB: use independent per-language copies; aliasing l_tokenized_tr for both splits would make the second
# tuple assignment below overwrite the training 'input_ids' with the validation ones
l_split_va = {l: dict(l_tokenized_tr[l]) for l in l_tokenized_tr.keys()}
l_split_val_target = {l: [] for l in l_tokenized_tr.keys()}
l_split_tr = {l: dict(l_tokenized_tr[l]) for l in l_tokenized_tr.keys()}
l_split_tr_target = {l: [] for l in l_tokenized_tr.keys()}
for lang in l_tokenized_tr.keys():
val_size = int(min(len(l_tokenized_tr[lang]['input_ids']) * val_prop, max_val))
l_split_tr[lang]['input_ids'], l_split_va[lang]['input_ids'], l_split_tr_target[lang], l_split_val_target[lang] = \
train_test_split(l_tokenized_tr[lang]['input_ids'], l_devel_target[lang], test_size=val_size, random_state=seed, shuffle=True)
return l_split_tr, l_split_tr_target, l_split_va, l_split_val_target
def main():
print('Running main ...')
DATAPATH = opt.dataset
method_name = set_method_name()
logfile = init_logfile(method_name, opt)
l_devel_raw, l_devel_target, l_test_raw, l_test_target = load_datasets(DATAPATH)
l_tokenized_tr = do_tokenization(l_devel_raw, max_len=512)
l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop=0.2, max_val=2000, seed=opt.seed)
l_tokenized_te = do_tokenization(l_test_raw, max_len=512)
tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target)
va_dataset = TrainingDataset(l_split_va, l_split_val_target)
te_dataset = TrainingDataset(l_tokenized_te, l_test_target)
tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True)
va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=False)
te_dataloader = DataLoader(te_dataset, batch_size=2, shuffle=False)
# Initializing model
model = get_model(73)
model = model.cuda()
criterion = torch.nn.BCEWithLogitsLoss().cuda()
optim = init_optimizer(model, lr=opt.lr)
# lr_scheduler = StepLR(optim, step_size=25, gamma=0.5)
early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience,
checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_dataset_name(opt.dataset)}')
# lr_scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optim, num_warmup_steps= , num_training_steps=)
# print(model)
# Freezing encoder
# model = freeze_encoder(model)
check_param_grad_status(model)
# Training loop
tinit = time()
lang_ids = va_dataset.lang_ids
for epoch in range(1, opt.nepochs+1):
print('# Start Training ...')
train(model, tr_dataloader, epoch, criterion, optim, 'TestingBert', tinit, logfile)
# lr_scheduler.step(epoch=None) # reduces the learning rate
# validation
macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va')
early_stop(macrof1, epoch)
if opt.test_each>0:
if (opt.plotmode and (epoch==1 or epoch%opt.test_each==0)) or (not opt.plotmode and epoch%opt.test_each==0 and epoch<opt.nepochs):
test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te')
if early_stop.STOP:
print('[early-stop] STOP')
if not opt.plotmode:
break
if opt.plotmode==False:
print('-' * 80)
print('Training over. Performing final evaluation')
model = early_stop.restore_checkpoint()
if opt.val_epochs>0:
print(f'running last {opt.val_epochs} training epochs on the validation set')
for val_epoch in range(1, opt.val_epochs + 1):
train(model, va_dataloader, epoch+val_epoch, criterion, optim, 'TestingBert', tinit, logfile)
# final test
print('Training complete: testing')
test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te')
exit('Code Executed!')
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Neural text classification with Word-Class Embeddings - mBert model')
parser.add_argument('--dataset', type=str, default='/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle',
metavar='datasetpath', help=f'path to the pickled dataset')
parser.add_argument('--nepochs', type=int, default=200, metavar='int',
help='number of epochs (default: 200)')
parser.add_argument('--lr', type=float, default=2e-5, metavar='float',
help='learning rate (default: 2e-5)')
parser.add_argument('--weight_decay', type=float, default=0, metavar='float',
help='weight decay (default: 0)')
parser.add_argument('--patience', type=int, default=10, metavar='int',
help='patience for early-stop (default: 10)')
parser.add_argument('--log-interval', type=int, default=20, metavar='int',
help='how many batches to wait before printing training status')
parser.add_argument('--log-file', type=str, default='../log/log_mBert.csv', metavar='str',
help='path to the log csv file')
parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)')
parser.add_argument('--force', action='store_true', default=False,
help='do not check if this experiment has already been run')
parser.add_argument('--checkpoint-dir', type=str, default='../checkpoint', metavar='str',
help='path to the directory containing checkpoints')
parser.add_argument('--plotmode', action='store_true', default=False,
help='in plot mode executes a long run in order '
'to generate enough data to produce trend plots (test-each should be >0. This mode is '
'used to produce plots, and does not perform an evaluation on the test set.')
parser.add_argument('--test-each', type=int, default=0, metavar='int',
help='how many epochs to wait before invoking test (default: 0, only at the end)')
parser.add_argument('--val-epochs', type=int, default=1, metavar='int',
help='number of training epochs to perform on the validation set once training is over (default 1)')
opt = parser.parse_args()
# Testing different parameters ...
opt.weight_decay = 0.01
opt.patience = 5
main()
# TODO: refactor .cuda() -> .to(device) in order to check if the process is faster on CPU given the bigger batch size

View File

@ -1,7 +1,11 @@
import pandas as pd
import numpy as np
df = pd.read_csv("/home/andreapdr/funneling_pdr/src/results/results.csv", delimiter='\t')
pivot = pd.pivot_table(df, values=['time', 'macrof1', 'microf1', 'macrok', 'microk'], index=['method', 'embed'], aggfunc=[np.mean, np.std])
print(pivot)
print('Finished ...')
# df = pd.read_csv("/home/andreapdr/funneling_pdr/src/results/final_results.csv", delimiter='\t')
df = pd.read_csv("10run_rcv_final_results.csv", delimiter='\t')
pivot = pd.pivot_table(df, values=['macrof1', 'microf1', 'macrok', 'microk'], index=['method', 'id', 'optimp', 'zscore', 'l2', 'wescaler', 'pca', 'sif'], aggfunc=[np.mean, np.std])
with pd.option_context('display.max_rows', None):
print(pivot.round(3))
print('Finished ...')

11
src/run_mbert_rcv.sh Normal file
View File

@ -0,0 +1,11 @@
#!/usr/bin/env bash
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
logfile=../log/log_Mbert_rcv.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
python new_mbert.py --dataset $dataset --log-file $logfile --test-each 20
done

View File

@ -17,7 +17,7 @@ def get_weighted_average(We, x, w):
def compute_pc(X,npc=1):
"""
Compute the principal components. DO NOT MAKE THE DATA ZERO MEAN!
Compute the principal components.
:param X: X[i,:] is a data point
:param npc: number of principal components to remove
:return: component_[i,:] is the i-th pc

View File

@ -1,4 +1,5 @@
import warnings
import time
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
@ -143,6 +144,15 @@ class Index:
embedding_parts.append(F)
make_dumps = False
if make_dumps:
print(f'Dumping Embedding Matrices ...')
import pickle
with open(f'../dumps/dump_{self.lang}_rcv.pkl', 'wb') as outfile:
pickle.dump((self.lang, embedding_parts, self.word2index), outfile)
with open(f'../dumps/corpus_{self.lang}_rcv.pkl', 'wb') as outfile2:
pickle.dump((self.lang, self.devel_raw, self.devel_target), outfile2)
self.embedding_matrix = torch.cat(embedding_parts, dim=1)
print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]')
@ -155,6 +165,7 @@ class MultilingualIndex:
def __init__(self): #, add_language_trace=False):
self.l_index = {}
self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
# self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True, max_features=25000)
# self.add_language_trace=add_language_trace
def index(self, l_devel_raw, l_devel_target, l_test_raw, l_pretrained_vocabulary):
@ -189,30 +200,42 @@ class MultilingualIndex:
# pretrained_embeddings = torch.cat([pretrained_embeddings, lang_trace], dim=1)
def posterior_probabilities(self, max_training_docs_by_lang=5000):
def posterior_probabilities(self, max_training_docs_by_lang=5000, store_posteriors=False, stored_post=False):
# choose a maximum of "max_training_docs_by_lang" for training the calibrated SVMs
timeit = time.time()
lXtr = {l:Xtr for l,Xtr in self.get_lXtr().items()}
lYtr = {l:Ytr for l,Ytr in self.l_train_target().items()}
for l in self.langs:
n_elements = lXtr[l].shape[0]
if n_elements > max_training_docs_by_lang:
choice = np.random.permutation(n_elements)[:max_training_docs_by_lang]
lXtr[l] = lXtr[l][choice]
lYtr[l] = lYtr[l][choice]
if not stored_post:
for l in self.langs:
n_elements = lXtr[l].shape[0]
if n_elements > max_training_docs_by_lang:
choice = np.random.permutation(n_elements)[:max_training_docs_by_lang]
lXtr[l] = lXtr[l][choice]
lYtr[l] = lYtr[l][choice]
# train the posterior probabilities embedder
print('[posteriors] training a calibrated SVM')
learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto')
prob_embedder = PosteriorProbabilitiesEmbedder(learner, l2=False)
prob_embedder.fit(lXtr, lYtr)
# train the posterior probabilities embedder
print('[posteriors] training a calibrated SVM')
learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto')
prob_embedder = PosteriorProbabilitiesEmbedder(learner, l2=False)
prob_embedder.fit(lXtr, lYtr)
# transforms the training, validation, and test sets into posterior probabilities
print('[posteriors] generating posterior probabilities')
lPtr = prob_embedder.transform(self.get_lXtr())
lPva = prob_embedder.transform(self.get_lXva())
lPte = prob_embedder.transform(self.get_lXte())
print('[posteriors] done')
# transforms the training, validation, and test sets into posterior probabilities
print('[posteriors] generating posterior probabilities')
lPtr = prob_embedder.transform(self.get_lXtr())
lPva = prob_embedder.transform(self.get_lXva())
lPte = prob_embedder.transform(self.get_lXte())
# NB: Check splits indices !
if store_posteriors:
import pickle
with open('../dumps/posteriors_fulljrc.pkl', 'wb') as outfile:
pickle.dump([lPtr, lPva, lPte], outfile)
print(f'Successfully dumped posteriors!')
else:
import pickle
with open('../dumps/posteriors_fulljrc.pkl', 'rb') as infile:
lPtr, lPva, lPte = pickle.load(infile)
print(f'Successfully loaded stored posteriors!')
print(f'[posteriors] done in {time.time() - timeit}')
return lPtr, lPva, lPte
def get_lXtr(self):

View File

@ -6,7 +6,7 @@ from util.file import create_if_not_exist
class EarlyStopping:
def __init__(self, model, patience=20, verbose=True, checkpoint='./checkpoint.pt'):
def __init__(self, model, optimizer, patience=20, verbose=True, checkpoint='./checkpoint.pt'):
# set patience to 0 or -1 to avoid stopping, but still keeping track of the best value and model parameters
self.patience_limit = patience
self.patience = patience
@ -16,9 +16,10 @@ class EarlyStopping:
self.stop_time = None
self.checkpoint = checkpoint
self.model = model
self.optimizer = optimizer
self.STOP = False
def __call__(self, watch_score, epoch):
def __call__(self, watch_score, epoch): #model
if self.STOP: return #done
@ -29,6 +30,9 @@ class EarlyStopping:
if self.checkpoint:
self.print(f'[early-stop] improved, saving model in {self.checkpoint}')
torch.save(self.model, self.checkpoint)
# with open(self.checkpoint)
# torch.save({'state_dict': self.model.state_dict(),
# 'optimizer_state_dict': self.optimizer.state_dict()}, self.checkpoint)
else:
self.print(f'[early-stop] improved')
self.patience = self.patience_limit
@ -46,6 +50,7 @@ class EarlyStopping:
self.patience=self.patience_limit
def restore_checkpoint(self):
print(f'restoring best model from epoch {self.best_epoch}...')
return torch.load(self.checkpoint)
def print(self, msg):

View File

@ -5,8 +5,23 @@ import numpy as np
class PolylingualClassificationResults:
def __init__(self, file, autoflush=True, verbose=False):
self.file = file
self.columns = ['method', 'learner', 'embed', 'embed_type', 'pca_s', 'pca_u', 'optimp', 'dataset', 'time',
'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
self.columns = ['method',
'learner',
'optimp',
'sif',
'zscore',
'l2',
'wescaler',
'pca',
'id',
'dataset',
'time',
'lang',
'macrof1',
'microf1',
'macrok',
'microk',
'notes']
self.autoflush = autoflush
self.verbose = verbose
if os.path.exists(file):
@ -21,8 +36,8 @@ class PolylingualClassificationResults:
def already_calculated(self, id):
return (self.df['id'] == id).any()
def add_row(self, method, learner, embed, embed_type, pca_s, pca_u, optimp, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
s = pd.Series([method, learner, embed, embed_type, pca_s, pca_u, optimp, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
def add_row(self, method, learner, optimp, sif, zscore, l2, wescaler, pca, id, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
s = pd.Series([method, learner, optimp, sif, zscore, l2, wescaler, pca, id, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
self.df = self.df.append(s, ignore_index=True)
if self.autoflush: self.flush()
self.tell(s.to_string())