implemented method to compute WCE only for well represented classes;

refactored MLE class in order to support WCE, standard embeddings and combinations; sketched out NN implementation for WE compositionality; still TODO SIF embeddings;
2020-01-07 17:05:41 +01:00 · 2020-01-07 17:05:41 +01:00 · 53198a7e2c
parent 0e66fbf197
commit 53198a7e2c
10 changed files with 498 additions and 37 deletions
--- a/src/FPEC_andrea.py
+++ b/src/FPEC_andrea.py
@ -6,7 +6,7 @@ from optparse import OptionParser
 from util.file import exists
 from util.results import PolylingualClassificationResults
 from sklearn.svm import SVC
-
+from util.util import get_learner, get_params
 parser = OptionParser()
@ -115,7 +115,7 @@ if __name__ == '__main__':
        config = {'unsupervised': True,
                  'supervised': True,
                  'we_type': op.we_type}
-        _config_id = 'M_and_F'
+        _config_id = 'M+F'
    config['reduction'] = 'PCA'
    config['max_label_space'] = op.max_labels_S
@ -125,32 +125,6 @@ if __name__ == '__main__':
    result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
    PLE_test = True
    if PLE_test:
        ple = PolylingualEmbeddingsClassifier(wordembeddings_path='/home/andreapdr/CLESA/',
                                              config = config,
                                              learner=get_learner(calibrate=False),
                                              c_parameters=get_params(dense=False),
                                              n_jobs=op.n_jobs)
        print('# Fitting ...')
        ple.fit(lXtr, lytr)
        print('# Evaluating ...')
        ple_eval = evaluate_method(ple, lXte, lyte)
        metrics = []
        for lang in lXte.keys():
            macrof1, microf1, macrok, microk = ple_eval[lang]
            metrics.append([macrof1, microf1, macrok, microk])
            print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
            results.add_row('MLE', 'svm', 'no', config['we_type'],
                            'no','no', op.optimc, op.dataset.split('/')[-1], ple.time,
                            lang, macrof1, microf1, macrok, microk, '')
        print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
        exit()
    print(f'### PolyEmbedd_andrea_{_config_id}\n')
    classifier = AndreaCLF(we_path=op.we_path,
                           config=config,
@ -174,5 +148,5 @@ if __name__ == '__main__':
        results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
                        (config['max_label_space'], classifier.best_components),
                        config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
-                        lang, macrof1, microf1, macrok, microk, 'min_prevalence = 0')
+                        lang, macrof1, microf1, macrok, microk, '')
    print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
--- a/src/MLE_andrea.py
+++ b/src/MLE_andrea.py
@ -0,0 +1,128 @@
 import os
 from dataset_builder import MultilingualDataset
 from learning.learners import *
 from util.evaluation import *
 from optparse import OptionParser
 from util.file import exists
 from util.results import PolylingualClassificationResults
 from util.util import get_learner, get_params
 # Command-line interface of the MLE experiment script. The parsed values are read back
 # through `op` in the __main__ block below.
 parser = OptionParser()
 parser.add_option("-d", "--dataset", dest="dataset",
                   help="Path to the multilingual dataset processed and stored in .pickle format",
                   default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
 # NOTE(review): op.output is parsed, but the main block writes to a fixed results path instead
 parser.add_option("-o", "--output", dest="output",
                   help="Result file", type=str,  default='./results/results.csv')
 # one of: none | unsupervised | supervised | both (validated with an assert in the main block)
 parser.add_option("-e", "--mode-embed", dest="mode_embed",
                   help="Set the embedding to be used [none, unsupervised, supervised, both]", type=str, default='none')
 parser.add_option("-w", "--we-path", dest="we_path",
                   help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/')
 parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str,
                   default='MUSE')
 # -s and -c are mutually exclusive: the main block asserts that C is left at 1 when --optimc is given
 parser.add_option("-s", "--set_c", dest="set_c",type=float,
                   help="Set the C parameter", default=1)
 parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
                   help="Optimize hyperparameters", default=False)
 parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,
                   help="Number of parallel jobs (default is -1, all)", default=-1)
 # stored in config['max_label_space'] by the main block
 parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
                   help="If smaller than number of target classes, PCA will be applied to supervised matrix. "
                        "If set to 0 it will automatically search for the best number of components. "
                        "If set to -1 it will apply PCA to the vstacked supervised matrix (PCA dim set to 50 atm)",
                   default=300)
 # stored in config['dim_reduction_unsupervised'] by the main block
 parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
                   help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix."
                        " If set to 0 it will automatically search for the best number of components", default=300)
 # only referenced by a commented-out data.set_view(...) call in the main block
 parser.add_option("-l", dest="lang", type=str)
 if __name__ == '__main__':
    # Script entry point: load the dataset, build the embedding configuration selected on the command line,
    # fit a PolylingualEmbeddingsClassifier and append one row of test metrics per language to the results file.
    (op, args) = parser.parse_args()
    assert exists(op.dataset), 'Unable to find file '+str(op.dataset)
    assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
    dataset_file = os.path.basename(op.dataset)
    # NOTE(review): results always go to this fixed path; the --output option is parsed but not used here
    results = PolylingualClassificationResults('./results/PLE_results.csv')
    data = MultilingualDataset.load(op.dataset)
    data.show_dimensions()
    # data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10)))
    # data.set_view(languages=[op.lang])
    # data.set_view(categories=list(range(10)))
    lXtr, lytr = data.training()
    lXte, lyte = data.test()
    # NOTE(review): meta_parameters is computed but never read afterwards in this script
    if op.set_c != -1:
        meta_parameters = None
    else:
        meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}]
    # Embeddings and WCE config
    _available_mode = ['none', 'unsupervised', 'supervised', 'both']
    _available_type = ['MUSE', 'FastText']
    assert op.mode_embed in _available_mode, f'{op.mode_embed} not in {_available_mode}'
    assert op.we_type in _available_type, f'{op.we_type} not in {_available_type}'
    # _config_id is the short tag written to the results file for the selected embedding mode
    if op.mode_embed == 'none':
        config = {'unsupervised': False,
                  'supervised': False,
                  'we_type': None}
        _config_id = 'None'
    elif op.mode_embed == 'unsupervised':
        config = {'unsupervised': True,
                  'supervised': False,
                  'we_type': op.we_type}
        _config_id = 'M'
    elif op.mode_embed == 'supervised':
        config = {'unsupervised': False,
                  'supervised': True,
                  'we_type': None}
        _config_id = 'F'
    elif op.mode_embed == 'both':
        config = {'unsupervised': True,
                  'supervised': True,
                  'we_type': op.we_type}
        _config_id = 'M+F'
    config['reduction'] = 'PCA'
    config['max_label_space'] = op.max_labels_S
    config['dim_reduction_unsupervised'] = op.max_labels_U
    # config['post_pca'] = op.post_pca
    # config['plot_covariance_matrices'] = True
    result_id = dataset_file + 'MLE_andrea' + _config_id + ('_optimC' if op.optimc else '')
    # honour the --we-path option (its default equals the previously hard-coded path)
    ple = PolylingualEmbeddingsClassifier(wordembeddings_path=op.we_path,
                                          config = config,
                                          learner=get_learner(calibrate=False),
                                          c_parameters=get_params(dense=False),
                                          n_jobs=op.n_jobs)
    print('# Fitting ...')
    ple.fit(lXtr, lytr)
    print('# Evaluating ...')
    ple_eval = evaluate_method(ple, lXte, lyte)
    metrics = []
    for lang in lXte.keys():
        macrof1, microf1, macrok, microk = ple_eval[lang]
        metrics.append([macrof1, microf1, macrok, microk])
        print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
        results.add_row('MLE', 'svm', _config_id, config['we_type'],
                        'no','no', op.optimc, op.dataset.split('/')[-1], ple.time,
                        lang, macrof1, microf1, macrok, microk, '')
    # column-wise mean over languages, in the order macro-F1, micro-F1, macro-K, micro-K
    print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
--- a/src/NN_FPEC_andrea.py
+++ b/src/NN_FPEC_andrea.py
@ -0,0 +1,92 @@
 from optparse import OptionParser
 from util.results import PolylingualClassificationResults
 from dataset_builder import MultilingualDataset
 from keras.preprocessing.text import Tokenizer
 from learning.learners import MonolingualNetSvm
 from sklearn.svm import SVC
 import pickle
 # Command-line options of the NN_FPEC experiment script.
 parser = OptionParser()
 parser.add_option("-d", "--dataset", dest="dataset",
                   help="Path to the multilingual dataset processed and stored in .pickle format",
                   default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
 parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
                   help="Optimize hyperparameters", default=False)
 parser.add_option("-s", "--set_c", dest="set_c",type=float,
                   help="Set the C parameter", default=1)
 # Parsed at module level (not under the __main__ guard): get_learner/get_params below read `op` as a global.
 (op, args) = parser.parse_args()
 ###################################################################################################################
 def get_learner(calibrate=False, kernel='linear'):
    """
    Build the SVM used as base learner.
    :param calibrate: forwarded to SVC as `probability`
    :param kernel: SVC kernel name
    :return: an unfitted SVC whose C is taken from the command-line option `op.set_c`
    """
    svm_settings = dict(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c,
                        random_state=1, class_weight='balanced', gamma='auto')
    return SVC(**svm_settings)
 def get_params(dense=False):
    """
    Build the hyper-parameter grid for model selection.
    :param dense: if True the grid uses the 'rbf' kernel, otherwise 'linear'
    :return: a one-element list holding the grid, or None when the command-line flag `op.optimc` is not set
    """
    if op.optimc:
        chosen_kernel = 'rbf' if dense else 'linear'
        return [{'kernel': [chosen_kernel], 'C': [1e4, 1e3, 1e2, 1e1, 1, 1e-1], 'gamma':['auto']}]
    return None
 # PREPROCESS TEXT AND SAVE IT ... both for SVM and NN
 def preprocess_data(lXtr, lXte, lytr, lyte):
    """
    Tokenize the training and test documents of every language and pickle the result to disk.
    One Tokenizer per language is fit on the training texts only and then applied to both splits.
    :param lXtr: dict lang -> training documents
    :param lXte: dict lang -> test documents
    :param lytr: dict lang -> training targets
    :param lyte: dict lang -> test targets
    :return: None; writes dict[lang] = (word_index, sequences, targets) to the train and test pickle files
    """
    tokenized_tr = dict()
    tokenized_te = dict()
    for lang in lXtr.keys():
        alltexts = ' '.join(lXtr[lang])
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(alltexts.split(' '))
        # NOTE(review): oov_token is set to an integer after fitting; confirm that the installed Keras version
        # maps out-of-vocabulary words to this index as intended
        tokenizer.oov_token = len(tokenizer.word_index)+1
        # dumping train set
        sequences_tr = tokenizer.texts_to_sequences(lXtr[lang])
        tokenized_tr[lang] = (tokenizer.word_index, sequences_tr, lytr[lang])
        # dumping test set
        sequences_te = tokenizer.texts_to_sequences(lXte[lang])
        tokenized_te[lang] = (tokenizer.word_index, sequences_te, lyte[lang])
    with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'wb') as f:
        pickle.dump(tokenized_tr, f)
    with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_test.pickle', 'wb') as f:
        # the test file must hold the test split (the training dict used to be dumped here as well)
        pickle.dump(tokenized_te, f)
    print('Successfully dumped data')
 # def load_preprocessed():
 #     with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'rb') as f:
 #         return pickle.load(f)
 #
 # def build_embedding_matrix(lang, word_index):
 #     type = 'MUSE'
 #     path = '/home/andreapdr/CLESA/'
 #     MUSE = EmbeddingsAligned(type, path, lang, word_index.keys())
 #     return MUSE
 ########## MAIN #################################################################################################
 if __name__ == '__main__':
    # Entry point: load the dataset and run the experimental SVM + NN architecture on the training split.
    results = PolylingualClassificationResults('./results/NN_FPEC_results.csv')
    data = MultilingualDataset.load(op.dataset)
    lXtr, lytr = data.training()
    lXte, lyte = data.test()
    # grid over C only when set_c is -1; the value is not used further down in this script
    meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}] if op.set_c == -1 else None
    # calibrated SVMs as first tier, no model selection, single job
    test_architecture = MonolingualNetSvm(
        lXtr, lytr, first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None, n_jobs=1)
    test_architecture.fit()
--- a/src/learning/embeddings.py
+++ b/src/learning/embeddings.py
@ -3,8 +3,9 @@ import pickle
 from torchtext.vocab import Vectors
 import torch
 from abc import ABC, abstractmethod
-from data.supervised import get_supervised_embeddings
+from learning.supervised import get_supervised_embeddings
 from util.decompositions import *
 from util.SIF_embed import *
 class PretrainedEmbeddings(ABC):
@ -233,7 +234,6 @@ class StorageEmbeddings:
            print(f'# REDUCING LABELS TO min_prevalence = {min_prevalence} in order to compute WCE Matrix ...')
            langs = list(docs.keys())
            well_repr_cats = np.logical_and.reduce([labels[lang].sum(axis=0)>min_prevalence for lang in langs])
            # lY = {lY[lang][:, well_repr_cats] for lang in langs}  TODO not clear
            for lang in langs:
                labels[lang] = labels[lang][:, well_repr_cats]
            print(f'Target number reduced to: {labels[lang].shape[1]}\n')
@ -245,15 +245,15 @@ class StorageEmbeddings:
            nC = self.lang_S[lang].shape[1]
            print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')
-        if max_label_space == 0:
+        if max_label_space == 0:    # looking for best n_components analyzing explained_variance_ratio
            print(f'Computing optimal number of PCA components along matrices S')
            optimal_n = get_optimal_dim(self.lang_S, 'S')
            print(f'Applying PCA(n_components={optimal_n})')
            self.lang_S = run_pca(optimal_n, self.lang_S)
-        elif max_label_space == -1:
+        elif max_label_space == -1: # applying pca to the verticals stacked matrix of WCE embeddings
            print(f'Computing PCA on vertical stacked WCE embeddings')
            languages = self.lang_S.keys()
-            _temp_stack = np.vstack([self.lang_S[lang] for lang in languages])
+            _temp_stack = np.vstack([self.lang_S[lang] for lang in languages])  # stacking WCE vertically
            stacked_pca = PCA(n_components=_temp_stack.shape[1])
            stacked_pca.fit(_temp_stack)
            best_n = None
@ -271,12 +271,15 @@ class StorageEmbeddings:
            print(f'Applying PCA(n_components={i}')
            for lang in languages:
                self.lang_S[lang] = stacked_pca.transform(self.lang_S[lang])
-        elif max_label_space <= nC: # also equal in order to reduce it to the same initial dimension
+        elif max_label_space <= nC: # less or equal in order to reduce it to the same initial dimension
            print(f'Computing PCA on Supervised Matrix PCA(n_components:{max_label_space})')
            self.lang_S = run_pca(max_label_space, self.lang_S)
        return
    def SIF_embeddings(self):
        print('todo') # TODO
    def _concatenate_embeddings(self, docs):
        _r = dict()
        for lang in self.lang_U.keys():
@ -293,6 +296,9 @@ class StorageEmbeddings:
    def predict(self, config, docs):
        if config['supervised'] and config['unsupervised']:
            return self._concatenate_embeddings(docs)
            # todo testing applying pca to hstack muse + wce
            # _reduced = self._concatenate_embeddings(docs)
            # return run_pca(300, _reduced)
        elif config['supervised']:
            _r = dict()
            for lang in docs.keys():
@ -301,4 +307,5 @@ class StorageEmbeddings:
            _r = dict()
            for lang in docs.keys():
                _r[lang] = docs[lang].dot(self.lang_U[lang])
        return _r
--- a/src/learning/learners.py
+++ b/src/learning/learners.py
@ -1,6 +1,6 @@
 import numpy as np
 import time
-from data.embeddings import WordEmbeddings, StorageEmbeddings
+from learning.embeddings import WordEmbeddings, StorageEmbeddings
 from scipy.sparse import issparse
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.model_selection import GridSearchCV
@ -9,6 +9,7 @@ from joblib import Parallel, delayed
 from sklearn.feature_extraction.text import TfidfVectorizer
 from transformers.StandardizeTransformer import StandardizeTransformer
 from sklearn.decomposition import PCA
 from models.cnn_class import CNN_pdr
 def _sort_if_sparse(X):
@ -581,3 +582,151 @@ class PolylingualEmbeddingsClassifier:
    def best_params(self):
        return self.model.best_params()
 class MonolingualNetSvm:
    """
    Experimental funnelling variant in which a neural network manages word-embedding compositionality.
    An ensemble of n SVMs (n being the number of training languages) is first fit on the data, generating the
    projection of the documents in the Z-space. Next, the projections are fed to a single NN together with their
    respective document embeddings. The documents are projected into the embedding space while preserving their
    dimensionality (output dim is 300). These projections are horizontally concatenated with the respective SVM
    projection and passed through a fully-connected layer with sigmoid activation and output dim equal to the
    number of target classes.
    # TODO ATM testing with only 1 language
    """
    def __init__(self, lX, ly, first_tier_learner, first_tier_parameters, n_jobs):
        """
        :param lX: dict lang -> raw training documents (vectorized in place by `svm_vectorize`)
        :param ly: dict lang -> training targets
        :param first_tier_learner: learner handed to the NaivePolylingualClassifier document projector
        :param first_tier_parameters: model-selection parameters handed to the document projector
        :param n_jobs: number of parallel jobs for the document projector
        """
        self.lX = lX
        self.ly = ly
        # SVM Attributes
        self.doc_projector = NaivePolylingualClassifier(first_tier_learner, first_tier_parameters,
                                                        n_jobs=n_jobs)
        self.calmode = 'cal'
        self.languages = []
        self.lang_word2idx = dict()
        self.lang_tfidf = {}
        self.base_learner = 'TODO'
        self.parameters = 'TODO'
        # NN Attributes
        self.NN = 'TODO'
    def load_preprocessed(self):
        """
        In order to speed up the process, documents are first tokenized in the "main". Here the tokenized docs,
        word_index and targets are loaded from the training pickle.
        :return: dict[lang] = (word_index, tokenized_docs, targets)
        """
        import pickle
        with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'rb') as f:
            return pickle.load(f)
    def _build_embedding_matrix(self, lang, word_index):
        """
        Build the embedding matrix by filtering out OOV embeddings.
        :param lang: language code
        :param word_index: dict word -> index; only its keys are passed on as the vocabulary
        :return: the EmbeddingsAligned object built for the given vocabulary
        """
        from learning.embeddings import EmbeddingsAligned
        we_type = 'MUSE'  # renamed from `type` so the builtin is not shadowed
        path = '/home/andreapdr/CLESA/'
        MUSE = EmbeddingsAligned(we_type, path, lang, word_index.keys())
        return MUSE
    def get_data_and_embed(self, data_dict):
        """
        Build the NN training set and the per-language embedding objects.
        :param data_dict: dict[lang] = (word_index, tokenized_docs, targets), as returned by `load_preprocessed`
        :return: (dict lang -> sequences padded to length 100, dict lang -> [targets], dict lang -> embeddings)
        """
        from keras.preprocessing.sequence import pad_sequences
        langs = data_dict.keys()
        lang_embedding_matrix = dict()
        nn_lXtr = dict()
        nn_lytr = dict()
        for lang in langs:
            lang_embedding_matrix[lang] = self._build_embedding_matrix(lang, data_dict[lang][0])
            nn_lXtr[lang] = pad_sequences(data_dict[lang][1], 100, padding='post')
            nn_lytr[lang] = [data_dict[lang][2]]
        return  nn_lXtr, nn_lytr, lang_embedding_matrix
    def svm_vectorize(self, lX, prediction=False):
        """
        TF-IDF vectorize the documents of every language. The entries of `lX` are replaced in place.
        :param lX: dict lang -> raw documents
        :param prediction: if True, reuse the vectorizers stored at fit time instead of fitting new ones
        :return: the same dict `lX`, now mapping lang -> tf-idf matrix
        """
        langs = list(lX.keys())
        print(f'# tfidf-vectorizing docs')
        if prediction:
            for lang in langs:
                assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
                tfidf_vectorizer = self.lang_tfidf[lang]
                lX[lang] = tfidf_vectorizer.transform(lX[lang])
            # return the vectorized documents, consistently with the fit path (this used to return self)
            return lX
        for lang in langs:
            tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
            self.languages.append(lang)
            tfidf_vectorizer.fit(lX[lang])
            lX[lang] = tfidf_vectorizer.transform(lX[lang])
            self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
            self.lang_tfidf[lang] = tfidf_vectorizer
        return lX
    def _get_zspace(self, lXtr, lYtr):
        """
        Fit the document projector and project the training documents.
        :param lXtr: dict lang -> vectorized training documents
        :param lYtr: dict lang -> training targets
        :return: (dict lang -> projection, lYtr)
        """
        print('\nfitting the projectors... {}'.format(list(lXtr.keys())))
        self.doc_projector.fit(lXtr, lYtr)
        print('\nprojecting the documents')
        lZ = self._projection(self.doc_projector, lXtr)
        return lZ, lYtr
    def _projection(self, doc_projector, lX):
        """
        Decides the projection function to be applied; predict_proba if the base classifiers are calibrated or
        decision_function if otherwise
        :param doc_projector: the document projector (a NaivePolylingualClassifier)
        :param lX: {lang:matrix} to train
        :return: the projection, applied with predict_proba or decision_function
        """
        if self.calmode=='cal':
            return doc_projector.predict_proba(lX)
        else:
            l_decision_scores = doc_projector.decision_function(lX)
            if self.calmode=='sigmoid':
                def sigmoid(x): return 1 / (1 + np.exp(-x))
                for lang in l_decision_scores.keys():
                    l_decision_scores[lang] = sigmoid(l_decision_scores[lang])
            return l_decision_scores
    def fit(self):
        """
        # 1. Fit SVM to generate posterior probabilities:
        #   1.1 Gather documents and vectorize them as in other SVM classifiers
        # 2. Fit NN
        #   2.1 Gather documents and build NN dataset by indexing wrt embedding matrix
        #   2.2 Fit NN first-layer to generate compositional doc embedding
        #   2.3 H-stack doc-embed and posterior P
        #   2.4 Feed stacked vector to output layer (sigmoid act): output Nc
        #   2.5 Train it...
        """
        # load pre-processed data
        data_dict = self.load_preprocessed()
        # build embedding matrices and neural network document training set
        nn_lXtr, nn_lytr, lang_embedding_matrix = self.get_data_and_embed(data_dict)
        # TF-IDF vectorizing documents for SVM classifier
        svm_lX = self.svm_vectorize(self.lX)
        # just testing on a smaller subset of data (first 10 Italian documents)
        test_svm_lX = dict()
        test_svm_ly = dict()
        test_svm_lX['it'] = svm_lX['it'][:10, :]
        test_svm_ly['it'] = self.ly['it'][:10, :]
        test_nn_data = nn_lXtr['it'][:10]
        # projecting document into Z space by SVM
        svm_Z, _ = self._get_zspace(test_svm_lX, test_svm_ly)
        # initializing net and forward pass
        net = CNN_pdr(73, 1, 300, len(lang_embedding_matrix['it'].vectors), 300, lang_embedding_matrix['it'].vectors)
        out = net.forward(test_nn_data, svm_Z['it'])
        print('TODO')
    def net(self):
        """Placeholder, not implemented yet."""
        pass
--- a/src/learning/supervised.py
+++ b/src/learning/supervised.py
--- a/src/models/cnn_class.py
+++ b/src/models/cnn_class.py
@ -0,0 +1,42 @@
 import torch.nn as nn
 from torch.nn import functional as F
 import torch
 class CNN_pdr(nn.Module):
    """
    Convolutional document encoder whose pooled output is concatenated with the SVM projection of the same
    document and passed through a fully-connected layer with sigmoid activation.
    :param output_size: output dim of the (currently unused) `label` layer
    :param out_channels: channel multiplier for the input dim of the `label` layer
    :param compositional_dim: number of convolutional filters, i.e. dim of the document representation
    :param vocab_size: number of rows of the embedding layer
    :param emb_dim: dimensionality of the word embeddings
    :param embeddings: optional pre-trained embedding matrix used to initialise the embedding layer;
        when None the layer is randomly initialised
    :param drop_embedding_range: stored only, not used in the forward pass
    :param drop_embedding_prop: stored only, must lie in [0, 1]
    :param drop_prob: probability of the (currently unused) dropout layer
    """
    def __init__(self, output_size, out_channels, compositional_dim, vocab_size, emb_dim, embeddings=None, drop_embedding_range=None,
                 drop_embedding_prop=0, drop_prob=0.5):
        super(CNN_pdr, self).__init__()
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        if embeddings is None:
            # no pre-trained matrix given: fall back to a randomly initialised layer instead of crashing
            self.embeddings = None
            self.embedding_layer = nn.Embedding(vocab_size, emb_dim)
        else:
            self.embeddings = torch.FloatTensor(embeddings)
            self.embedding_layer = nn.Embedding(vocab_size, emb_dim, _weight=self.embeddings)
        kernel_heights = [3, 5, 7]
        self.kernel_heights = kernel_heights
        self.stride = 1
        self.padding = 0
        self.drop_embedding_range = drop_embedding_range
        self.drop_embedding_prop = drop_embedding_prop
        assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: wrong range'
        # NOTE(review): number of target classes is hard-coded and independent of output_size — confirm intent
        self.nC = 73
        # only the first kernel height is used; each filter spans the whole embedding dimension
        self.conv1 = nn.Conv2d(1, compositional_dim, (self.kernel_heights[0], self.emb_dim), self.stride, self.padding)
        self.dropout = nn.Dropout(drop_prob)
        self.label = nn.Linear(len(kernel_heights) * out_channels, output_size)
        self.fC = nn.Linear(compositional_dim + self.nC, self.nC)
    def forward(self, x, svm_output):
        """
        :param x: batch of word-index sequences
        :param svm_output: SVM projection of the same documents, one row of size nC per document
        :return: tensor with one row of nC sigmoid outputs per document
        """
        x = torch.LongTensor(x)
        svm_output = torch.FloatTensor(svm_output)
        x = self.embedding_layer(x)
        x = self.conv1(x.unsqueeze(1))
        x = F.relu(x.squeeze(3))
        # max-pool over the whole remaining sequence axis
        x = F.max_pool1d(x, x.size()[2]).squeeze(2)
        x = torch.cat((x, svm_output), 1)
        # torch.sigmoid replaces the deprecated F.sigmoid
        x = torch.sigmoid(self.fC(x))
        return x    #.detach().numpy()
        # logits = self.label(x)
        # return logits
--- a/src/results/results_manager.py
+++ b/src/results/results_manager.py
@ -2,6 +2,6 @@ import pandas as pd
 import numpy as np
 df = pd.read_csv("/home/andreapdr/funneling_pdr/src/results/results.csv", delimiter='\t')
-pivot = pd.pivot_table(df, values=['time', 'macrof1', 'microf1', 'macrok', 'microk'], index=['embed'], aggfunc=[np.mean, np.std])
+pivot = pd.pivot_table(df, values=['time', 'macrof1', 'microf1', 'macrok', 'microk'], index=['method', 'embed'], aggfunc=[np.mean, np.std])
 print(pivot)
 print('Finished ...')
--- a/src/util/SIF_embed.py
+++ b/src/util/SIF_embed.py
@ -0,0 +1,56 @@
 import numpy as np
 from sklearn.decomposition import TruncatedSVD
 def get_weighted_average(We, x, w):
    """
    Compute the weighted average vectors
    :param We: We[i,:] is the vector for word i
    :param x: x[i, :] are the indices of the words in sentence i
    :param w: w[i, :] are the weights for the words in sentence i
    :return: emb[i, :] are the weighted average vector for sentence i; a sentence whose weights are all
        zero gets the zero vector
    """
    n_samples = x.shape[0]
    emb = np.zeros((n_samples, We.shape[1]))
    for i in range(n_samples):
        n_weighted = np.count_nonzero(w[i,:])
        if n_weighted == 0:
            # no weighted word in this sentence: keep the zero row instead of dividing 0 by 0 (NaN)
            continue
        emb[i,:] = w[i,:].dot(We[x[i,:],:]) / n_weighted
    return emb
 def compute_pc(X,npc=1):
    """
    Fit a truncated SVD on X and return its components. The data must NOT be mean-centred beforehand.
    :param X: X[i,:] is a data point
    :param npc: number of principal components to compute
    :return: component_[i,:] is the i-th pc
    """
    decomposition = TruncatedSVD(n_components=npc, n_iter=7, random_state=0).fit(X)
    return decomposition.components_
 def remove_pc(X, npc=1):
    """
    Subtract from every data point its projection on the first npc principal components.
    :param X: X[i,:] is a data point
    :param npc: number of principal components to remove
    :return: XX[i, :] is the data point after removing its projection
    """
    components = compute_pc(X, npc)
    coefficients = X.dot(components.transpose())
    if npc != 1:
        return X - coefficients.dot(components)
    # single component: broadcast the (n, 1) coefficients against the (1, d) component
    return X - coefficients * components
 def SIF_embedding(We, x, w, params):
    """
    Compute sentence embeddings as the weighted average of word vectors, optionally removing the projection
    on the first principal component(s).
    :param We: We[i,:] is the vector for word i
    :param x: x[i, :] are the indices of the words in the i-th sentence
    :param w: w[i, :] are the weights for the words in the i-th sentence
    :param params.rmpc: if >0, number of principal components whose projection is removed
    :return: emb, emb[i, :] is the embedding for sentence i
    """
    averaged = get_weighted_average(We, x, w)
    if params.rmpc > 0:
        return remove_pc(averaged, params.rmpc)
    return averaged
--- a/src/util/util.py
+++ b/src/util/util.py
@ -1,2 +1,15 @@
 from sklearn.svm import SVC
 def fill_missing_classes(lXtr, lytr):
    """Placeholder, not implemented yet: ignores both arguments and returns None."""
    return None
 def get_learner(calibrate=False, kernel='linear', set_c=1):
    """
    Build the SVM used as base learner.
    This module defines no global `op`, so the C value is taken as an argument instead of from `op.set_c`.
    :param calibrate: forwarded to SVC as `probability`
    :param kernel: SVC kernel name
    :param set_c: value of the SVC regularisation parameter C (default 1, same as the scripts' --set_c default)
    :return: an unfitted SVC
    """
    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=set_c, random_state=1, class_weight='balanced', gamma='auto')
 def get_params(dense=False, optimc=False):
    """
    Build the hyper-parameter grid for model selection.
    This module defines no global `op`, so the switch is taken as an argument instead of from `op.optimc`.
    :param dense: if True the grid uses the 'rbf' kernel, otherwise 'linear'
    :param optimc: whether hyper-parameters are to be optimised (default False, same as the scripts' --optimc default)
    :return: a one-element list holding the grid, or None when optimc is False
    """
    if not optimc:
        return None
    c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
    kernel = 'rbf' if dense else 'linear'
    return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]