refactoring

andrea 2021-01-19 10:31:57 +01:00
parent 515acae15b
commit a5322ba227
9 changed files with 132 additions and 99 deletions

View File

@@ -148,7 +148,7 @@ class MonolingualClassifier:
         if isinstance(self.model, GridSearchCV):
             self.best_params_ = self.model.best_params_
             print('best parameters: ', self.best_params_)
-        self.time=time.time()-tinit
+        self.time = time.time()-tinit
         return self

     def decision_function(self, X):

View File

@@ -84,9 +84,9 @@ class PosteriorProbabilitiesEmbedder:
         self.is_training = is_training

     def fit(self, lX, lY, lV=None, called_by_viewgen=False):
-        if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'):
-            print(f'NB: Avoid fitting {self.storing_path.split("/")[2]} since we have already pre-computed results')
-            return self
+        # if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'):
+        #     print(f'NB: Avoid fitting {self.storing_path.split("/")[2]} since we have already pre-computed results')
+        #     return self
         if not called_by_viewgen:
             # Avoid printing if method is called by another View Gen (e.g., GRU ViewGen)
             print('### Posterior Probabilities View Generator (X)')
@@ -96,20 +96,20 @@ class PosteriorProbabilitiesEmbedder:

     def transform(self, lX):
         # if dir exist, load and return already computed results
-        _endpoint = 'tr' if self.is_training else 'te'
-        _actual_path = self.storing_path + '/' + _endpoint
-        if exists(_actual_path):
-            print('NB: loading pre-computed results!')
-            with open(_actual_path + '/X.pickle', 'rb') as infile:
-                self.is_training = False
-                return pickle.load(infile)
+        # _endpoint = 'tr' if self.is_training else 'te'
+        # _actual_path = self.storing_path + '/' + _endpoint
+        # if exists(_actual_path):
+        #     print('NB: loading pre-computed results!')
+        #     with open(_actual_path + '/X.pickle', 'rb') as infile:
+        #         self.is_training = False
+        #         return pickle.load(infile)
         lZ = self.predict_proba(lX)
         lZ = _normalize(lZ, self.l2)
         # create dir and dump computed results
-        create_if_not_exist(_actual_path)
-        with open(_actual_path + '/X.pickle', 'wb') as outfile:
-            pickle.dump(lZ, outfile)
+        # create_if_not_exist(_actual_path)
+        # with open(_actual_path + '/X.pickle', 'wb') as outfile:
+        #     pickle.dump(lZ, outfile)
         self.is_training = False
         return lZ
@@ -154,8 +154,7 @@ class MuseEmbedder:
         MUSE = self.MUSE
         lX = self.featureweight.transform(lX)
         XdotMUSE = Parallel(n_jobs=self.n_jobs)(
-            delayed(XdotM)(lX[lang], MUSE[lang], self.sif) for lang in self.langs
-        )
+            delayed(XdotM)(lX[lang], MUSE[lang], self.sif) for lang in self.langs)
         lMuse = {l: XdotMUSE[i] for i, l in enumerate(self.langs)}
         lMuse = _normalize(lMuse, self.l2)
         return lMuse
@@ -211,18 +210,22 @@ class WordClassEmbedder:

 class MBertEmbedder:

     def __init__(self, doc_embed_path=None, patience=10, checkpoint_dir='../hug_checkpoint/', path_to_model=None,
-                 nC=None):
+                 nC=None, avoid_loading=False):
         self.doc_embed_path = doc_embed_path
         self.patience = patience
         self.checkpoint_dir = checkpoint_dir
         self.fitted = False
         self.requires_tfidf = False
-        if path_to_model is None and nC is not None:
+        self.avoid_loading = avoid_loading
+        if path_to_model is None:
             self.model = None
         else:
             config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True,
                                                 num_labels=nC)
-            self.model = BertForSequenceClassification.from_pretrained(path_to_model, config=config).cuda()
+            if self.avoid_loading:
+                self.model = None
+            else:
+                self.model = BertForSequenceClassification.from_pretrained(path_to_model, config=config).cuda()  # TODO: setting model to None in order to avoid loading it onto gpu if we have already pre-computed results!
             self.fitted = True

     def fit(self, lX, ly, lV=None, seed=0, nepochs=200, lr=1e-5, val_epochs=1):
@@ -235,7 +238,7 @@ class MBertEmbedder:
         l_tokenized_tr = do_tokenization(lX, max_len=512)
         l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = get_tr_val_split(l_tokenized_tr, ly,
                                                                                          val_prop=0.2, max_val=2000,
-                                                                                         seed=seed)  # TODO: seed
+                                                                                         seed=seed)

         tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target)
         va_dataset = TrainingDataset(l_split_va, l_split_val_target)
@@ -289,7 +292,7 @@ class MBertEmbedder:
         l_tokenized_X = do_tokenization(lX, max_len=512, verbose=True)
         feat_dataset = ExtractorDataset(l_tokenized_X)
         feat_lang_ids = feat_dataset.lang_ids
-        dataloader = DataLoader(feat_dataset, batch_size=64)  # TODO reduced batch size in JRC experiments
+        dataloader = DataLoader(feat_dataset, batch_size=64)
         all_batch_embeddings, id2lang = feature_extractor(dataloader, feat_lang_ids, self.model)
         return all_batch_embeddings
@@ -301,7 +304,7 @@ class RecurrentEmbedder:
     def __init__(self, pretrained, supervised, multilingual_dataset, options, concat=False, lr=1e-3,
                  we_path='../embeddings', hidden_size=512, sup_drop=0.5, posteriors=False, patience=10,
-                 test_each=0, checkpoint_dir='../checkpoint', model_path=None):
+                 test_each=0, checkpoint_dir='../checkpoint', model_path=None, n_jobs=-1):
         self.pretrained = pretrained
         self.supervised = supervised
         self.concat = concat
@@ -319,6 +322,7 @@ class RecurrentEmbedder:
         self.options = options
         self.seed = options.seed
         self.model_path = model_path
+        self.n_jobs = n_jobs
         self.is_trained = False

         ## INIT MODEL for training
@@ -398,32 +402,33 @@ class RecurrentEmbedder:
     def transform(self, lX, batch_size=64):
         lX = self.multilingual_index.get_indexed(lX, self.lpretrained_vocabulary)
-        lX = self._get_doc_embeddings(lX)
+        lX = self._get_doc_embeddings(lX, batch_size)
         return lX

     def fit_transform(self, lX, ly, lV=None):
         return self.fit(lX, ly).transform(lX)

-    def _get_doc_embeddings(self, lX, batch_size=64):
+    def _get_doc_embeddings(self, lX, batch_size):
         assert self.is_trained, 'Model is not trained, cannot call transform before fitting the model!'
         print('Generating document embeddings via GRU')
         _lX = {}
         l_devel_target = self.multilingual_index.l_devel_target()
+        # show_gpu('RNN init at extraction')
         for idx, (batch, post, target, lang) in enumerate(batchify(lX, None, l_devel_target,
                                                                    batch_size, self.multilingual_index.l_pad())):
             if lang not in _lX.keys():
                 _lX[lang] = self.model.get_embeddings(batch, lang)
             else:
                 _lX[lang] = np.concatenate((_lX[lang], self.model.get_embeddings(batch, lang)), axis=0)
+            # show_gpu('RNN after batch pred at extraction')
         return _lX

     # loads the MUSE embeddings if requested, or returns empty dictionaries otherwise
     def _load_pretrained_embeddings(self, we_path, langs):
         lpretrained = lpretrained_vocabulary = self._none_dict(langs)
-        lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1)
+        lpretrained = load_muse_embeddings(we_path, langs, n_jobs=self.n_jobs)
         lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs}
         return lpretrained, lpretrained_vocabulary
@@ -553,20 +558,20 @@ class FeatureSet2Posteriors:

     def transform(self, lX):
         # if dir exist, load and return already computed results
-        _endpoint = 'tr' if self.is_training else 'te'
-        _actual_path = self.storing_path + '/' + _endpoint
-        if exists(_actual_path):
-            print('NB: loading pre-computed results!')
-            with open(_actual_path + '/' + self.method_id + '.pickle', 'rb') as infile:
-                self.is_training = False
-                return pickle.load(infile)
+        # _endpoint = 'tr' if self.is_training else 'te'
+        # _actual_path = self.storing_path + '/' + _endpoint
+        # if exists(_actual_path):
+        #     print('NB: loading pre-computed results!')
+        #     with open(_actual_path + '/' + self.method_id + '.pickle', 'rb') as infile:
+        #         self.is_training = False
+        #         return pickle.load(infile)
         lP = self.predict_proba(lX)
         lP = _normalize(lP, self.l2)
         # create dir and dump computed results
-        create_if_not_exist(_actual_path)
-        with open(_actual_path + '/' + self.method_id + '.pickle', 'wb') as outfile:
-            pickle.dump(lP, outfile)
+        # create_if_not_exist(_actual_path)
+        # with open(_actual_path + '/' + self.method_id + '.pickle', 'wb') as outfile:
+        #     pickle.dump(lP, outfile)
         self.is_training = False
         return lP
@@ -637,7 +642,13 @@ class Funnelling:
         self.meta = meta
         self.n_jobs = meta.n_jobs

-    def fit(self, lX, ly):
-        tfidf_lX = self.vectorizer.fit_transform(lX, ly)
+    def fit(self, lX, ly, target_lang=None):
+        if target_lang is not None:
+            LX = lX.copy()
+            LX.update(target_lang)
+            self.vectorizer.fit(LX)
+            tfidf_lX = self.vectorizer.transform(lX)
+        else:
+            tfidf_lX = self.vectorizer.fit_transform(lX, ly)
         lV = self.vectorizer.vocabulary()
         print('## Fitting first-tier learners!')
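The new target_lang branch seems aimed at zero-shot cross-lingual runs: the TF-IDF vectorizer is fitted on the union of source and target documents, while only the source side is transformed for training. A hypothetical call sketch follows, assuming target_lang is a {lang: raw_docs} dict shaped like lX (inferred from LX.update(target_lang)); note it only runs if TfidfVectorizerMultilingual.transform iterates lX.keys() (the commented-out variant appearing later in this commit) rather than self.langs, since self.langs would then also contain the target language.

    # hypothetical zero-shot setup: train on English only, keep Italian aside as target
    source_X, source_y = {'en': lXtr['en']}, {'en': lytr['en']}
    target_docs = {'it': lXte['it']}

    classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)
    classifier.fit(source_X, source_y, target_lang=target_docs)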
@@ -774,6 +785,7 @@ def train_gru(model, batcher, ltrain_index, lytr, tinit, logfile, criterion, opt
     _dataset_path = opt.dataset.split('/')[-1].split('_')
     dataset_id = _dataset_path[0] + _dataset_path[-1]

+    # show_gpu('RNN init pre-training')
     loss_history = []
     model.train()
     for idx, (batch, post, bert_emb, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, ltrain_bert, lytr)):
@@ -783,6 +795,7 @@ def train_gru(model, batcher, ltrain_index, lytr, tinit, logfile, criterion, opt
         clip_gradient(model)
         optim.step()
         loss_history.append(loss.item())
+        # show_gpu('RNN after batch prediction')

         if idx % log_interval == 0:
             interval_loss = np.mean(loss_history[-log_interval:])

View File

@@ -23,20 +23,21 @@ if __name__ == '__main__':
     results = PolylingualClassificationResults('../log/' + op.output)
     allprob = 'Prob' if op.allprob else ''

+    # renaming arguments to be printed on log
     method_name, dataset_name = get_method_name(dataset, op.posteriors, op.supervised, op.pretrained, op.mbert,
                                                 op.gruViewGenerator, op.gruMUSE, op.gruWCE, op.agg, op.allprob)
     print(f'Method: gFun{method_name}\nDataset: {dataset_name}')
     print('-'*50)

-    # set zscore range - is slice(0, 0) mean will be equal to 0 and std to 1, thus normalization will have no effect
+    n_jobs = -1  # TODO SETTING n_JOBS
     standardize_range = slice(0, 0)
     if op.zscore:
         standardize_range = None

     # load dataset
     data = MultilingualDataset.load(dataset)
-    data.set_view(languages=['nl', 'it'])  # TODO: DEBUG SETTING
+    # data.set_view(languages=['it'])  # TODO: DEBUG SETTING
     data.show_dimensions()

     lXtr, lytr = data.training()
     lXte, lyte = data.test()
@@ -63,7 +64,7 @@ if __name__ == '__main__':
         doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True,
                                                                                           kernel='linear',
                                                                                           C=op.set_c),
-                                                           l2=l2, storing_path=storing_path))
+                                                           l2=l2, storing_path=storing_path, n_jobs=n_jobs))

     if op.supervised:
         """
@@ -73,9 +74,11 @@ if __name__ == '__main__':
         VG_name = 'W'
         storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
         exist = exists(storing_path)
-        wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, sif=op.sif)
+        wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting,
+                                sif=op.sif, n_jobs=n_jobs)
         if op.allprob:
-            wce = FeatureSet2Posteriors(wce, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path)
+            wce = FeatureSet2Posteriors(wce, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path,
+                                        n_jobs=n_jobs)
         doc_embedder.append(wce)

     if op.pretrained:
@@ -86,9 +89,10 @@ if __name__ == '__main__':
         VG_name = 'M'
         storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
         exist = exists(storing_path)
-        muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif)
+        muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif, n_jobs=n_jobs)
         if op.allprob:
-            muse = FeatureSet2Posteriors(muse, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path)
+            muse = FeatureSet2Posteriors(muse, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path,
+                                         n_jobs=n_jobs)
         doc_embedder.append(muse)

     if op.gruViewGenerator:
@@ -100,12 +104,12 @@ if __name__ == '__main__':
         VG_name = 'G'
         VG_name += '_muse' if op.gruMUSE else ''
         VG_name += '_wce' if op.gruWCE else ''
-        storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
+        storing_path = 'Nope'  # f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
         rnn_embedder = RecurrentEmbedder(pretrained=op.gruMUSE, supervised=op.gruWCE, multilingual_dataset=data,
-                                         options=op, model_path=op.gru_path)
+                                         options=op, model_path=None, n_jobs=n_jobs)
         if op.allprob:
             rnn_embedder = FeatureSet2Posteriors(rnn_embedder, method_id=VG_name, requires_tfidf=False,
-                                                 storing_path=storing_path)
+                                                 storing_path=storing_path, n_jobs=n_jobs)
         doc_embedder.append(rnn_embedder)

     if op.mbert:
@@ -114,8 +118,9 @@ if __name__ == '__main__':
         """
         VG_name = 'B'
         storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
-        mbert = MBertEmbedder(path_to_model=op.bert_path, nC=data.num_categories())
+        avoid_loading = False if op.avoid_loading else True  # TODO research setting (set to false mBert will be loaded into gpu to get doc emebds (aka, only the first time for each run))
+        mbert = MBertEmbedder(path_to_model=op.bert_path, nC=data.num_categories(), avoid_loading=avoid_loading)
         if op.allprob:
             mbert = FeatureSet2Posteriors(mbert, method_id=VG_name, l2=l2, storing_path=storing_path)
         doc_embedder.append(mbert)

@@ -123,7 +128,7 @@ if __name__ == '__main__':
     # metaclassifier
     meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
     meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf', C=op.set_c),
-                          meta_parameters=get_params(op.optimc), standardize_range=standardize_range)
+                          meta_parameters=get_params(op.optimc), standardize_range=standardize_range, n_jobs=n_jobs)

     # ensembling the modules
     classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)

View File

@@ -20,9 +20,6 @@ def predict(logits, classification_type='multilabel'):

 class TrainingDataset(Dataset):
-    """
-    data: dict of lang specific tokenized data
-    """

     def __init__(self, data, labels):
         self.langs = data.keys()
@@ -231,7 +228,7 @@ def feature_extractor(data, lang_ids, model):
     Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for
                    the output of each layer) of shape (batch_size, sequence_length, hidden_size)
     """
-    show_gpu('Before Training')
+    # show_gpu('Before Training')
     all_batch_embeddings = {}
     id2lang = {v: k for k, v in lang_ids.items()}
     with torch.no_grad():
@@ -246,5 +243,5 @@ def feature_extractor(data, lang_ids, model):
             else:
                 all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]],
                                                                   batch_embeddings[i].detach().cpu().numpy()))
-    show_gpu('After Full Prediction')
+    # show_gpu('After Full Prediction')
     return all_batch_embeddings, id2lang

View File

@@ -74,7 +74,7 @@ class Index:
         self.test_raw = test_raw

     def index(self, pretrained_vocabulary, analyzer, vocabulary):
-        self.word2index = dict(vocabulary)
+        self.word2index = dict(vocabulary)  # word2idx
         known_words = set(self.word2index.keys())
         if pretrained_vocabulary is not None:
             known_words.update(pretrained_vocabulary)
@@ -207,44 +207,6 @@ class MultilingualIndex:
             index.compose_embedding_matrix(lpretrained[l], supervised, lXtr[l], lYtr[l])
             self.sup_range = index.wce_range

-    # TODO circular import with transformers --> when generating posterior prob, we import PosteriorProbabilitiesEmbedder which is defined in transformers
-    # def posterior_probabilities(self, max_training_docs_by_lang=5000, store_posteriors=False, stored_post=False):
-    #     # choose a maximum of "max_training_docs_by_lang" for training the calibrated SVMs
-    #     timeit = time.time()
-    #     lXtr = {l:Xtr for l,Xtr in self.get_lXtr().items()}
-    #     lYtr = {l:Ytr for l,Ytr in self.l_train_target().items()}
-    #     if not stored_post:
-    #         for l in self.langs:
-    #             n_elements = lXtr[l].shape[0]
-    #             if n_elements > max_training_docs_by_lang:
-    #                 choice = np.random.permutation(n_elements)[:max_training_docs_by_lang]
-    #                 lXtr[l] = lXtr[l][choice]
-    #                 lYtr[l] = lYtr[l][choice]
-    #
-    #         # train the posterior probabilities embedder
-    #         print('[posteriors] training a calibrated SVM')
-    #         learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto')
-    #         prob_embedder = PosteriorProbabilitiesEmbedder(learner, l2=False)
-    #         prob_embedder.fit(lXtr, lYtr)
-    #
-    #         # transforms the training, validation, and test sets into posterior probabilities
-    #         print('[posteriors] generating posterior probabilities')
-    #         lPtr = prob_embedder.transform(self.get_lXtr())
-    #         lPva = prob_embedder.transform(self.get_lXva())
-    #         lPte = prob_embedder.transform(self.get_lXte())
-    #         # NB: Check splits indices !
-    #         if store_posteriors:
-    #             import pickle
-    #             with open('../dumps/posteriors_fulljrc.pkl', 'wb') as outfile:
-    #                 pickle.dump([lPtr, lPva, lPte], outfile)
-    #                 print(f'Successfully dumped posteriors!')
-    #     else:
-    #         import pickle
-    #         with open('../dumps/posteriors_fulljrc.pkl', 'rb') as infile:
-    #             lPtr, lPva, lPte = pickle.load(infile)
-    #             print(f'Successfully loaded stored posteriors!')
-    #     print(f'[posteriors] done in {time.time() - timeit}')
-    #     return lPtr, lPva, lPte
-
     def bert_embeddings(self, bert_path, max_len=512, batch_size=64, stored_embeddings=False):
         show_gpu('GPU memory before initializing mBert model:')
@@ -518,10 +480,12 @@ class TfidfVectorizerMultilingual:
     def fit(self, lX, ly=None):
         self.langs = sorted(lX.keys())
         self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs}
+        # self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in lX.keys()}
         return self

     def transform(self, lX):
         return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs}
+        # return {l: self.vectorizer[l].transform(lX[l]) for l in lX.keys()}

     def fit_transform(self, lX, ly=None):
         return self.fit(lX, ly).transform(lX)
@@ -568,3 +532,11 @@ def get_method_name(dataset, posteriors, supervised, pretrained, mbert, gru,
     dataset_id = _dataset_path[0] + _dataset_path[-1]
     return _id, dataset_id
+
+
+def get_zscl_setting(langs):
+    settings = []
+    for elem in langs:
+        for tar in langs:
+            if elem != tar:
+                settings.append((elem, tar))
+    return settings
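For reference, the new helper simply enumerates every ordered (source, target) pair over the given languages, which pairs naturally with the target_lang argument added to Funnelling.fit; a quick illustration (language codes chosen arbitrarily):

    # get_zscl_setting(['en', 'it', 'de']) returns:
    # [('en', 'it'), ('en', 'de'), ('it', 'en'), ('it', 'de'), ('de', 'en'), ('de', 'it')]
    settings = get_zscl_setting(['en', 'it', 'de'])
    for source, target in settings:
        print(f'zero-shot run: train on {source}, evaluate on {target}')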

View File

@@ -1,5 +1,4 @@
 import numpy as np
-import numpy as np
 from scipy.sparse import lil_matrix, issparse
 from sklearn.metrics import f1_score, accuracy_score

View File

@ -60,6 +60,9 @@ parser.add_option("-a", "--agg", dest="agg", action='store_true',
help="Set aggregation function of the common Z-space to average (Default: concatenation)", help="Set aggregation function of the common Z-space to average (Default: concatenation)",
default=True) default=True)
parser.add_option("-l", dest="avoid_loading", action="store_true",
help="TODO", default=False)
# ------------------------------------------------------------------------------------ # ------------------------------------------------------------------------------------
parser.add_option('--hidden', type=int, default=512, metavar='int', parser.add_option('--hidden', type=int, default=512, metavar='int',

View File

@@ -47,3 +47,46 @@ class PolylingualClassificationResults:
     def tell(self, msg):
         if self.verbose: print(msg)
+
+
+class ZSCLResults:
+
+    def __init__(self, file, autoflush=True, verbose=False):
+        self.file = file
+        self.columns = ['method',
+                        'optimp',
+                        'source',
+                        'target',
+                        'id',
+                        'dataset',
+                        'time',
+                        'lang',
+                        'macrof1',
+                        'microf1',
+                        'macrok',
+                        'microk',
+                        'notes']
+        self.autoflush = autoflush
+        self.verbose = verbose
+        if os.path.exists(file):
+            self.tell('Loading existing file from {}'.format(file))
+            self.df = pd.read_csv(file, sep='\t')
+        else:
+            self.tell('File {} does not exist. Creating new frame.'.format(file))
+            dir = os.path.dirname(self.file)
+            if dir and not os.path.exists(dir): os.makedirs(dir)
+            self.df = pd.DataFrame(columns=self.columns)
+
+    def already_calculated(self, id):
+        return (self.df['id'] == id).any()
+
+    def add_row(self, method, optimp, id, source, target, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
+        s = pd.Series([method, optimp, id, source, target, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
+        self.df = self.df.append(s, ignore_index=True)
+        if self.autoflush: self.flush()
+        self.tell(s.to_string())
+
+    def flush(self):
+        self.df.to_csv(self.file, index=False, sep='\t')
+
+    def tell(self, msg):
+        if self.verbose: print(msg)
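A minimal usage sketch for the new ZSCLResults logger (the file path, run id, and metric values below are illustrative). One caveat worth noting: add_row passes id before source and target, while self.columns lists source and target before id, so those three values land under shifted column headers unless the two orderings are kept consistent.

    # hypothetical example: log one zero-shot en->it run to a TSV file
    results = ZSCLResults('../log/zscl_results.csv', verbose=True)
    run_id = 'gFunX_en_it_seed0'
    if not results.already_calculated(run_id):
        results.add_row(method='gFun-X', optimp=False, id=run_id, source='en', target='it',
                        dataset='rcv1-2', time=123.4, lang='it',
                        macrof1=0.55, microf1=0.62, notes='zero-shot')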

View File

@@ -1,5 +1,6 @@
 import numpy as np

 class StandardizeTransformer:

     def __init__(self, axis=0, range=None):