typos + requirements.txt

This commit is contained in:
andrea 2021-01-26 12:49:28 +01:00
parent 90e974f0a3
commit 5958df3e3c
6 changed files with 24 additions and 20 deletions

View File

@ -46,7 +46,6 @@ def main(args):
if args.bert_embedder:
bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=10, gpus=args.gpus, n_jobs=args.n_jobs)
bertEmbedder.transform(lX)
embedder_list.append(bertEmbedder)
# Init DocEmbedderList (i.e., first-tier learners or view generators) and metaclassifier

View File

@ -22,8 +22,7 @@ class BertModel(pl.LightningModule):
self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus)
self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)
# Language specific metrics - I am not really sure if they should be initialized
# independently or we can use the metrics init above... # TODO: check it
# Language specific metrics to compute metrics at epoch level
self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
self.lang_macroK = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
@ -71,7 +70,6 @@ class BertModel(pl.LightningModule):
langs = set(langs)
# outputs is a list of n dicts of m elements, where n is equal to the number of epoch steps and m is the batch size.
# here we save epoch level metric values and compute them specifically for each language
# TODO: make this a function (reused in pl_gru epoch_end)
res_macroF1 = {lang: [] for lang in langs}
res_microF1 = {lang: [] for lang in langs}
res_macroK = {lang: [] for lang in langs}

View File

@ -41,8 +41,7 @@ class RecurrentModel(pl.LightningModule):
self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus)
self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)
# Language specific metrics - I am not really sure if they should be initialized
# independently or we can use the metrics init above... # TODO: check it
# Language specific metrics to compute metrics at epoch level
self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
self.lang_macroK = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
@ -110,7 +109,6 @@ class RecurrentModel(pl.LightningModule):
def encode(self, lX, l_pad, batch_size=128):
"""
Returns encoded data (i.e., RNN hidden state at second feed-forward layer - linear1). Dimensionality is 512.
# TODO: does not run on gpu..
:param lX:
:param l_pad:
:param batch_size:
@ -167,7 +165,6 @@ class RecurrentModel(pl.LightningModule):
def training_epoch_end(self, outputs):
# outputs is a list of n dicts of m elements, where n is equal to the number of epoch steps and m is the batch size.
# here we save epoch level metric values and compute them specifically for each language
# TODO: this is horrible...
res_macroF1 = {lang: [] for lang in self.langs}
res_microF1 = {lang: [] for lang in self.langs}
res_macroK = {lang: [] for lang in self.langs}

12
refactor/requirements.txt Normal file
View File

@ -0,0 +1,12 @@
transformers==2.11.0
pandas==0.25.3
numpy==1.17.4
joblib==0.14.0
tqdm==4.50.2
pytorch_lightning==1.1.2
torch==1.3.1
nltk==3.4.5
scipy==1.3.3
rdflib==4.2.2
torchtext==0.4.0
scikit_learn==0.24.1

View File

@ -102,10 +102,10 @@ class CustomK(Metric):
specificity, recall = 0., 0.
absolute_negatives = self.true_negative.sum() + self.false_positive.sum()
if absolute_negatives != 0:
specificity = self.true_negative.sum()/absolute_negatives # Todo check if it is float
specificity = self.true_negative.sum()/absolute_negatives
absolute_positives = self.true_positive.sum() + self.false_negative.sum()
if absolute_positives != 0:
recall = self.true_positive.sum()/absolute_positives # Todo check if it is float
recall = self.true_positive.sum()/absolute_positives
if absolute_positives == 0:
return 2. * specificity - 1
@ -125,10 +125,10 @@ class CustomK(Metric):
specificity, recall = 0., 0.
absolute_negatives = class_tn + class_fp
if absolute_negatives != 0:
specificity = class_tn / absolute_negatives # Todo check if it is float
specificity = class_tn / absolute_negatives
absolute_positives = class_tp + class_fn
if absolute_positives != 0:
recall = class_tp / absolute_positives # Todo check if it is float
recall = class_tp / absolute_positives
if absolute_positives == 0:
class_specific.append(2. * specificity - 1)

View File

@ -1,18 +1,19 @@
"""
This module contains the view generators that take care of computing the view specific document embeddings:
- VanillaFunGen (-X) cast document representations encoded via TFIDF into posterior probabilities by means of SVM.
- VanillaFunGen (-x) cast document representations encoded via TFIDF into posterior probabilities by means of SVM.
- WordClassGen (-W): generates document representation via Word-Class-Embeddings.
- WordClassGen (-w): generates document representation via Word-Class-Embeddings.
Document embeddings are obtained via weighted sum of document's constituent embeddings.
- MuseGen (-M):
- MuseGen (-m): generates document representation via MUSE embeddings.
Document embeddings are obtained via weighted sum of document's constituent embeddings.
- RecurrentGen (-G): generates document embedding by means of a Gated Recurrent Units. The model can be
- RecurrentGen (-g): generates document embedding by means of Gated Recurrent Units. The model can be
initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,).
Output dimension is (n_docs, 512).
- View generator (-B): generates document embedding via mBERT model.
- View generator (-b): generates document embedding via mBERT model.
"""
from abc import ABC, abstractmethod
from models.learners import *
@ -153,9 +154,6 @@ class WordClassGen(ViewGen):
class RecurrentGen(ViewGen):
# TODO: save model https://forums.pytorchlightning.ai/t/how-to-save-hparams-when-not-provided-as-argument-apparently-assigning-to-hparams-is-not-recomended/339/5
# Problem: we are passing lPretrained to init the RecurrentModel -> incredible slow at saving (checkpoint).
# if we do not save it is impossible to init RecurrentModel by calling RecurrentModel.load_from_checkpoint()
def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, nepochs=50,
gpus=0, n_jobs=-1, stored_path=None):
"""