typos + requirements.txt

This commit is contained in:
andrea 2021-01-26 12:49:28 +01:00
parent 90e974f0a3
commit 5958df3e3c
6 changed files with 24 additions and 20 deletions

View File

@ -46,7 +46,6 @@ def main(args):
if args.bert_embedder:
bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=10, gpus=args.gpus, n_jobs=args.n_jobs)
bertEmbedder.transform(lX)
embedder_list.append(bertEmbedder)
# Init DocEmbedderList (i.e., first-tier learners or view generators) and metaclassifier

View File

@ -22,8 +22,7 @@ class BertModel(pl.LightningModule):
self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus)
self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)
# Language specific metrics - I am not really sure if they should be initialized
# independently or we can use the metrics init above... # TODO: check it
# Language specific metrics to compute metrics at epoch level
self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
self.lang_macroK = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
@ -71,7 +70,6 @@ class BertModel(pl.LightningModule):
langs = set(langs)
# outputs is a list of n dicts of m elements, where n is equal to the number of epoch steps and m is the batch size.
# here we save epoch level metric values and compute them specifically for each language
# TODO: make this a function (reused in pl_gru epoch_end)
res_macroF1 = {lang: [] for lang in langs}
res_microF1 = {lang: [] for lang in langs}
res_macroK = {lang: [] for lang in langs}

View File

@ -41,8 +41,7 @@ class RecurrentModel(pl.LightningModule):
self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus)
self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)
# Language specific metrics - I am not really sure if they should be initialized
# independently or we can use the metrics init above... # TODO: check it
# Language specific metrics to compute metrics at epoch level
self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
self.lang_macroK = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
@ -110,7 +109,6 @@ class RecurrentModel(pl.LightningModule):
def encode(self, lX, l_pad, batch_size=128):
"""
Returns encoded data (i.e., RNN hidden state at second feed-forward layer - linear1). Dimensionality is 512.
# TODO: does not run on gpu..
:param lX:
:param l_pad:
:param batch_size:
@ -167,7 +165,6 @@ class RecurrentModel(pl.LightningModule):
def training_epoch_end(self, outputs):
# outputs is a list of n dicts of m elements, where n is equal to the number of epoch steps and m is the batch size.
# here we save epoch level metric values and compute them specifically for each language
# TODO: this is horrible...
res_macroF1 = {lang: [] for lang in self.langs}
res_microF1 = {lang: [] for lang in self.langs}
res_macroK = {lang: [] for lang in self.langs}

12
refactor/requirements.txt Normal file
View File

@ -0,0 +1,12 @@
transformers==2.11.0
pandas==0.25.3
numpy==1.17.4
joblib==0.14.0
tqdm==4.50.2
pytorch_lightning==1.1.2
torch==1.3.1
nltk==3.4.5
scipy==1.3.3
rdflib==4.2.2
torchtext==0.4.0
scikit_learn==0.24.1

View File

@ -102,10 +102,10 @@ class CustomK(Metric):
specificity, recall = 0., 0.
absolute_negatives = self.true_negative.sum() + self.false_positive.sum()
if absolute_negatives != 0:
specificity = self.true_negative.sum()/absolute_negatives # Todo check if it is float
specificity = self.true_negative.sum()/absolute_negatives
absolute_positives = self.true_positive.sum() + self.false_negative.sum()
if absolute_positives != 0:
recall = self.true_positive.sum()/absolute_positives # Todo check if it is float
recall = self.true_positive.sum()/absolute_positives
if absolute_positives == 0:
return 2. * specificity - 1
@ -125,10 +125,10 @@ class CustomK(Metric):
specificity, recall = 0., 0.
absolute_negatives = class_tn + class_fp
if absolute_negatives != 0:
specificity = class_tn / absolute_negatives # Todo check if it is float
specificity = class_tn / absolute_negatives
absolute_positives = class_tp + class_fn
if absolute_positives != 0:
recall = class_tp / absolute_positives # Todo check if it is float
recall = class_tp / absolute_positives
if absolute_positives == 0:
class_specific.append(2. * specificity - 1)

View File

@ -1,18 +1,19 @@
"""
This module contains the view generators that take care of computing the view specific document embeddings:
- VanillaFunGen (-X) cast document representations encoded via TFIDF into posterior probabilities by means of SVM.
- VanillaFunGen (-x) cast document representations encoded via TFIDF into posterior probabilities by means of SVM.
- WordClassGen (-W): generates document representation via Word-Class-Embeddings.
- WordClassGen (-w): generates document representation via Word-Class-Embeddings.
Document embeddings are obtained via weighted sum of document's constituent embeddings.
- MuseGen (-M):
- MuseGen (-m): generates document representation via MUSE embeddings.
Document embeddings are obtained via weighted sum of document's constituent embeddings.
- RecurrentGen (-G): generates document embedding by means of a Gated Recurrent Units. The model can be
- RecurrentGen (-g): generates document embedding by means of Gated Recurrent Units. The model can be
initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,).
Output dimension is (n_docs, 512).
- View generator (-B): generates document embedding via mBERT model.
- View generator (-b): generates document embedding via mBERT model.
"""
from abc import ABC, abstractmethod
from models.learners import *
@ -153,9 +154,6 @@ class WordClassGen(ViewGen):
class RecurrentGen(ViewGen):
# TODO: save model https://forums.pytorchlightning.ai/t/how-to-save-hparams-when-not-provided-as-argument-apparently-assigning-to-hparams-is-not-recomended/339/5
# Problem: we are passing lPretrained to init the RecurrentModel -> incredible slow at saving (checkpoint).
# if we do not save it is impossible to init RecurrentModel by calling RecurrentModel.load_from_checkpoint()
def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, nepochs=50,
gpus=0, n_jobs=-1, stored_path=None):
"""