From 472b64ee0ebb588d037424fa2f086c2ce46e68dd Mon Sep 17 00:00:00 2001 From: andrea Date: Thu, 21 Jan 2021 15:41:56 +0100 Subject: [PATCH] Implemented metrics logging --- refactor/data/datamodule.py | 4 ++ refactor/main.py | 6 +-- refactor/models/pl_gru.py | 91 +++++++++++++++++++++++++++---------- refactor/util/common.py | 27 ++++++----- 4 files changed, 89 insertions(+), 39 deletions(-) diff --git a/refactor/data/datamodule.py b/refactor/data/datamodule.py index 621bee5..c87b0de 100644 --- a/refactor/data/datamodule.py +++ b/refactor/data/datamodule.py @@ -108,6 +108,7 @@ class RecurrentDataModule(pl.LightningDataModule): # Debug settings: reducing number of samples # l_train_index = {l: train[:50] for l, train in l_train_index.items()} # l_train_target = {l: target[:50] for l, target in l_train_target.items()} + self.training_dataset = RecurrentDataset(l_train_index, l_train_target, lPad_index=self.multilingualIndex.l_pad()) @@ -115,6 +116,7 @@ class RecurrentDataModule(pl.LightningDataModule): # Debug settings: reducing number of samples # l_val_index = {l: train[:50] for l, train in l_val_index.items()} # l_val_target = {l: target[:50] for l, target in l_val_target.items()} + self.val_dataset = RecurrentDataset(l_val_index, l_val_target, lPad_index=self.multilingualIndex.l_pad()) if stage == 'test' or stage is None: @@ -146,6 +148,7 @@ class BertDataModule(RecurrentDataModule): # Debug settings: reducing number of samples # l_train_raw = {l: train[:50] for l, train in l_train_raw.items()} # l_train_target = {l: target[:50] for l, target in l_train_target.items()} + l_train_index = self.tokenize(l_train_raw, max_len=self.max_len) self.training_dataset = RecurrentDataset(l_train_index, l_train_target, lPad_index=self.multilingualIndex.l_pad()) @@ -154,6 +157,7 @@ class BertDataModule(RecurrentDataModule): # Debug settings: reducing number of samples # l_val_raw = {l: train[:50] for l, train in l_val_raw.items()} # l_val_target = {l: target[:50] for l, target in 
l_val_target.items()} + l_val_index = self.tokenize(l_val_raw, max_len=self.max_len) self.val_dataset = RecurrentDataset(l_val_index, l_val_target, lPad_index=self.multilingualIndex.l_pad()) diff --git a/refactor/main.py b/refactor/main.py index 2c88f7d..a9840a1 100644 --- a/refactor/main.py +++ b/refactor/main.py @@ -15,7 +15,7 @@ def main(args): _DATASET = '/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle' EMBEDDINGS_PATH = '/home/andreapdr/gfun/embeddings' data = MultilingualDataset.load(_DATASET) - data.set_view(languages=['it'], categories=[0, 1]) + # data.set_view(languages=['it', 'fr']) lX, ly = data.training() lXte, lyte = data.test() @@ -28,8 +28,8 @@ def main(args): # gFun = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=N_JOBS) # gFun = MuseGen(muse_dir='/home/andreapdr/funneling_pdr/embeddings', n_jobs=N_JOBS) # gFun = WordClassGen(n_jobs=N_JOBS) - gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=True, batch_size=128, - nepochs=100, gpus=args.gpus, n_jobs=N_JOBS) + gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, batch_size=128, + nepochs=50, gpus=args.gpus, n_jobs=N_JOBS) # gFun = BertGen(multilingualIndex, batch_size=4, nepochs=10, gpus=args.gpus, n_jobs=N_JOBS) gFun.fit(lX, ly) diff --git a/refactor/models/pl_gru.py b/refactor/models/pl_gru.py index 0fe5c6a..411e438 100644 --- a/refactor/models/pl_gru.py +++ b/refactor/models/pl_gru.py @@ -6,21 +6,25 @@ from torch.autograd import Variable from torch.optim.lr_scheduler import StepLR from transformers import AdamW import pytorch_lightning as pl -from pytorch_lightning.metrics import Accuracy from models.helpers import init_embeddings from util.pl_metrics import CustomF1, CustomK -from util.evaluation import evaluate - -# TODO: it should also be possible to compute metrics independently for each language! 
class RecurrentModel(pl.LightningModule): - """ - Check out for logging insight https://www.learnopencv.com/tensorboard-with-pytorch-lightning/ - """ - def __init__(self, lPretrained, langs, output_size, hidden_size, lVocab_size, learnable_length, drop_embedding_range, drop_embedding_prop, gpus=None): + """ + + :param lPretrained: + :param langs: + :param output_size: + :param hidden_size: + :param lVocab_size: + :param learnable_length: + :param drop_embedding_range: + :param drop_embedding_prop: + :param gpus: + """ super().__init__() self.gpus = gpus self.langs = langs @@ -32,11 +36,16 @@ class RecurrentModel(pl.LightningModule): self.drop_embedding_prop = drop_embedding_prop self.loss = torch.nn.BCEWithLogitsLoss() - self.accuracy = Accuracy() self.microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus) self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus) self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus) + # Language specific metrics - I am not really sure if they should be initialized + # independently or we can use the metrics init above... 
# TODO: check it + self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) + self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus) + self.lang_macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus) + self.lang_microK = CustomK(num_classes=output_size, average='micro', device=self.gpus) self.lPretrained_embeddings = nn.ModuleDict() self.lLearnable_embeddings = nn.ModuleDict() @@ -103,22 +112,60 @@ class RecurrentModel(pl.LightningModule): _ly = [] for lang in sorted(lX.keys()): _ly.append(ly[lang]) - ly = torch.cat(_ly, dim=0) - loss = self.loss(logits, ly) + y = torch.cat(_ly, dim=0) + loss = self.loss(logits, y) # Squashing logits through Sigmoid in order to get confidence score predictions = torch.sigmoid(logits) > 0.5 - accuracy = self.accuracy(predictions, ly) - microF1 = self.microF1(predictions, ly) - macroF1 = self.macroF1(predictions, ly) - microK = self.microK(predictions, ly) - macroK = self.macroK(predictions, ly) + microF1 = self.microF1(predictions, y) + macroF1 = self.macroF1(predictions, y) + microK = self.microK(predictions, y) + macroK = self.macroK(predictions, y) self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('train-accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=False, logger=True) self.log('train-macroF1', macroF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) self.log('train-microF1', microF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) self.log('train-macroK', macroK, on_step=True, on_epoch=True, prog_bar=False, logger=True) self.log('train-microK', microK, on_step=True, on_epoch=True, prog_bar=False, logger=True) - return {'loss': loss} + re_lX = self._reconstruct_dict(predictions, ly) + return {'loss': loss, 'pred': re_lX, 'target': ly} + + def _reconstruct_dict(self, X, ly): + reconstructed = {} + _start = 0 + for lang in sorted(ly.keys()): + lang_batchsize = 
len(ly[lang]) + reconstructed[lang] = X[_start:_start+lang_batchsize] + _start += lang_batchsize + return reconstructed + + def training_epoch_end(self, outputs): + # outputs is a list of n dicts, one per training step of the epoch; each dict carries the per-language 'pred' and 'target' dicts returned by training_step. + # here we save epoch level metric values and compute them specifically for each language + res_macroF1 = {lang: [] for lang in self.langs} + res_microF1 = {lang: [] for lang in self.langs} + res_macroK = {lang: [] for lang in self.langs} + res_microK = {lang: [] for lang in self.langs} + for output in outputs: + lX, ly = output['pred'], output['target'] + for lang in lX.keys(): + X, y = lX[lang], ly[lang] + lang_macroF1 = self.lang_macroF1(X, y) + lang_microF1 = self.lang_microF1(X, y) + lang_macroK = self.lang_macroK(X, y) + lang_microK = self.lang_microK(X, y) + + res_macroF1[lang].append(lang_macroF1) + res_microF1[lang].append(lang_microF1) + res_macroK[lang].append(lang_macroK) + res_microK[lang].append(lang_microK) + for lang in self.langs: + avg_macroF1 = torch.mean(torch.Tensor(res_macroF1[lang])) + avg_microF1 = torch.mean(torch.Tensor(res_microF1[lang])) + avg_macroK = torch.mean(torch.Tensor(res_macroK[lang])) + avg_microK = torch.mean(torch.Tensor(res_microK[lang])) + self.logger.experiment.add_scalars('train-langs-macroF1', {f'{lang}': avg_macroF1}, self.current_epoch) + self.logger.experiment.add_scalars('train-langs-microF1', {f'{lang}': avg_microF1}, self.current_epoch) + self.logger.experiment.add_scalars('train-langs-macroK', {f'{lang}': avg_macroK}, self.current_epoch) + self.logger.experiment.add_scalars('train-langs-microK', {f'{lang}': avg_microK}, self.current_epoch) def validation_step(self, val_batch, batch_idx): lX, ly = val_batch @@ -129,13 +176,11 @@ class RecurrentModel(pl.LightningModule): ly = torch.cat(_ly, dim=0) loss = self.loss(logits, ly) predictions = torch.sigmoid(logits) > 0.5 - accuracy = self.accuracy(predictions, ly) microF1 = self.microF1(predictions, 
ly) macroF1 = self.macroF1(predictions, ly) microK = self.microK(predictions, ly) macroK = self.macroK(predictions, ly) self.log('val-loss', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True) - self.log('val-accuracy', accuracy, on_step=False, on_epoch=True, prog_bar=False, logger=True) self.log('val-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) self.log('val-microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) self.log('val-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True) @@ -150,12 +195,10 @@ class RecurrentModel(pl.LightningModule): _ly.append(ly[lang]) ly = torch.cat(_ly, dim=0) predictions = torch.sigmoid(logits) > 0.5 - accuracy = self.accuracy(predictions, ly) microF1 = self.microF1(predictions, ly) macroF1 = self.macroF1(predictions, ly) - self.log('test-accuracy', accuracy, on_step=False, on_epoch=True, prog_bar=False, logger=True) - self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) - self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) + self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=False) + self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=False) return def embed(self, X, lang): diff --git a/refactor/util/common.py b/refactor/util/common.py index d24707a..f5ec1a9 100644 --- a/refactor/util/common.py +++ b/refactor/util/common.py @@ -41,7 +41,7 @@ def _normalize(lX, l2=True): def none_dict(langs): - return {l:None for l in langs} + return {l: None for l in langs} class MultilingualIndex: @@ -62,12 +62,13 @@ class MultilingualIndex: for lang in self.langs: # Init monolingual Index - self.l_index[lang] = Index(l_devel_raw[lang], l_devel_target[lang], l_test_raw[lang], l_test_target[lang], lang) + self.l_index[lang] = Index(l_devel_raw[lang], l_devel_target[lang], l_test_raw[lang], l_test_target[lang], + 
lang) # call to index() function of monolingual Index self.l_index[lang].index(l_pretrained_vocabulary[lang], l_analyzer[lang], l_vocabulary[lang]) def train_val_split(self, val_prop=0.2, max_val=2000, seed=42): - for l,index in self.l_index.items(): + for l, index in self.l_index.items(): index.train_val_split(val_prop, max_val, seed=seed) def embedding_matrices(self, lpretrained, supervised): @@ -97,7 +98,7 @@ class MultilingualIndex: return wordlist def get_raw_lXtr(self): - lXtr_raw = {k:[] for k in self.langs} + lXtr_raw = {k: [] for k in self.langs} lYtr_raw = {k: [] for k in self.langs} for lang in self.langs: lXtr_raw[lang] = self.l_index[lang].train_raw @@ -137,10 +138,10 @@ class MultilingualIndex: return self.l_index[self.langs[0]].devel_target.shape[1] def l_vocabsize(self): - return {l:index.vocabsize for l,index in self.l_index.items()} + return {l: index.vocabsize for l, index in self.l_index.items()} def l_embeddings(self): - return {l:index.embedding_matrix for l,index in self.l_index.items()} + return {l: index.embedding_matrix for l, index in self.l_index.items()} def l_pad(self): return {l: index.pad_index for l, index in self.l_index.items()} @@ -227,8 +228,10 @@ class Index: # index documents and keep track of test terms outside the development vocabulary that are in Muse (if available) self.out_of_vocabulary = dict() - self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary) - self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary) + self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index, + self.out_of_vocabulary) + self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index, + self.out_of_vocabulary) self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary) @@ -248,7 +251,8 @@ class Index: train_test_split( devel, target, devel_raw, 
test_size=val_size, random_state=seed, shuffle=True) - print(f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}') + print( + f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}') def get_word_list(self): def extract_word_list(word2index): @@ -300,7 +304,7 @@ def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary): are not in the original vocab but that are in the known_words :return: """ - indexes=[] + indexes = [] vocabsize = len(vocab) unk_count = 0 knw_count = 0 @@ -315,7 +319,7 @@ def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary): else: if word in known_words: if word not in out_of_vocabulary: - out_of_vocabulary[word] = vocabsize+len(out_of_vocabulary) + out_of_vocabulary[word] = vocabsize + len(out_of_vocabulary) idx = out_of_vocabulary[word] out_count += 1 else: @@ -335,4 +339,3 @@ def is_true(tensor, device): def is_false(tensor, device): return torch.where(tensor == 0, torch.Tensor([1]).to(device), torch.Tensor([0]).to(device)) -