Implemented metrics logging
parent 5ce1203942
commit 472b64ee0e
@@ -108,6 +108,7 @@ class RecurrentDataModule(pl.LightningDataModule):
            # Debug settings: reducing number of samples
            # l_train_index = {l: train[:50] for l, train in l_train_index.items()}
            # l_train_target = {l: target[:50] for l, target in l_train_target.items()}

            self.training_dataset = RecurrentDataset(l_train_index, l_train_target,
                                                     lPad_index=self.multilingualIndex.l_pad())

@@ -115,6 +116,7 @@ class RecurrentDataModule(pl.LightningDataModule):
            # Debug settings: reducing number of samples
            # l_val_index = {l: train[:50] for l, train in l_val_index.items()}
            # l_val_target = {l: target[:50] for l, target in l_val_target.items()}

            self.val_dataset = RecurrentDataset(l_val_index, l_val_target,
                                                lPad_index=self.multilingualIndex.l_pad())
        if stage == 'test' or stage is None:

@@ -146,6 +148,7 @@ class BertDataModule(RecurrentDataModule):
            # Debug settings: reducing number of samples
            # l_train_raw = {l: train[:50] for l, train in l_train_raw.items()}
            # l_train_target = {l: target[:50] for l, target in l_train_target.items()}

            l_train_index = self.tokenize(l_train_raw, max_len=self.max_len)
            self.training_dataset = RecurrentDataset(l_train_index, l_train_target,
                                                     lPad_index=self.multilingualIndex.l_pad())

@@ -154,6 +157,7 @@ class BertDataModule(RecurrentDataModule):
            # Debug settings: reducing number of samples
            # l_val_raw = {l: train[:50] for l, train in l_val_raw.items()}
            # l_val_target = {l: target[:50] for l, target in l_val_target.items()}

            l_val_index = self.tokenize(l_val_raw, max_len=self.max_len)
            self.val_dataset = RecurrentDataset(l_val_index, l_val_target,
                                                lPad_index=self.multilingualIndex.l_pad())

@@ -15,7 +15,7 @@ def main(args):
    _DATASET = '/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle'
    EMBEDDINGS_PATH = '/home/andreapdr/gfun/embeddings'
    data = MultilingualDataset.load(_DATASET)
-    data.set_view(languages=['it'], categories=[0, 1])
+    # data.set_view(languages=['it', 'fr'])
    lX, ly = data.training()
    lXte, lyte = data.test()

@@ -28,8 +28,8 @@ def main(args):
    # gFun = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=N_JOBS)
    # gFun = MuseGen(muse_dir='/home/andreapdr/funneling_pdr/embeddings', n_jobs=N_JOBS)
    # gFun = WordClassGen(n_jobs=N_JOBS)
-    gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=True, batch_size=128,
-                        nepochs=100, gpus=args.gpus, n_jobs=N_JOBS)
+    gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, batch_size=128,
+                        nepochs=50, gpus=args.gpus, n_jobs=N_JOBS)
    # gFun = BertGen(multilingualIndex, batch_size=4, nepochs=10, gpus=args.gpus, n_jobs=N_JOBS)

    gFun.fit(lX, ly)

@@ -6,21 +6,25 @@ from torch.autograd import Variable
from torch.optim.lr_scheduler import StepLR
from transformers import AdamW
import pytorch_lightning as pl
-from pytorch_lightning.metrics import Accuracy
from models.helpers import init_embeddings
from util.pl_metrics import CustomF1, CustomK
-from util.evaluation import evaluate

-# TODO: it should also be possible to compute metrics independently for each language!


class RecurrentModel(pl.LightningModule):
-    """
-    Check out for logging insight https://www.learnopencv.com/tensorboard-with-pytorch-lightning/
-    """

    def __init__(self, lPretrained, langs, output_size, hidden_size, lVocab_size, learnable_length,
                 drop_embedding_range, drop_embedding_prop, gpus=None):
+        """
+
+        :param lPretrained:
+        :param langs:
+        :param output_size:
+        :param hidden_size:
+        :param lVocab_size:
+        :param learnable_length:
+        :param drop_embedding_range:
+        :param drop_embedding_prop:
+        :param gpus:
+        """
        super().__init__()
        self.gpus = gpus
        self.langs = langs

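util/pl_metrics.py itself is not touched by this commit, so the CustomF1 and CustomK implementations are not shown here. As a rough, hypothetical sketch of the pattern those imports rely on (names and internals are assumptions, not the repository's code), a stateful metric in this version of PyTorch Lightning subclasses pytorch_lightning.metrics.Metric, accumulates counts in update() and derives the score in compute():

import torch
from pytorch_lightning.metrics import Metric

class SketchMicroF1(Metric):
    # Hypothetical micro-averaged F1 over multi-label predictions; it only
    # illustrates the update()/compute() contract that CustomF1/CustomK presumably follow.
    def __init__(self):
        super().__init__()
        # registered states are accumulated over steps and reduced across processes by Lightning
        self.add_state('tp', default=torch.tensor(0.), dist_reduce_fx='sum')
        self.add_state('fp', default=torch.tensor(0.), dist_reduce_fx='sum')
        self.add_state('fn', default=torch.tensor(0.), dist_reduce_fx='sum')

    def update(self, preds, target):
        preds, target = preds.bool(), target.bool()
        self.tp += (preds & target).sum().float()
        self.fp += (preds & ~target).sum().float()
        self.fn += (~preds & target).sum().float()

    def compute(self):
        denom = 2 * self.tp + self.fp + self.fn
        return 2 * self.tp / denom if denom > 0 else torch.tensor(1.)
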
@@ -32,11 +36,16 @@ class RecurrentModel(pl.LightningModule):
        self.drop_embedding_prop = drop_embedding_prop
        self.loss = torch.nn.BCEWithLogitsLoss()

-        self.accuracy = Accuracy()
        self.microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
        self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
        self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus)
        self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)
+        # Language specific metrics - I am not really sure if they should be initialized
+        # independently or we can use the metrics init above... # TODO: check it
+        self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
+        self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
+        self.lang_macroK = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
+        self.lang_microK = CustomF1(num_classes=output_size, average='micro', device=self.gpus)

        self.lPretrained_embeddings = nn.ModuleDict()
        self.lLearnable_embeddings = nn.ModuleDict()

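Note that the two new language-specific K metrics above are instantiated as CustomF1; given the pattern of the shared metrics, CustomK is presumably what was intended. The added TODO also asks whether per-language scores can reuse the shared metric objects: since these Lightning metrics are stateful (they accumulate across calls), one way to keep each language's accumulation separate is a ModuleDict with one instance per language. A minimal sketch, assuming CustomK for the K scores (hypothetical, not the committed code):

        # inside __init__: one independent, stateful metric per language
        self.lang_metrics = nn.ModuleDict({
            lang: nn.ModuleDict({
                'microF1': CustomF1(num_classes=output_size, average='micro', device=self.gpus),
                'macroF1': CustomF1(num_classes=output_size, average='macro', device=self.gpus),
                'microK': CustomK(num_classes=output_size, average='micro', device=self.gpus),
                'macroK': CustomK(num_classes=output_size, average='macro', device=self.gpus),
            }) for lang in langs
        })
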
@@ -103,22 +112,60 @@ class RecurrentModel(pl.LightningModule):
        _ly = []
        for lang in sorted(lX.keys()):
            _ly.append(ly[lang])
-        ly = torch.cat(_ly, dim=0)
-        loss = self.loss(logits, ly)
+        y = torch.cat(_ly, dim=0)
+        loss = self.loss(logits, y)
        # Squashing logits through Sigmoid in order to get confidence score
        predictions = torch.sigmoid(logits) > 0.5
-        accuracy = self.accuracy(predictions, ly)
-        microF1 = self.microF1(predictions, ly)
-        macroF1 = self.macroF1(predictions, ly)
-        microK = self.microK(predictions, ly)
-        macroK = self.macroK(predictions, ly)
+        microF1 = self.microF1(predictions, y)
+        macroF1 = self.macroF1(predictions, y)
+        microK = self.microK(predictions, y)
+        macroK = self.macroK(predictions, y)
        self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True)
-        self.log('train-accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=False, logger=True)
        self.log('train-macroF1', macroF1, on_step=True, on_epoch=True, prog_bar=False, logger=True)
        self.log('train-microF1', microF1, on_step=True, on_epoch=True, prog_bar=False, logger=True)
        self.log('train-macroK', macroK, on_step=True, on_epoch=True, prog_bar=False, logger=True)
        self.log('train-microK', microK, on_step=True, on_epoch=True, prog_bar=False, logger=True)
-        return {'loss': loss}
+        re_lX = self._reconstruct_dict(predictions, ly)
+        return {'loss': loss, 'pred': re_lX, 'target': ly}
+
+    def _reconstruct_dict(self, X, ly):
+        reconstructed = {}
+        _start = 0
+        for lang in sorted(ly.keys()):
+            lang_batchsize = len(ly[lang])
+            reconstructed[lang] = X[_start:_start + lang_batchsize]
+            _start += lang_batchsize
+        return reconstructed
+
+    def training_epoch_end(self, outputs):
+        # outputs is a list of n dicts, where n is equal to the number of epoch steps and each dict holds one batch.
+        # here we save epoch level metric values and compute them specifically for each language
+        res_macroF1 = {lang: [] for lang in self.langs}
+        res_microF1 = {lang: [] for lang in self.langs}
+        res_macroK = {lang: [] for lang in self.langs}
+        res_microK = {lang: [] for lang in self.langs}
+        for output in outputs:
+            lX, ly = output['pred'], output['target']
+            for lang in lX.keys():
+                X, y = lX[lang], ly[lang]
+                lang_macroF1 = self.lang_macroF1(X, y)
+                lang_microF1 = self.lang_microF1(X, y)
+                lang_macroK = self.lang_macroK(X, y)
+                lang_microK = self.lang_microK(X, y)
+
+                res_macroF1[lang].append(lang_macroF1)
+                res_microF1[lang].append(lang_microF1)
+                res_macroK[lang].append(lang_macroK)
+                res_microK[lang].append(lang_microK)
+        for lang in self.langs:
+            avg_macroF1 = torch.mean(torch.Tensor(res_macroF1[lang]))
+            avg_microF1 = torch.mean(torch.Tensor(res_microF1[lang]))
+            avg_macroK = torch.mean(torch.Tensor(res_macroK[lang]))
+            avg_microK = torch.mean(torch.Tensor(res_microK[lang]))
+            self.logger.experiment.add_scalars('train-langs-macroF1', {f'{lang}': avg_macroF1}, self.current_epoch)
+            self.logger.experiment.add_scalars('train-langs-microF1', {f'{lang}': avg_microF1}, self.current_epoch)
+            self.logger.experiment.add_scalars('train-langs-macroK', {f'{lang}': avg_macroK}, self.current_epoch)
+            self.logger.experiment.add_scalars('train-langs-microK', {f'{lang}': avg_microK}, self.current_epoch)

    def validation_step(self, val_batch, batch_idx):
        lX, ly = val_batch

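_reconstruct_dict undoes the concatenation done at the top of training_step: predictions are produced in sorted-language order, so slicing by each language's batch size recovers the per-language dict that training_epoch_end scores independently. A small illustration with made-up shapes (two hypothetical languages with batches of 3 and 2 documents):

import torch

ly = {'en': torch.zeros(3, 4), 'it': torch.ones(2, 4)}   # per-language targets (hypothetical)
predictions = torch.rand(5, 4) > 0.5                      # concatenated in sorted(ly.keys()) order
start, rebuilt = 0, {}
for lang in sorted(ly.keys()):                            # mirrors _reconstruct_dict
    size = len(ly[lang])
    rebuilt[lang] = predictions[start:start + size]
    start += size
# rebuilt == {'en': predictions[0:3], 'it': predictions[3:5]}
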
@@ -129,13 +176,11 @@ class RecurrentModel(pl.LightningModule):
        ly = torch.cat(_ly, dim=0)
        loss = self.loss(logits, ly)
        predictions = torch.sigmoid(logits) > 0.5
-        accuracy = self.accuracy(predictions, ly)
        microF1 = self.microF1(predictions, ly)
        macroF1 = self.macroF1(predictions, ly)
        microK = self.microK(predictions, ly)
        macroK = self.macroK(predictions, ly)
        self.log('val-loss', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
-        self.log('val-accuracy', accuracy, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        self.log('val-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log('val-microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log('val-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True)

@@ -150,12 +195,10 @@ class RecurrentModel(pl.LightningModule):
            _ly.append(ly[lang])
        ly = torch.cat(_ly, dim=0)
        predictions = torch.sigmoid(logits) > 0.5
-        accuracy = self.accuracy(predictions, ly)
        microF1 = self.microF1(predictions, ly)
        macroF1 = self.macroF1(predictions, ly)
-        self.log('test-accuracy', accuracy, on_step=False, on_epoch=True, prog_bar=False, logger=True)
-        self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True)
-        self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True)
+        self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=False)
+        self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=False)
        return

    def embed(self, X, lang):

|
|
@ -41,7 +41,7 @@ def _normalize(lX, l2=True):
|
||||||
|
|
||||||
|
|
||||||
def none_dict(langs):
|
def none_dict(langs):
|
||||||
return {l:None for l in langs}
|
return {l: None for l in langs}
|
||||||
|
|
||||||
|
|
||||||
class MultilingualIndex:
|
class MultilingualIndex:
|
||||||
|
|
@@ -62,12 +62,13 @@ class MultilingualIndex:

        for lang in self.langs:
            # Init monolingual Index
-            self.l_index[lang] = Index(l_devel_raw[lang], l_devel_target[lang], l_test_raw[lang], l_test_target[lang], lang)
+            self.l_index[lang] = Index(l_devel_raw[lang], l_devel_target[lang], l_test_raw[lang], l_test_target[lang],
+                                       lang)
            # call to index() function of monolingual Index
            self.l_index[lang].index(l_pretrained_vocabulary[lang], l_analyzer[lang], l_vocabulary[lang])

    def train_val_split(self, val_prop=0.2, max_val=2000, seed=42):
-        for l,index in self.l_index.items():
+        for l, index in self.l_index.items():
            index.train_val_split(val_prop, max_val, seed=seed)

    def embedding_matrices(self, lpretrained, supervised):

@@ -97,7 +98,7 @@ class MultilingualIndex:
        return wordlist

    def get_raw_lXtr(self):
-        lXtr_raw = {k:[] for k in self.langs}
+        lXtr_raw = {k: [] for k in self.langs}
        lYtr_raw = {k: [] for k in self.langs}
        for lang in self.langs:
            lXtr_raw[lang] = self.l_index[lang].train_raw

@@ -137,10 +138,10 @@ class MultilingualIndex:
        return self.l_index[self.langs[0]].devel_target.shape[1]

    def l_vocabsize(self):
-        return {l:index.vocabsize for l,index in self.l_index.items()}
+        return {l: index.vocabsize for l, index in self.l_index.items()}

    def l_embeddings(self):
-        return {l:index.embedding_matrix for l,index in self.l_index.items()}
+        return {l: index.embedding_matrix for l, index in self.l_index.items()}

    def l_pad(self):
        return {l: index.pad_index for l, index in self.l_index.items()}

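These accessors all return a dict keyed by language. l_pad(), for example, is what the data modules in the first hunks pass to RecurrentDataset as lPad_index, so each language keeps its own padding id. A sketch of the expected shapes, with hypothetical values:

lpad = multilingualIndex.l_pad()                  # e.g. {'en': 0, 'it': 0}
dataset = RecurrentDataset(l_train_index, l_train_target, lPad_index=lpad)
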
@@ -227,8 +228,10 @@ class Index:

        # index documents and keep track of test terms outside the development vocabulary that are in Muse (if available)
        self.out_of_vocabulary = dict()
-        self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary)
-        self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary)
+        self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index,
+                                 self.out_of_vocabulary)
+        self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index,
+                                 self.out_of_vocabulary)

        self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary)

@@ -248,7 +251,8 @@ class Index:
            train_test_split(
                devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True)

-        print(f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}')
+        print(
+            f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}')

    def get_word_list(self):
        def extract_word_list(word2index):

@@ -300,7 +304,7 @@ def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
    are not in the original vocab but that are in the known_words
    :return:
    """
-    indexes=[]
+    indexes = []
    vocabsize = len(vocab)
    unk_count = 0
    knw_count = 0

@@ -315,7 +319,7 @@ def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
            else:
                if word in known_words:
                    if word not in out_of_vocabulary:
-                        out_of_vocabulary[word] = vocabsize+len(out_of_vocabulary)
+                        out_of_vocabulary[word] = vocabsize + len(out_of_vocabulary)
                    idx = out_of_vocabulary[word]
                    out_count += 1
                else:

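The reformatted line above is where words missing from the development vocabulary but present in known_words (e.g. in the MUSE embeddings) get their ids: they are appended after the original vocabulary, which is why Index sets vocabsize = len(word2index) + len(out_of_vocabulary) in the earlier hunk. A tiny illustration with made-up words:

vocab = {'the': 0, 'cat': 1}                 # development vocabulary, vocabsize = 2
out_of_vocabulary = {}
for word in ['dog', 'fox', 'dog']:           # known words seen only outside the devel vocab
    if word not in out_of_vocabulary:
        out_of_vocabulary[word] = len(vocab) + len(out_of_vocabulary)
# out_of_vocabulary == {'dog': 2, 'fox': 3}; the effective vocabulary size becomes 4
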
@@ -335,4 +339,3 @@ def is_true(tensor, device):

def is_false(tensor, device):
    return torch.where(tensor == 0, torch.Tensor([1]).to(device), torch.Tensor([0]).to(device))
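is_false turns a 0/1 tensor into its complement on the requested device, and is_true (whose body sits just above this hunk) presumably mirrors it for the positive case. Indicator masks like these are the usual building blocks of precision/recall-style counts. A quick check on CPU, with hypothetical values and 'cpu' as the device:

t = torch.tensor([1., 0., 1.])
is_false(t, 'cpu')                            # tensor([0., 1., 0.])
# combined with its counterpart, such masks yield confusion-matrix style counts,
# e.g. false negatives: (is_false(pred, dev) * is_true(target, dev)).sum()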