fixed view generators' transform method

andrea 2021-01-28 18:12:20 +01:00
parent bb84422d24
commit e52b153ad4
4 changed files with 38 additions and 28 deletions

main.py

@@ -15,7 +15,7 @@ def main(args):
print('Running generalized funnelling...')
data = MultilingualDataset.load(args.dataset)
data.set_view(languages=['it', 'fr'])
# data.set_view(languages=['it', 'da'])
data.show_dimensions()
lX, ly = data.training()
lXte, lyte = data.test()
@@ -42,11 +42,14 @@ def main(args):
if args.gru_embedder:
rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=args.gru_wce, batch_size=256,
nepochs=args.nepochs_rnn, gpus=args.gpus, n_jobs=args.n_jobs)
nepochs=args.nepochs_rnn, patience=args.patience_rnn, gpus=args.gpus,
n_jobs=args.n_jobs)
embedder_list.append(rnnEmbedder)
if args.bert_embedder:
bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=args.nepochs_bert, gpus=args.gpus, n_jobs=args.n_jobs)
bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=args.nepochs_bert, gpus=args.gpus,
n_jobs=args.n_jobs)
bertEmbedder.transform(lX)
embedder_list.append(bertEmbedder)
# Init DocEmbedderList (i.e., first-tier learners or view generators) and metaclassifier
@@ -137,20 +140,24 @@ if __name__ == '__main__':
default=False)
parser.add_argument('-j', '--n_jobs', dest='n_jobs', type=int, metavar='',
help='Number of parallel jobs (default is -1, all)',
help='number of parallel jobs (default is -1, all)',
default=-1)
parser.add_argument('--nepochs_rnn', dest='nepochs_rnn', type=int, metavar='',
help='Number of max epochs to train Recurrent embedder (i.e., -g), default 150',
help='number of max epochs to train Recurrent embedder (i.e., -g), default 150',
default=150)
parser.add_argument('--nepochs_bert', dest='nepochs_bert', type=int, metavar='',
help='Number of max epochs to train Bert model (i.e., -g), default 10',
help='number of max epochs to train Bert model (i.e., -g), default 10',
default=10)
parser.add_argument('--patience_rnn', dest='patience_rnn', type=int, metavar='',
help='set early stop patience for the RecurrentGen, default 50',
default=50)
parser.add_argument('--muse_dir', dest='muse_dir', type=str, metavar='',
help='Path to the MUSE polylingual word embeddings (default ../embeddings)',
default='../embeddings')
help='Path to the MUSE polylingual word embeddings (default embeddings/)',
default='embeddings/')
parser.add_argument('--gru_wce', dest='gru_wce', action='store_true',
help='Deploy WCE embedding as embedding layer of the GRU View Generator',
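The net effect of the main.py changes above is that the recurrent view generator's early-stopping patience is now configurable from the command line and forwarded to RecurrentGen. A minimal sketch of the assumed wiring, with RecurrentGen replaced by a stub so the snippet runs on its own (the argument names are taken from this diff; everything else is illustrative):

import argparse

# Stub standing in for the repository's RecurrentGen; it only shows where the new
# patience value is assumed to end up (the EarlyStopping callback's patience).
class RecurrentGenStub:
    def __init__(self, nepochs=150, patience=50):
        self.nepochs = nepochs
        self.patience = patience  # forwarded to EarlyStopping(patience=...)

parser = argparse.ArgumentParser()
parser.add_argument('--nepochs_rnn', type=int, default=150)
parser.add_argument('--patience_rnn', type=int, default=50)
args = parser.parse_args(['--patience_rnn', '25'])

rnn_gen = RecurrentGenStub(nepochs=args.nepochs_rnn, patience=args.patience_rnn)
print(rnn_gen.nepochs, rnn_gen.patience)  # -> 150 25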


@@ -37,11 +37,12 @@ optional arguments:
-b, --bert_embedder deploy multilingual Bert to compute document embeddings
-g, --gru_embedder deploy a GRU in order to compute document embeddings
-c, --c_optimize optimize SVMs C hyperparameter
-j, --n_jobs number of parallel jobs (default is -1, all)
-j, --n_jobs number of parallel jobs, default is -1 i.e., all
--nepochs_rnn number of max epochs to train Recurrent embedder (i.e., -g), default 150
--nepochs_bert number of max epochs to train Bert model (i.e., -g), default 10
--muse_dir path to the MUSE polylingual word embeddings (default ../embeddings)
--gru_wce deploy WCE embedding as embedding layer of the GRU View Generator
--patience_rnn set early stop patience for the RecurrentGen, default 50
--gru_dir set the path to a pretrained GRU model (i.e., -g view generator)
--bert_dir set the path to a pretrained mBERT model (i.e., -b view generator)
--gpus specifies how many GPUs to use per node

run.sh

@@ -1,6 +1,8 @@
#!/usr/bin/env bash
for i in {0..10..1}
do
python main.py --gpus 0
done
python main.py /home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -g --gpus 0
#for i in {0..10..1}
#do
# python main.py --gpus 0
#done


@@ -26,10 +26,10 @@ from src.data.datamodule import RecurrentDataModule, BertDataModule, tokenize
from src.models.learners import *
from src.models.pl_bert import BertModel
from src.models.pl_gru import RecurrentModel
from src.util.common import TfidfVectorizerMultilingual, _normalize
from src.util.common import TfidfVectorizerMultilingual, _normalize, index
from src.util.embeddings_manager import MuseLoader, XdotM, wce_matrix
from src.util.file import create_if_not_exist
# TODO: add model checkpointing and loading from checkpoint + training on validation after convergence is reached
# TODO: (1) add model checkpointing and loading from checkpoint + training on validation after convergence is reached
class ViewGen(ABC):
@@ -203,7 +203,7 @@ class RecurrentGen(ViewGen):
the network internal state at the second feed-forward layer level. Training metrics are logged via TensorBoard.
"""
def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, nepochs=50,
gpus=0, n_jobs=-1, patience=5, stored_path=None):
gpus=0, n_jobs=-1, patience=20, stored_path=None):
"""
Init RecurrentGen.
:param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents
@@ -237,8 +237,7 @@ class RecurrentGen(ViewGen):
self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1)
self.multilingualIndex.embedding_matrices(self.pretrained, supervised=self.wce)
self.model = self._init_model()
self.logger = TensorBoardLogger(save_dir='../tb_logs', name='rnn', default_hp_metric=False)
# self.logger = CSVLogger(save_dir='csv_logs', name='rnn_dev')
self.logger = TensorBoardLogger(save_dir='tb_logs', name='rnn', default_hp_metric=False)
self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00,
patience=self.patience, verbose=False, mode='max')
@@ -297,14 +296,19 @@ class RecurrentGen(ViewGen):
:param lX: dict {lang: indexed documents}
:return: documents projected to the common latent space.
"""
data = {}
for lang in lX.keys():
indexed = index(data=lX[lang],
vocab=self.multilingualIndex.l_index[lang].word2index,
known_words=set(self.multilingualIndex.l_index[lang].word2index.keys()),
analyzer=self.multilingualIndex.l_vectorizer.get_analyzer(lang),
unk_index=self.multilingualIndex.l_index[lang].unk_index,
out_of_vocabulary=self.multilingualIndex.l_index[lang].out_of_vocabulary)
data[lang] = indexed
l_pad = self.multilingualIndex.l_pad()
data = self.multilingualIndex.l_devel_index()
self.model.to('cuda' if self.gpus else 'cpu')
self.model.eval()
# time_init = time.time()
l_embeds = self.model.encode(data, l_pad, batch_size=256)
# transform_time = round(time.time() - time_init, 3)
# print(f'Executed! Transform took: {transform_time}')
return l_embeds
def fit_transform(self, lX, ly):
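The hunk above is the core of the fix: RecurrentGen.transform now indexes the incoming raw documents per language instead of re-using the development set cached in the MultilingualIndex. A minimal, self-contained sketch of what that per-language indexing step is assumed to do (names here are illustrative, not the repository's actual index helper):

from typing import Dict, List

def index_documents(docs: List[str], word2index: Dict[str, int], unk_index: int) -> List[List[int]]:
    # Tokenize each raw document and map tokens to vocabulary ids,
    # routing out-of-vocabulary words to the UNK index.
    indexed = []
    for doc in docs:
        tokens = doc.lower().split()  # stand-in for the vectorizer's analyzer used above
        indexed.append([word2index.get(tok, unk_index) for tok in tokens])
    return indexed

word2index = {'funnelling': 0, 'generalized': 1}
print(index_documents(['Generalized funnelling demo'], word2index, unk_index=2))
# -> [[1, 0, 2]]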
@@ -338,7 +342,7 @@ class BertGen(ViewGen):
self.stored_path = stored_path
self.model = self._init_model()
self.patience = patience
self.logger = TensorBoardLogger(save_dir='../tb_logs', name='bert', default_hp_metric=False)
self.logger = TensorBoardLogger(save_dir='tb_logs', name='bert', default_hp_metric=False)
self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00,
patience=self.patience, verbose=False, mode='max')
@@ -371,14 +375,10 @@ class BertGen(ViewGen):
:param lX: dict {lang: indexed documents}
:return: documents projected to the common latent space.
"""
data = self.multilingualIndex.l_devel_raw_index()
data = tokenize(data, max_len=512)
data = tokenize(lX, max_len=512)
self.model.to('cuda' if self.gpus else 'cpu')
self.model.eval()
# time_init = time.time()
l_embeds = self.model.encode(data, batch_size=64)
# transform_time = round(time.time() - time_init, 3)
# print(f'Executed! Transform took: {transform_time}')
return l_embeds
def fit_transform(self, lX, ly):
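With the change above, BertGen.transform likewise tokenizes the lX it receives instead of re-reading the development documents from the index, so both view generators can now project unseen documents. A hedged illustration of that tokenization step, using the Hugging Face multilingual BERT tokenizer as a stand-in for the repository's tokenize() helper (whose internals are not shown in this diff):

from transformers import AutoTokenizer

# Illustrative only: mimics tokenize(lX, max_len=512) on a raw {lang: documents} dict.
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
lX_new = {
    'it': ['questo è un documento di prova'],
    'fr': ['ceci est un document de test'],
}
l_tokens = {
    lang: tokenizer(docs, truncation=True, max_length=512, padding=True, return_tensors='pt')
    for lang, docs in lX_new.items()
}
print({lang: batch['input_ids'].shape for lang, batch in l_tokens.items()})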