fixed view generators' transform method
This commit is contained in:
parent
bb84422d24
commit
e52b153ad4
23
main.py
23
main.py
|
@ -15,7 +15,7 @@ def main(args):
|
|||
print('Running generalized funnelling...')
|
||||
|
||||
data = MultilingualDataset.load(args.dataset)
|
||||
data.set_view(languages=['it', 'fr'])
|
||||
# data.set_view(languages=['it', 'da'])
|
||||
data.show_dimensions()
|
||||
lX, ly = data.training()
|
||||
lXte, lyte = data.test()
|
||||
|
@ -42,11 +42,14 @@ def main(args):
|
|||
|
||||
if args.gru_embedder:
|
||||
rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=args.gru_wce, batch_size=256,
|
||||
nepochs=args.nepochs_rnn, gpus=args.gpus, n_jobs=args.n_jobs)
|
||||
nepochs=args.nepochs_rnn, patience=args.patience_rnn, gpus=args.gpus,
|
||||
n_jobs=args.n_jobs)
|
||||
embedder_list.append(rnnEmbedder)
|
||||
|
||||
if args.bert_embedder:
|
||||
bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=args.nepochs_bert, gpus=args.gpus, n_jobs=args.n_jobs)
|
||||
bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=args.nepochs_bert, gpus=args.gpus,
|
||||
n_jobs=args.n_jobs)
|
||||
bertEmbedder.transform(lX)
|
||||
embedder_list.append(bertEmbedder)
|
||||
|
||||
# Init DocEmbedderList (i.e., first-tier learners or view generators) and metaclassifier
|
||||
|
@ -137,20 +140,24 @@ if __name__ == '__main__':
|
|||
default=False)
|
||||
|
||||
parser.add_argument('-j', '--n_jobs', dest='n_jobs', type=int, metavar='',
|
||||
help='Number of parallel jobs (default is -1, all)',
|
||||
help='number of parallel jobs (default is -1, all)',
|
||||
default=-1)
|
||||
|
||||
parser.add_argument('--nepochs_rnn', dest='nepochs_rnn', type=int, metavar='',
|
||||
help='Number of max epochs to train Recurrent embedder (i.e., -g), default 150',
|
||||
help='number of max epochs to train Recurrent embedder (i.e., -g), default 150',
|
||||
default=150)
|
||||
|
||||
parser.add_argument('--nepochs_bert', dest='nepochs_bert', type=int, metavar='',
|
||||
help='Number of max epochs to train Bert model (i.e., -g), default 10',
|
||||
help='number of max epochs to train Bert model (i.e., -g), default 10',
|
||||
default=10)
|
||||
|
||||
parser.add_argument('--patience_rnn', dest='patience_rnn', type=int, metavar='',
|
||||
help='set early stop patience for the RecurrentGen, default 50',
|
||||
default=50)
|
||||
|
||||
parser.add_argument('--muse_dir', dest='muse_dir', type=str, metavar='',
|
||||
help='Path to the MUSE polylingual word embeddings (default ../embeddings)',
|
||||
default='../embeddings')
|
||||
help='Path to the MUSE polylingual word embeddings (default embeddings/)',
|
||||
default='embeddings/')
|
||||
|
||||
parser.add_argument('--gru_wce', dest='gru_wce', action='store_true',
|
||||
help='Deploy WCE embedding as embedding layer of the GRU View Generator',
|
||||
|
|
|
@ -37,11 +37,12 @@ optional arguments:
|
|||
-b, --bert_embedder deploy multilingual Bert to compute document embeddings
|
||||
-g, --gru_embedder deploy a GRU in order to compute document embeddings
|
||||
-c, --c_optimize optimize SVMs C hyperparameter
|
||||
-j, --n_jobs number of parallel jobs (default is -1, all)
|
||||
-j, --n_jobs number of parallel jobs, default is -1 i.e., all
|
||||
--nepochs_rnn number of max epochs to train Recurrent embedder (i.e., -g), default 150
|
||||
--nepochs_bert number of max epochs to train Bert model (i.e., -b), default 10
|
||||
--muse_dir path to the MUSE polylingual word embeddings (default embeddings/)
|
||||
--gru_wce deploy WCE embedding as embedding layer of the GRU View Generator
|
||||
--patience_rnn set early stop patience for the RecurrentGen, default 50
|
||||
--gru_dir set the path to a pretrained GRU model (i.e., -g view generator)
|
||||
--bert_dir set the path to a pretrained mBERT model (i.e., -b view generator)
|
||||
--gpus specifies how many GPUs to use per node
|
||||
|
|
10
run.sh
10
run.sh
|
@ -1,6 +1,8 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
for i in {0..10..1}
|
||||
do
|
||||
python main.py --gpus 0
|
||||
done
|
||||
python main.py /home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -g --gpus 0
|
||||
|
||||
#for i in {0..10..1}
|
||||
#do
|
||||
# python main.py --gpus 0
|
||||
#done
|
|
@ -26,10 +26,10 @@ from src.data.datamodule import RecurrentDataModule, BertDataModule, tokenize
|
|||
from src.models.learners import *
|
||||
from src.models.pl_bert import BertModel
|
||||
from src.models.pl_gru import RecurrentModel
|
||||
from src.util.common import TfidfVectorizerMultilingual, _normalize
|
||||
from src.util.common import TfidfVectorizerMultilingual, _normalize, index
|
||||
from src.util.embeddings_manager import MuseLoader, XdotM, wce_matrix
|
||||
from src.util.file import create_if_not_exist
|
||||
# TODO: add model checkpointing and loading from checkpoint + training on validation after convergence is reached
|
||||
# TODO: (1) add model checkpointing and loading from checkpoint + training on validation after convergence is reached
|
||||
|
||||
|
||||
class ViewGen(ABC):
|
||||
|
@ -203,7 +203,7 @@ class RecurrentGen(ViewGen):
|
|||
the network internal state at the second feed-forward layer level. Training metrics are logged via TensorBoard.
|
||||
"""
|
||||
def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, nepochs=50,
|
||||
gpus=0, n_jobs=-1, patience=5, stored_path=None):
|
||||
gpus=0, n_jobs=-1, patience=20, stored_path=None):
|
||||
"""
|
||||
Init RecurrentGen.
|
||||
:param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents
|
||||
|
@ -237,8 +237,7 @@ class RecurrentGen(ViewGen):
|
|||
self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1)
|
||||
self.multilingualIndex.embedding_matrices(self.pretrained, supervised=self.wce)
|
||||
self.model = self._init_model()
|
||||
self.logger = TensorBoardLogger(save_dir='../tb_logs', name='rnn', default_hp_metric=False)
|
||||
# self.logger = CSVLogger(save_dir='csv_logs', name='rnn_dev')
|
||||
self.logger = TensorBoardLogger(save_dir='tb_logs', name='rnn', default_hp_metric=False)
|
||||
self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00,
|
||||
patience=self.patience, verbose=False, mode='max')
|
||||
|
||||
|
@ -297,14 +296,19 @@ class RecurrentGen(ViewGen):
|
|||
:param lX: dict {lang: indexed documents}
|
||||
:return: documents projected to the common latent space.
|
||||
"""
|
||||
data = {}
|
||||
for lang in lX.keys():
|
||||
indexed = index(data=lX[lang],
|
||||
vocab=self.multilingualIndex.l_index[lang].word2index,
|
||||
known_words=set(self.multilingualIndex.l_index[lang].word2index.keys()),
|
||||
analyzer=self.multilingualIndex.l_vectorizer.get_analyzer(lang),
|
||||
unk_index=self.multilingualIndex.l_index[lang].unk_index,
|
||||
out_of_vocabulary=self.multilingualIndex.l_index[lang].out_of_vocabulary)
|
||||
data[lang] = indexed
|
||||
l_pad = self.multilingualIndex.l_pad()
|
||||
data = self.multilingualIndex.l_devel_index()
|
||||
self.model.to('cuda' if self.gpus else 'cpu')
|
||||
self.model.eval()
|
||||
# time_init = time.time()
|
||||
l_embeds = self.model.encode(data, l_pad, batch_size=256)
|
||||
# transform_time = round(time.time() - time_init, 3)
|
||||
# print(f'Executed! Transform took: {transform_time}')
|
||||
return l_embeds
|
||||
|
||||
def fit_transform(self, lX, ly):
|
||||
|
@ -338,7 +342,7 @@ class BertGen(ViewGen):
|
|||
self.stored_path = stored_path
|
||||
self.model = self._init_model()
|
||||
self.patience = patience
|
||||
self.logger = TensorBoardLogger(save_dir='../tb_logs', name='bert', default_hp_metric=False)
|
||||
self.logger = TensorBoardLogger(save_dir='tb_logs', name='bert', default_hp_metric=False)
|
||||
self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00,
|
||||
patience=self.patience, verbose=False, mode='max')
|
||||
|
||||
|
@ -371,14 +375,10 @@ class BertGen(ViewGen):
|
|||
:param lX: dict {lang: indexed documents}
|
||||
:return: documents projected to the common latent space.
|
||||
"""
|
||||
data = self.multilingualIndex.l_devel_raw_index()
|
||||
data = tokenize(data, max_len=512)
|
||||
data = tokenize(lX, max_len=512)
|
||||
self.model.to('cuda' if self.gpus else 'cpu')
|
||||
self.model.eval()
|
||||
# time_init = time.time()
|
||||
l_embeds = self.model.encode(data, batch_size=64)
|
||||
# transform_time = round(time.time() - time_init, 3)
|
||||
# print(f'Executed! Transform took: {transform_time}')
|
||||
return l_embeds
|
||||
|
||||
def fit_transform(self, lX, ly):
|
||||
|
|
Loading…
Reference in New Issue