fixed view generators' transform method
This commit is contained in:
parent
bb84422d24
commit
e52b153ad4
23
main.py
23
main.py
|
@ -15,7 +15,7 @@ def main(args):
|
||||||
print('Running generalized funnelling...')
|
print('Running generalized funnelling...')
|
||||||
|
|
||||||
data = MultilingualDataset.load(args.dataset)
|
data = MultilingualDataset.load(args.dataset)
|
||||||
data.set_view(languages=['it', 'fr'])
|
# data.set_view(languages=['it', 'da'])
|
||||||
data.show_dimensions()
|
data.show_dimensions()
|
||||||
lX, ly = data.training()
|
lX, ly = data.training()
|
||||||
lXte, lyte = data.test()
|
lXte, lyte = data.test()
|
||||||
|
@ -42,11 +42,14 @@ def main(args):
|
||||||
|
|
||||||
if args.gru_embedder:
|
if args.gru_embedder:
|
||||||
rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=args.gru_wce, batch_size=256,
|
rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=args.gru_wce, batch_size=256,
|
||||||
nepochs=args.nepochs_rnn, gpus=args.gpus, n_jobs=args.n_jobs)
|
nepochs=args.nepochs_rnn, patience=args.patience_rnn, gpus=args.gpus,
|
||||||
|
n_jobs=args.n_jobs)
|
||||||
embedder_list.append(rnnEmbedder)
|
embedder_list.append(rnnEmbedder)
|
||||||
|
|
||||||
if args.bert_embedder:
|
if args.bert_embedder:
|
||||||
bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=args.nepochs_bert, gpus=args.gpus, n_jobs=args.n_jobs)
|
bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=args.nepochs_bert, gpus=args.gpus,
|
||||||
|
n_jobs=args.n_jobs)
|
||||||
|
bertEmbedder.transform(lX)
|
||||||
embedder_list.append(bertEmbedder)
|
embedder_list.append(bertEmbedder)
|
||||||
|
|
||||||
# Init DocEmbedderList (i.e., first-tier learners or view generators) and metaclassifier
|
# Init DocEmbedderList (i.e., first-tier learners or view generators) and metaclassifier
|
||||||
|
@ -137,20 +140,24 @@ if __name__ == '__main__':
|
||||||
default=False)
|
default=False)
|
||||||
|
|
||||||
parser.add_argument('-j', '--n_jobs', dest='n_jobs', type=int, metavar='',
|
parser.add_argument('-j', '--n_jobs', dest='n_jobs', type=int, metavar='',
|
||||||
help='Number of parallel jobs (default is -1, all)',
|
help='number of parallel jobs (default is -1, all)',
|
||||||
default=-1)
|
default=-1)
|
||||||
|
|
||||||
parser.add_argument('--nepochs_rnn', dest='nepochs_rnn', type=int, metavar='',
|
parser.add_argument('--nepochs_rnn', dest='nepochs_rnn', type=int, metavar='',
|
||||||
help='Number of max epochs to train Recurrent embedder (i.e., -g), default 150',
|
help='number of max epochs to train Recurrent embedder (i.e., -g), default 150',
|
||||||
default=150)
|
default=150)
|
||||||
|
|
||||||
parser.add_argument('--nepochs_bert', dest='nepochs_bert', type=int, metavar='',
|
parser.add_argument('--nepochs_bert', dest='nepochs_bert', type=int, metavar='',
|
||||||
help='Number of max epochs to train Bert model (i.e., -g), default 10',
|
help='number of max epochs to train Bert model (i.e., -g), default 10',
|
||||||
default=10)
|
default=10)
|
||||||
|
|
||||||
|
parser.add_argument('--patience_rnn', dest='patience_rnn', type=int, metavar='',
|
||||||
|
help='set early stop patience for the RecurrentGen, default 50',
|
||||||
|
default=50)
|
||||||
|
|
||||||
parser.add_argument('--muse_dir', dest='muse_dir', type=str, metavar='',
|
parser.add_argument('--muse_dir', dest='muse_dir', type=str, metavar='',
|
||||||
help='Path to the MUSE polylingual word embeddings (default ../embeddings)',
|
help='Path to the MUSE polylingual word embeddings (default embeddings/)',
|
||||||
default='../embeddings')
|
default='embeddings/')
|
||||||
|
|
||||||
parser.add_argument('--gru_wce', dest='gru_wce', action='store_true',
|
parser.add_argument('--gru_wce', dest='gru_wce', action='store_true',
|
||||||
help='Deploy WCE embedding as embedding layer of the GRU View Generator',
|
help='Deploy WCE embedding as embedding layer of the GRU View Generator',
|
||||||
|
|
|
@ -37,11 +37,12 @@ optional arguments:
|
||||||
-b, --bert_embedder deploy multilingual Bert to compute document embeddings
|
-b, --bert_embedder deploy multilingual Bert to compute document embeddings
|
||||||
-g, --gru_embedder deploy a GRU in order to compute document embeddings
|
-g, --gru_embedder deploy a GRU in order to compute document embeddings
|
||||||
-c, --c_optimize optimize SVMs C hyperparameter
|
-c, --c_optimize optimize SVMs C hyperparameter
|
||||||
-j, --n_jobs number of parallel jobs (default is -1, all)
|
-j, --n_jobs number of parallel jobs, default is -1 i.e., all
|
||||||
--nepochs_rnn number of max epochs to train Recurrent embedder (i.e., -g), default 150
|
--nepochs_rnn number of max epochs to train Recurrent embedder (i.e., -g), default 150
|
||||||
--nepochs_bert number of max epochs to train Bert model (i.e., -b), default 10
|
--nepochs_bert number of max epochs to train Bert model (i.e., -b), default 10
|
||||||
--muse_dir path to the MUSE polylingual word embeddings (default ../embeddings)
|
--muse_dir path to the MUSE polylingual word embeddings (default embeddings/)
|
||||||
--gru_wce deploy WCE embedding as embedding layer of the GRU View Generator
|
--gru_wce deploy WCE embedding as embedding layer of the GRU View Generator
|
||||||
|
--patience_rnn set early stop patience for the RecurrentGen, default 50
|
||||||
--gru_dir set the path to a pretrained GRU model (i.e., -g view generator)
|
--gru_dir set the path to a pretrained GRU model (i.e., -g view generator)
|
||||||
--bert_dir set the path to a pretrained mBERT model (i.e., -b view generator)
|
--bert_dir set the path to a pretrained mBERT model (i.e., -b view generator)
|
||||||
--gpus specifies how many GPUs to use per node
|
--gpus specifies how many GPUs to use per node
|
||||||
|
|
10
run.sh
10
run.sh
|
@ -1,6 +1,8 @@
|
||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
for i in {0..10..1}
|
python main.py /home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -g --gpus 0
|
||||||
do
|
|
||||||
python main.py --gpus 0
|
#for i in {0..10..1}
|
||||||
done
|
#do
|
||||||
|
# python main.py --gpus 0
|
||||||
|
#done
|
|
@ -26,10 +26,10 @@ from src.data.datamodule import RecurrentDataModule, BertDataModule, tokenize
|
||||||
from src.models.learners import *
|
from src.models.learners import *
|
||||||
from src.models.pl_bert import BertModel
|
from src.models.pl_bert import BertModel
|
||||||
from src.models.pl_gru import RecurrentModel
|
from src.models.pl_gru import RecurrentModel
|
||||||
from src.util.common import TfidfVectorizerMultilingual, _normalize
|
from src.util.common import TfidfVectorizerMultilingual, _normalize, index
|
||||||
from src.util.embeddings_manager import MuseLoader, XdotM, wce_matrix
|
from src.util.embeddings_manager import MuseLoader, XdotM, wce_matrix
|
||||||
from src.util.file import create_if_not_exist
|
from src.util.file import create_if_not_exist
|
||||||
# TODO: add model checkpointing and loading from checkpoint + training on validation after convergence is reached
|
# TODO: (1) add model checkpointing and loading from checkpoint + training on validation after convergence is reached
|
||||||
|
|
||||||
|
|
||||||
class ViewGen(ABC):
|
class ViewGen(ABC):
|
||||||
|
@ -203,7 +203,7 @@ class RecurrentGen(ViewGen):
|
||||||
the network internal state at the second feed-forward layer level. Training metrics are logged via TensorBoard.
|
the network internal state at the second feed-forward layer level. Training metrics are logged via TensorBoard.
|
||||||
"""
|
"""
|
||||||
def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, nepochs=50,
|
def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, nepochs=50,
|
||||||
gpus=0, n_jobs=-1, patience=5, stored_path=None):
|
gpus=0, n_jobs=-1, patience=20, stored_path=None):
|
||||||
"""
|
"""
|
||||||
Init RecurrentGen.
|
Init RecurrentGen.
|
||||||
:param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents
|
:param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents
|
||||||
|
@ -237,8 +237,7 @@ class RecurrentGen(ViewGen):
|
||||||
self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1)
|
self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1)
|
||||||
self.multilingualIndex.embedding_matrices(self.pretrained, supervised=self.wce)
|
self.multilingualIndex.embedding_matrices(self.pretrained, supervised=self.wce)
|
||||||
self.model = self._init_model()
|
self.model = self._init_model()
|
||||||
self.logger = TensorBoardLogger(save_dir='../tb_logs', name='rnn', default_hp_metric=False)
|
self.logger = TensorBoardLogger(save_dir='tb_logs', name='rnn', default_hp_metric=False)
|
||||||
# self.logger = CSVLogger(save_dir='csv_logs', name='rnn_dev')
|
|
||||||
self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00,
|
self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00,
|
||||||
patience=self.patience, verbose=False, mode='max')
|
patience=self.patience, verbose=False, mode='max')
|
||||||
|
|
||||||
|
@ -297,14 +296,19 @@ class RecurrentGen(ViewGen):
|
||||||
:param lX: dict {lang: indexed documents}
|
:param lX: dict {lang: indexed documents}
|
||||||
:return: documents projected to the common latent space.
|
:return: documents projected to the common latent space.
|
||||||
"""
|
"""
|
||||||
|
data = {}
|
||||||
|
for lang in lX.keys():
|
||||||
|
indexed = index(data=lX[lang],
|
||||||
|
vocab=self.multilingualIndex.l_index[lang].word2index,
|
||||||
|
known_words=set(self.multilingualIndex.l_index[lang].word2index.keys()),
|
||||||
|
analyzer=self.multilingualIndex.l_vectorizer.get_analyzer(lang),
|
||||||
|
unk_index=self.multilingualIndex.l_index[lang].unk_index,
|
||||||
|
out_of_vocabulary=self.multilingualIndex.l_index[lang].out_of_vocabulary)
|
||||||
|
data[lang] = indexed
|
||||||
l_pad = self.multilingualIndex.l_pad()
|
l_pad = self.multilingualIndex.l_pad()
|
||||||
data = self.multilingualIndex.l_devel_index()
|
|
||||||
self.model.to('cuda' if self.gpus else 'cpu')
|
self.model.to('cuda' if self.gpus else 'cpu')
|
||||||
self.model.eval()
|
self.model.eval()
|
||||||
# time_init = time.time()
|
|
||||||
l_embeds = self.model.encode(data, l_pad, batch_size=256)
|
l_embeds = self.model.encode(data, l_pad, batch_size=256)
|
||||||
# transform_time = round(time.time() - time_init, 3)
|
|
||||||
# print(f'Executed! Transform took: {transform_time}')
|
|
||||||
return l_embeds
|
return l_embeds
|
||||||
|
|
||||||
def fit_transform(self, lX, ly):
|
def fit_transform(self, lX, ly):
|
||||||
|
@ -338,7 +342,7 @@ class BertGen(ViewGen):
|
||||||
self.stored_path = stored_path
|
self.stored_path = stored_path
|
||||||
self.model = self._init_model()
|
self.model = self._init_model()
|
||||||
self.patience = patience
|
self.patience = patience
|
||||||
self.logger = TensorBoardLogger(save_dir='../tb_logs', name='bert', default_hp_metric=False)
|
self.logger = TensorBoardLogger(save_dir='tb_logs', name='bert', default_hp_metric=False)
|
||||||
self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00,
|
self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00,
|
||||||
patience=self.patience, verbose=False, mode='max')
|
patience=self.patience, verbose=False, mode='max')
|
||||||
|
|
||||||
|
@ -371,14 +375,10 @@ class BertGen(ViewGen):
|
||||||
:param lX: dict {lang: indexed documents}
|
:param lX: dict {lang: indexed documents}
|
||||||
:return: documents projected to the common latent space.
|
:return: documents projected to the common latent space.
|
||||||
"""
|
"""
|
||||||
data = self.multilingualIndex.l_devel_raw_index()
|
data = tokenize(lX, max_len=512)
|
||||||
data = tokenize(data, max_len=512)
|
|
||||||
self.model.to('cuda' if self.gpus else 'cpu')
|
self.model.to('cuda' if self.gpus else 'cpu')
|
||||||
self.model.eval()
|
self.model.eval()
|
||||||
# time_init = time.time()
|
|
||||||
l_embeds = self.model.encode(data, batch_size=64)
|
l_embeds = self.model.encode(data, batch_size=64)
|
||||||
# transform_time = round(time.time() - time_init, 3)
|
|
||||||
# print(f'Executed! Transform took: {transform_time}')
|
|
||||||
return l_embeds
|
return l_embeds
|
||||||
|
|
||||||
def fit_transform(self, lX, ly):
|
def fit_transform(self, lX, ly):
|
||||||
|
|
Loading…
Reference in New Issue