Compare commits


23 Commits
master ... rsc

Author SHA1 Message Date
andrea 0fdb39532c fixed funnelling transform function. Now it averages lZparts according to the actual number of embedders used for a given language (e.g., 'da' -> -x -m -w -b -> 'da' is averaged by 4; 'en' -> -m -b -> 'en' is averaged by 2) 2021-02-12 16:15:38 +01:00
andrea 4cbef64e28 running comparison 2021-02-11 18:31:59 +01:00
andrea d5417691d5 running comparison 2021-02-11 17:29:29 +01:00
andrea 7c8de936db running comparison 2021-02-11 12:11:04 +01:00
andrea 421d7660f6 running comparison 2021-02-11 10:51:59 +01:00
andrea 612e90a584 TODO: early stop is triggered by current_score == best_score ! 2021-02-09 17:59:07 +01:00
andrea 7b6938459f implemented BertDataModule collate function 2021-02-09 09:43:19 +01:00
andrea f579a1a7f2 implemented BertDataModule collate function 2021-02-08 16:37:02 +01:00
andrea b2be446446 fixed lr 2021-02-08 09:02:16 +01:00
andrea 80d0693cb1 removed unused lstm_class.py module 2021-02-05 11:55:19 +01:00
andrea ec6886dbab fixed bug in common 'l_test_raw_index_zero_shot' 2021-02-05 10:54:04 +01:00
andrea a6be7857a3 implemented zero-shot experiment code for VanillaFunGen and WordClassGen 2021-02-04 16:50:09 +01:00
andrea 495a0b6af9 implemented zero-shot experiment code for VanillaFunGen and WordClassGen 2021-02-04 13:00:18 +01:00
andrea f3fafd0f00 implemented zero-shot experiment code for VanillaFunGen and WordClassGen 2021-02-04 12:44:36 +01:00
andrea 8968570d82 implemented zero-shot experiment code for VanillaFunGen and WordClassGen 2021-02-04 12:24:57 +01:00
andrea 7affa1fab4 implemented zero-shot experiment code for VanillaFunGen and WordClassGen 2021-02-04 11:43:47 +01:00
andrea c65c91fc27 setting up zero-shot experiments (implemented for Recurrent and Bert but not tested for Bert) 2021-02-03 12:30:08 +01:00
andrea ab3bacb29c setting up zero-shot experiments (implemented for Recurrent and Bert but not tested for Bert) 2021-02-02 17:13:31 +01:00
andrea 6361a4eba0 setting up zero-shot experiments (implemented for Recurrent and Bert but not tested) 2021-02-02 16:12:08 +01:00
andrea ee98c5f610 setting up zero-shot experiments (implemented for Recurrent and Bert but not tested) 2021-02-02 16:09:49 +01:00
andrea 5821325c86 setting up zero-shot experiments (done and tested for WordClassGen) 2021-02-02 15:15:23 +01:00
andrea 7f493da0f8 setting up zero-shot experiments (done and tested for MuseGen) 2021-02-02 12:57:27 +01:00
andrea 10bed81916 Set arguments in order to reproduce 'master' performances with Neural setting 2021-02-02 11:23:55 +01:00
13 changed files with 417 additions and 152 deletions

78
main.py
View File

@ -7,19 +7,31 @@ from src.util.evaluation import evaluate
from src.util.results_csv import CSVlog
from src.view_generators import *
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
def main(args):
assert args.post_embedder or args.muse_embedder or args.wce_embedder or args.gru_embedder or args.bert_embedder, \
'empty set of document embeddings is not allowed!'
assert not (args.zero_shot and (args.zscl_langs is None)), \
'--zscl_langs cannot be empty when setting --zero_shot to True'
print('Running generalized funnelling...')
data = MultilingualDataset.load(args.dataset)
# data.set_view(languages=['it', 'da'])
# data.set_view(languages=['da', 'nl', 'it'])
data.show_dimensions()
lX, ly = data.training()
lXte, lyte = data.test()
# TODO: debug settings
# print(f'\n[Running on DEBUG mode - samples per language are reduced to 5 max!]\n')
# lX = {k: v[:5] for k, v in lX.items()}
# ly = {k: v[:5] for k, v in ly.items()}
# lXte = {k: v[:5] for k, v in lXte.items()}
# lyte = {k: v[:5] for k, v in lyte.items()}
# Init multilingualIndex - mandatory when deploying Neural View Generators...
if args.gru_embedder or args.bert_embedder:
multilingualIndex = MultilingualIndex()
@ -29,36 +41,65 @@ def main(args):
# Init ViewGenerators and append them to embedder_list
embedder_list = []
if args.post_embedder:
posteriorEmbedder = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=args.n_jobs)
posteriorEmbedder = VanillaFunGen(base_learner=get_learner(calibrate=True),
zero_shot=args.zero_shot,
train_langs=args.zscl_langs,
n_jobs=args.n_jobs)
embedder_list.append(posteriorEmbedder)
if args.muse_embedder:
museEmbedder = MuseGen(muse_dir=args.muse_dir, n_jobs=args.n_jobs)
museEmbedder = MuseGen(muse_dir=args.muse_dir,
zero_shot=args.zero_shot,
train_langs=args.zscl_langs,
n_jobs=args.n_jobs)
embedder_list.append(museEmbedder)
if args.wce_embedder:
wceEmbedder = WordClassGen(n_jobs=args.n_jobs)
wceEmbedder = WordClassGen(zero_shot=args.zero_shot,
train_langs=args.zscl_langs,
n_jobs=args.n_jobs)
embedder_list.append(wceEmbedder)
if args.gru_embedder:
rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=args.rnn_wce,
batch_size=args.batch_rnn, nepochs=args.nepochs_rnn, patience=args.patience_rnn,
gpus=args.gpus, n_jobs=args.n_jobs)
rnnEmbedder = RecurrentGen(multilingualIndex,
pretrained_embeddings=lMuse,
wce=args.rnn_wce,
batch_size=args.batch_rnn,
nepochs=args.nepochs_rnn,
patience=args.patience_rnn,
zero_shot=args.zero_shot,
train_langs=args.zscl_langs,
gpus=args.gpus,
n_jobs=args.n_jobs)
embedder_list.append(rnnEmbedder)
if args.bert_embedder:
bertEmbedder = BertGen(multilingualIndex, batch_size=args.batch_bert, nepochs=args.nepochs_bert,
patience=args.patience_bert, gpus=args.gpus, n_jobs=args.n_jobs)
bertEmbedder = BertGen(multilingualIndex,
batch_size=args.batch_bert,
nepochs=args.nepochs_bert,
patience=args.patience_bert,
zero_shot=args.zero_shot,
train_langs=args.zscl_langs,
gpus=args.gpus,
n_jobs=args.n_jobs)
embedder_list.append(bertEmbedder)
# Init DocEmbedderList (i.e., first-tier learners or view generators) and metaclassifier
docEmbedders = DocEmbedderList(embedder_list=embedder_list, probabilistic=True)
meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'),
meta_parameters=get_params(optimc=args.optimc),
n_jobs=args.n_jobs)
# Init Funnelling Architecture
gfun = Funnelling(first_tier=docEmbedders, meta_classifier=meta, n_jobs=args.n_jobs)
gfun = Funnelling(first_tier=docEmbedders,
meta_classifier=meta,
n_jobs=args.n_jobs)
# Training ---------------------------------------
print('\n[Training Generalized Funnelling]')
@ -70,6 +111,8 @@ def main(args):
# Testing ----------------------------------------
print('\n[Testing Generalized Funnelling]')
time_te = time.time()
if args.zero_shot:
gfun.set_zero_shot(val=False)
ly_ = gfun.predict(lXte)
l_eval = evaluate(ly_true=lyte, ly_pred=ly_, n_jobs=args.n_jobs)
time_te = round(time.time() - time_te, 3)
@ -77,7 +120,7 @@ def main(args):
# Logging ---------------------------------------
print('\n[Results]')
results = CSVlog(args.csv_dir)
results = CSVlog(f'csv_logs/gfun/{args.csv_dir}')
metrics = []
for lang in lXte.keys():
macrof1, microf1, macrok, microk = l_eval[lang]
@ -99,7 +142,7 @@ def main(args):
microf1=microf1,
macrok=macrok,
microk=microk,
notes='')
notes=f'Train langs: {sorted(args.zscl_langs)}' if args.zero_shot else '')
print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3))
overall_time = round(time.time() - time_init, 3)
@ -112,8 +155,8 @@ if __name__ == '__main__':
parser.add_argument('dataset', help='Path to the dataset')
parser.add_argument('-o', '--output', dest='csv_dir', metavar='',
help='Result file (default csv_logs/gfun/gfun_results.csv)', type=str,
default='csv_logs/gfun/gfun_results.csv')
help='Result file name, saved under csv_logs/gfun/ (default: gfun_results.csv)', type=str,
default='gfun_results.csv')
parser.add_argument('-x', '--post_embedder', dest='post_embedder', action='store_true',
help='deploy posterior probabilities embedder to compute document embeddings',
@ -186,5 +229,12 @@ if __name__ == '__main__':
parser.add_argument('--gpus', metavar='', help='specifies how many GPUs to use per node',
default=None)
parser.add_argument('--zero_shot', dest='zero_shot', action='store_true',
help='run zero-shot experiments',
default=False)
parser.add_argument('--zscl_langs', dest='zscl_langs', metavar='', nargs='*',
help='languages to be used for training in zero-shot experiments')
args = parser.parse_args()
main(args)

25
run.sh
View File

@ -1,8 +1,23 @@
#!/usr/bin/env bash
python main.py /home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -g --gpus 0
echo "Running Zero-shot experiments [output at csv_logs/gfun/zero_shot_gfun.csv]"
#for i in {0..10..1}
#do
# python main.py --gpus 0
#done
#python main.py /home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -x -m -w -b -c --nepochs_bert 25 --n_jobs 6 --gpus 0 -o zero_shot_gfun.csv --zero_shot --zscl_langs da
#python main.py /home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -x -m -w -b -c --nepochs_bert 25 --n_jobs 6 --gpus 0 -o zero_shot_gfun.csv --zero_shot --zscl_langs da de
#python main.py /home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -x -m -w -b -c --nepochs_bert 25 --n_jobs 6 --gpus 0 -o zero_shot_gfun.csv --zero_shot --zscl_langs da de en
#python main.py /home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -x -m -w -b -c --nepochs_bert 25 --n_jobs 6 --gpus 0 -o zero_shot_gfun.csv --zero_shot --zscl_langs da de en es
#python main.py /home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -x -m -w -b -c --nepochs_bert 25 --n_jobs 6 --gpus 0 -o zero_shot_gfun.csv --zero_shot --zscl_langs da de en es fr
#python main.py /home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -x -m -w -b -c --nepochs_bert 25 --n_jobs 6 --gpus 0 -o zero_shot_gfun.csv --zero_shot --zscl_langs da de en es fr it
#python main.py /home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -x -m -w -b -c --nepochs_bert 25 --n_jobs 6 --gpus 0 -o zero_shot_gfun.csv --zero_shot --zscl_langs da de en es fr it nl
#python main.py ../datasets/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -x -m -w -b -c --nepochs_bert 25 --n_jobs 6 --gpus 0 -o zero_shot_gfun.csv --zero_shot --zscl_langs da de en es fr it nl pt
#python main.py ../datasets/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -x -m -w -b -c --nepochs_bert 25 --n_jobs 6 --gpus 0 -o zero_shot_gfun.csv --zero_shot --zscl_langs da de en es fr it nl pt sv
#python main.py ../datasets/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -x -m -w -b -c --nepochs_bert 25 --n_jobs 6 --gpus 0 --muse_dir ../embeddings/MUSE/ -o zero_shot_gfun.csv --zero_shot --zscl_langs da
#python main.py ../datasets/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -x -m -w -b -c --nepochs_bert 25 --n_jobs 6 --gpus 0 --muse_dir ../embeddings/MUSE/ -o zero_shot_gfun.csv --zero_shot --zscl_langs da de
#python main.py ../datasets/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -x -m -w -b -c --nepochs_bert 25 --n_jobs 6 --gpus 0 --muse_dir ../embeddings/MUSE/ -o zero_shot_gfun.csv --zero_shot --zscl_langs da de en
#python main.py ../datasets/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -x -m -w -b -c --nepochs_bert 25 --n_jobs 6 --gpus 0 --muse_dir ../embeddings/MUSE/ -o zero_shot_gfun.csv --zero_shot --zscl_langs da de en es
#python main.py ../datasets/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -x -m -w -b -c --nepochs_bert 25 --n_jobs 6 --gpus 0 --muse_dir ../embeddings/MUSE/ -o zero_shot_gfun.csv --zero_shot --zscl_langs da de en es fr
#python main.py ../datasets/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -x -m -w -b -c --nepochs_bert 25 --n_jobs 6 --gpus 0 --muse_dir ../embeddings/MUSE/ -o zero_shot_gfun.csv --zero_shot --zscl_langs da de en es fr it
#python main.py ../datasets/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -x -m -w -b -c --nepochs_bert 25 --n_jobs 6 --gpus 0 --muse_dir ../embeddings/MUSE/ -o zero_shot_gfun.csv --zero_shot --zscl_langs da de en es fr it nl
python main.py ../datasets/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -x -m -w -b -c --nepochs_bert 25 --n_jobs 6 --gpus 0 --muse_dir ../embeddings/MUSE/ -o zero_shot_gfun.csv --zero_shot --zscl_langs da de en es fr it nl pt
python main.py ../datasets/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -x -m -w -b -c --nepochs_bert 25 --n_jobs 6 --gpus 0 --muse_dir ../embeddings/MUSE/ -o zero_shot_gfun.csv --zero_shot --zscl_langs da de en es fr it nl pt sv

View File

@ -92,7 +92,7 @@ class RecurrentDataModule(pl.LightningDataModule):
Pytorch Lightning Datamodule to be deployed with RecurrentGen.
https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html
"""
def __init__(self, multilingualIndex, batchsize=64, n_jobs=-1):
def __init__(self, multilingualIndex, batchsize=64, n_jobs=-1, zero_shot=False, zscl_langs=None):
"""
Init RecurrentDataModule.
:param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents
@ -103,6 +103,11 @@ class RecurrentDataModule(pl.LightningDataModule):
self.multilingualIndex = multilingualIndex
self.batchsize = batchsize
self.n_jobs = n_jobs
# Zero shot arguments
if zscl_langs is None:
zscl_langs = []
self.zero_shot = zero_shot
self.train_langs = zscl_langs
super().__init__()
def prepare_data(self, *args, **kwargs):
@ -110,7 +115,10 @@ class RecurrentDataModule(pl.LightningDataModule):
def setup(self, stage=None):
if stage == 'fit' or stage is None:
l_train_index, l_train_target = self.multilingualIndex.l_train()
if self.zero_shot:
l_train_index, l_train_target = self.multilingualIndex.l_train_zero_shot(langs=self.train_langs)
else:
l_train_index, l_train_target = self.multilingualIndex.l_train()
# Debug settings: reducing number of samples
# l_train_index = {l: train[:5] for l, train in l_train_index.items()}
# l_train_target = {l: target[:5] for l, target in l_train_target.items()}
@ -118,7 +126,10 @@ class RecurrentDataModule(pl.LightningDataModule):
self.training_dataset = RecurrentDataset(l_train_index, l_train_target,
lPad_index=self.multilingualIndex.l_pad())
l_val_index, l_val_target = self.multilingualIndex.l_val()
if self.zero_shot:
l_val_index, l_val_target = self.multilingualIndex.l_val_zero_shot(langs=self.train_langs)
else:
l_val_index, l_val_target = self.multilingualIndex.l_val()
# Debug settings: reducing number of samples
# l_val_index = {l: train[:5] for l, train in l_val_index.items()}
# l_val_target = {l: target[:5] for l, target in l_val_target.items()}
@ -126,7 +137,10 @@ class RecurrentDataModule(pl.LightningDataModule):
self.val_dataset = RecurrentDataset(l_val_index, l_val_target,
lPad_index=self.multilingualIndex.l_pad())
if stage == 'test' or stage is None:
l_test_index, l_test_target = self.multilingualIndex.l_test()
if self.zero_shot:
l_test_index, l_test_target = self.multilingualIndex.l_test_zero_shot(langs=self.train_langs)
else:
l_test_index, l_test_target = self.multilingualIndex.l_test()
# Debug settings: reducing number of samples
# l_test_index = {l: train[:5] for l, train in l_test_index.items()}
# l_test_target = {l: target[:5] for l, target in l_test_target.items()}
@ -136,7 +150,7 @@ class RecurrentDataModule(pl.LightningDataModule):
def train_dataloader(self):
return DataLoader(self.training_dataset, batch_size=self.batchsize, num_workers=N_WORKERS,
collate_fn=self.training_dataset.collate_fn)
collate_fn=self.training_dataset.collate_fn, shuffle=True)
def val_dataloader(self):
return DataLoader(self.val_dataset, batch_size=self.batchsize, num_workers=N_WORKERS,
@ -167,7 +181,8 @@ class BertDataModule(RecurrentDataModule):
Pytorch Lightning Datamodule to be deployed with BertGen.
https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html
"""
def __init__(self, multilingualIndex, batchsize=64, max_len=512):
def __init__(self, multilingualIndex, batchsize=64, max_len=512, zero_shot=False, zscl_langs=None, debug=False,
max_samples=50):
"""
Init BertDataModule.
:param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents
@ -177,32 +192,53 @@ class BertDataModule(RecurrentDataModule):
"""
super().__init__(multilingualIndex, batchsize)
self.max_len = max_len
# Zero shot arguments
if zscl_langs is None:
zscl_langs = []
self.zero_shot = zero_shot
self.train_langs = zscl_langs
self.debug = debug
self.max_samples = max_samples
if self.debug:
print(f'\n[Running in DEBUG mode - samples per language are reduced to {self.max_samples} max!]\n')
def setup(self, stage=None):
if stage == 'fit' or stage is None:
l_train_raw, l_train_target = self.multilingualIndex.l_train_raw()
# Debug settings: reducing number of samples
# l_train_raw = {l: train[:5] for l, train in l_train_raw.items()}
# l_train_target = {l: target[:5] for l, target in l_train_target.items()}
if self.zero_shot:
l_train_raw, l_train_target = self.multilingualIndex.l_train_raw_zero_shot(langs=self.train_langs)
else:
l_train_raw, l_train_target = self.multilingualIndex.l_train_raw()
if self.debug:
# Debug settings: reducing number of samples
l_train_raw = {l: train[:self.max_samples] for l, train in l_train_raw.items()}
l_train_target = {l: target[:self.max_samples] for l, target in l_train_target.items()}
l_train_index = tokenize(l_train_raw, max_len=self.max_len)
self.training_dataset = RecurrentDataset(l_train_index, l_train_target,
lPad_index=self.multilingualIndex.l_pad())
l_val_raw, l_val_target = self.multilingualIndex.l_val_raw()
# Debug settings: reducing number of samples
# l_val_raw = {l: train[:5] for l, train in l_val_raw.items()}
# l_val_target = {l: target[:5] for l, target in l_val_target.items()}
if self.zero_shot:
l_val_raw, l_val_target = self.multilingualIndex.l_val_raw_zero_shot(langs=self.train_langs)
else:
l_val_raw, l_val_target = self.multilingualIndex.l_val_raw()
if self.debug:
# Debug settings: reducing number of samples
l_val_raw = {l: train[:self.max_samples] for l, train in l_val_raw.items()}
l_val_target = {l: target[:self.max_samples] for l, target in l_val_target.items()}
l_val_index = tokenize(l_val_raw, max_len=self.max_len)
self.val_dataset = RecurrentDataset(l_val_index, l_val_target,
lPad_index=self.multilingualIndex.l_pad())
if stage == 'test' or stage is None:
l_test_raw, l_test_target = self.multilingualIndex.l_test_raw()
# Debug settings: reducing number of samples
# l_test_raw = {l: train[:5] for l, train in l_test_raw.items()}
# l_test_target = {l: target[:5] for l, target in l_test_target.items()}
if self.zero_shot:
l_test_raw, l_test_target = self.multilingualIndex.l_test_raw_zero_shot(langs=self.train_langs)
else:
l_test_raw, l_test_target = self.multilingualIndex.l_test_raw()
if self.debug:
# Debug settings: reducing number of samples
l_test_raw = {l: train[:self.max_samples] for l, train in l_test_raw.items()}
l_test_target = {l: target[:self.max_samples] for l, target in l_test_target.items()}
l_test_index = tokenize(l_test_raw, max_len=self.max_len)
self.test_dataset = RecurrentDataset(l_test_index, l_test_target,
@ -213,10 +249,17 @@ class BertDataModule(RecurrentDataModule):
NB: Setting n_workers to > 0 will cause "OSError: [Errno 24] Too many open files"
:return:
"""
return DataLoader(self.training_dataset, batch_size=self.batchsize)
return DataLoader(self.training_dataset, batch_size=self.batchsize, collate_fn=self.collate_fn_bert,
shuffle=True)
def val_dataloader(self):
return DataLoader(self.val_dataset, batch_size=self.batchsize)
return DataLoader(self.val_dataset, batch_size=self.batchsize, collate_fn=self.collate_fn_bert)
def test_dataloader(self):
return DataLoader(self.test_dataset, batch_size=self.batchsize)
return DataLoader(self.test_dataset, batch_size=self.batchsize, collate_fn=self.collate_fn_bert)
def collate_fn_bert(self, data):
x_batch = np.vstack([elem[0] for elem in data])
y_batch = np.vstack([elem[1] for elem in data])
lang_batch = [elem[2] for elem in data]
return torch.LongTensor(x_batch), torch.FloatTensor(y_batch), lang_batch
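For reference, a minimal sketch of the contract that collate_fn_bert expects from the dataset items: each element is an (input_ids, targets, language) triple, the first two are stacked row-wise, and the language tags are passed through as a plain list. The shapes and values below are illustrative only, not taken from RecurrentDataset.

```python
import numpy as np
import torch

# three toy dataset items: (token ids, multi-hot targets, language tag)
batch = [
    (np.array([101, 7, 9, 102]), np.array([1., 0., 1.]), 'en'),
    (np.array([101, 4, 4, 102]), np.array([0., 0., 1.]), 'da'),
    (np.array([101, 2, 8, 102]), np.array([1., 1., 0.]), 'it'),
]

x_batch = np.vstack([elem[0] for elem in batch])   # (3, 4) matrix of token ids
y_batch = np.vstack([elem[1] for elem in batch])   # (3, 3) matrix of targets
langs = [elem[2] for elem in batch]                # ['en', 'da', 'it']
print(torch.LongTensor(x_batch).shape, torch.FloatTensor(y_batch).shape, langs)
# torch.Size([3, 4]) torch.Size([3, 3]) ['en', 'da', 'it']
```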

View File

@ -23,7 +23,7 @@ class DocEmbedderList:
if isinstance(embedder, VanillaFunGen):
_tmp.append(embedder)
else:
_tmp.append(FeatureSet2Posteriors(embedder))
_tmp.append(FeatureSet2Posteriors(embedder, n_jobs=embedder.n_jobs))
self.embedders = _tmp
def fit(self, lX, ly):
@ -43,23 +43,38 @@ class DocEmbedderList:
:param lX:
:return: common latent space (averaged).
"""
langs = sorted(lX.keys())
lZparts = {lang: None for lang in langs}
self.langs = sorted(lX.keys())
lZparts = {lang: None for lang in self.langs}
for embedder in self.embedders:
lZ = embedder.transform(lX)
for lang in langs:
for lang in sorted(lZ.keys()):
Z = lZ[lang]
if lZparts[lang] is None:
lZparts[lang] = Z
else:
lZparts[lang] += Z
n_embedders = len(self.embedders)
return {lang: lZparts[lang]/n_embedders for lang in langs} # Averaging feature spaces
# Zero-shot experiments: drop k: v pairs where v is None (i.e., languages held out for the zero-shot setting)
lZparts = {k: v for k, v in lZparts.items() if v is not None}
lang_number_embedders = self.get_number_embedders_zeroshot()
return {lang: lZparts[lang]/lang_number_embedders[lang] for lang in sorted(lZparts.keys())} # Averaging feature spaces
def fit_transform(self, lX, ly):
return self.fit(lX, ly).transform(lX)
def get_number_embedders_zeroshot(self):
lang_number_embedders = {lang: len(self.embedders) for lang in self.langs}
for lang in self.langs:
for embedder in self.embedders:
if isinstance(embedder, VanillaFunGen):
if lang not in embedder.train_langs:
lang_number_embedders[lang] = 2  # TODO: the number of view generators is hard-coded
else:
if lang not in embedder.embedder.train_langs:
lang_number_embedders[lang] = 2  # TODO: the number of view generators is hard-coded
return lang_number_embedders
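The TODO above flags that the divisor for languages outside an embedder's train_langs is hard-coded to 2. A possible generalization, sketched below rather than taken from the repository, is to count contributions as they arrive in transform and divide each language by the number of views it actually received; only the .transform interface of the view generators is assumed.

```python
def average_views(embedders, lX):
    """Average per-language views over the number of embedders that actually
    produced one, instead of a hard-coded divisor (sketch, not repository code)."""
    lZparts, l_counts = {}, {}
    for embedder in embedders:
        lZ = embedder.transform(lX)        # {lang: matrix}; some languages may be missing or None
        for lang, Z in lZ.items():
            if Z is None:
                continue
            lZparts[lang] = Z if lang not in lZparts else lZparts[lang] + Z
            l_counts[lang] = l_counts.get(lang, 0) + 1
    return {lang: lZparts[lang] / l_counts[lang] for lang in sorted(lZparts)}
```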
class FeatureSet2Posteriors:
"""
@ -77,7 +92,7 @@ class FeatureSet2Posteriors:
self.l2 = l2
self.n_jobs = n_jobs
self.prob_classifier = MetaClassifier(
SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=self.n_jobs)
def fit(self, lX, ly):
lZ = self.embedder.fit_transform(lX, ly)
@ -113,12 +128,21 @@ class Funnelling:
self.n_jobs = n_jobs
def fit(self, lX, ly):
print('## Fitting first-tier learners!')
print('\n## Fitting first-tier learners!')
lZ = self.first_tier.fit_transform(lX, ly)
print('## Fitting meta-learner!')
print('\n## Fitting meta-learner!')
self.meta.fit(lZ, ly)
def predict(self, lX):
lZ = self.first_tier.transform(lX)
ly = self.meta.predict(lZ)
return ly
def set_zero_shot(self, val: bool):
for embedder in self.first_tier.embedders:
if isinstance(embedder, VanillaFunGen):
embedder.set_zero_shot(val)
else:
embedder.embedder.set_zero_shot(val)
return

View File

@ -1,4 +1,5 @@
import time
import src.util.disable_sklearn_warnings
import numpy as np
from joblib import Parallel, delayed
@ -74,7 +75,7 @@ class NaivePolylingualClassifier:
_sort_if_sparse(lX[lang])
models = Parallel(n_jobs=self.n_jobs)\
(delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]), ly[lang]) for
(delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters, n_jobs=self.n_jobs).fit)((lX[lang]), ly[lang]) for
lang in langs)
self.model = {lang: models[i] for i, lang in enumerate(langs)}

View File

@ -23,7 +23,7 @@ class BertModel(pl.LightningModule):
self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus)
self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)
# Language specific metrics to compute metrics at epoch level
# Language specific metrics to compute at epoch level
self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
self.lang_macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)
@ -44,9 +44,7 @@ class BertModel(pl.LightningModule):
return logits
def training_step(self, train_batch, batch_idx):
X, y, _, batch_langs = train_batch
X = torch.cat(X).view([X[0].shape[0], len(X)])
y = y.type(torch.FloatTensor)
X, y, batch_langs = train_batch
y = y.to('cuda' if self.gpus else 'cpu')
logits, _ = self.forward(X)
loss = self.loss(logits, y)
@ -56,52 +54,15 @@ class BertModel(pl.LightningModule):
macroF1 = self.macroF1(predictions, y)
microK = self.microK(predictions, y)
macroK = self.macroK(predictions, y)
self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True)
self.log('train-macroF1', macroF1, on_step=True, on_epoch=True, prog_bar=False, logger=True)
self.log('train-microF1', microF1, on_step=True, on_epoch=True, prog_bar=False, logger=True)
self.log('train-macroK', macroK, on_step=True, on_epoch=True, prog_bar=False, logger=True)
self.log('train-microK', microK, on_step=True, on_epoch=True, prog_bar=False, logger=True)
lX, ly = self._reconstruct_dict(predictions, y, batch_langs)
return {'loss': loss, 'pred': lX, 'target': ly}
def training_epoch_end(self, outputs):
langs = []
for output in outputs:
langs.extend(list(output['pred'].keys()))
langs = set(langs)
# outputs is a list of n dicts of m elements, where n is the number of epoch steps and m is the batch size.
# here we save epoch level metric values and compute them specifically for each language
res_macroF1 = {lang: [] for lang in langs}
res_microF1 = {lang: [] for lang in langs}
res_macroK = {lang: [] for lang in langs}
res_microK = {lang: [] for lang in langs}
for output in outputs:
lX, ly = output['pred'], output['target']
for lang in lX.keys():
X, y = lX[lang], ly[lang]
lang_macroF1 = self.lang_macroF1(X, y)
lang_microF1 = self.lang_microF1(X, y)
lang_macroK = self.lang_macroK(X, y)
lang_microK = self.lang_microK(X, y)
res_macroF1[lang].append(lang_macroF1)
res_microF1[lang].append(lang_microF1)
res_macroK[lang].append(lang_macroK)
res_microK[lang].append(lang_microK)
for lang in langs:
avg_macroF1 = torch.mean(torch.Tensor(res_macroF1[lang]))
avg_microF1 = torch.mean(torch.Tensor(res_microF1[lang]))
avg_macroK = torch.mean(torch.Tensor(res_macroK[lang]))
avg_microK = torch.mean(torch.Tensor(res_microK[lang]))
self.logger.experiment.add_scalars('train-langs-macroF1', {f'{lang}': avg_macroF1}, self.current_epoch)
self.logger.experiment.add_scalars('train-langs-microF1', {f'{lang}': avg_microF1}, self.current_epoch)
self.logger.experiment.add_scalars('train-langs-macroK', {f'{lang}': avg_macroK}, self.current_epoch)
self.logger.experiment.add_scalars('train-langs-microK', {f'{lang}': avg_microK}, self.current_epoch)
self.log('train-loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log('train-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True)
self.log('train-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True)
self.log('train-macroK', macroK, on_step=False, on_epoch=True, prog_bar=False, logger=True)
self.log('train-microK', microK, on_step=False, on_epoch=True, prog_bar=False, logger=True)
return {'loss': loss}
def validation_step(self, val_batch, batch_idx):
X, y, _, batch_langs = val_batch
X = torch.cat(X).view([X[0].shape[0], len(X)])
y = y.type(torch.FloatTensor)
X, y, batch_langs = val_batch
y = y.to('cuda' if self.gpus else 'cpu')
logits, _ = self.forward(X)
loss = self.loss(logits, y)
@ -110,7 +71,7 @@ class BertModel(pl.LightningModule):
macroF1 = self.macroF1(predictions, y)
microK = self.microK(predictions, y)
macroK = self.macroK(predictions, y)
self.log('val-loss', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
self.log('val-loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log('val-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log('val-microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log('val-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
@ -118,12 +79,10 @@ class BertModel(pl.LightningModule):
return {'loss': loss}
def test_step(self, test_batch, batch_idx):
X, y, _, batch_langs = test_batch
X = torch.cat(X).view([X[0].shape[0], len(X)])
y = y.type(torch.FloatTensor)
X, y, batch_langs = test_batch
y = y.to('cuda' if self.gpus else 'cpu')
logits, _ = self.forward(X)
loss = self.loss(logits, y)
# loss = self.loss(logits, y)
# Squashing logits through Sigmoid in order to get confidence score
predictions = torch.sigmoid(logits) > 0.5
microF1 = self.microF1(predictions, y)
@ -132,11 +91,11 @@ class BertModel(pl.LightningModule):
macroK = self.macroK(predictions, y)
self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True)
self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True)
self.log('test-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log('test-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log('test-macroK', macroK, on_step=False, on_epoch=True, prog_bar=False, logger=True)
self.log('test-microK', microK, on_step=False, on_epoch=True, prog_bar=False, logger=True)
return
def configure_optimizers(self, lr=3e-5, weight_decay=0.01):
def configure_optimizers(self, lr=1e-5, weight_decay=0.01):
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in self.bert.named_parameters()
@ -147,7 +106,8 @@ class BertModel(pl.LightningModule):
'weight_decay': weight_decay}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
scheduler = StepLR(optimizer, step_size=25, gamma=0.1)
scheduler = {'scheduler': StepLR(optimizer, step_size=25, gamma=0.1),
'interval': 'epoch'}
return [optimizer], [scheduler]
def encode(self, lX, batch_size=64):

View File

@ -42,7 +42,7 @@ class RecurrentModel(pl.LightningModule):
self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus)
self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)
# Language specific metrics to compute metrics at epoch level
# Language specific metrics to compute at epoch level
self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
self.lang_macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)

View File

@ -149,33 +149,60 @@ class MultilingualIndex:
def l_train_index(self):
return {l: index.train_index for l, index in self.l_index.items()}
def l_train_index_zero_shot(self, langs):
return {l: index.train_index for l, index in self.l_index.items() if l in langs}
def l_train_raw_index(self):
return {l: index.train_raw for l, index in self.l_index.items()}
def l_train_raw_index_zero_shot(self, langs):
return {l: index.train_raw for l, index in self.l_index.items() if l in langs}
def l_train_target(self):
return {l: index.train_target for l, index in self.l_index.items()}
def l_train_target_zero_shot(self, langs):
return {l: index.train_target for l, index in self.l_index.items() if l in langs}
def l_val_index(self):
return {l: index.val_index for l, index in self.l_index.items()}
def l_val_index_zero_shot(self, langs):
return {l: index.val_index for l, index in self.l_index.items() if l in langs}
def l_val_raw_index(self):
return {l: index.val_raw for l, index in self.l_index.items()}
def l_val_raw_index_zero_shot(self, langs):
return {l: index.val_raw for l, index in self.l_index.items() if l in langs}
def l_test_raw_index(self):
return {l: index.test_raw for l, index in self.l_index.items()}
def l_test_raw_index_zero_shot(self, langs):
return {l: index.test_raw for l, index in self.l_index.items() if l in langs}
def l_devel_raw_index(self):
return {l: index.devel_raw for l, index in self.l_index.items()}
def l_val_target(self):
return {l: index.val_target for l, index in self.l_index.items()}
def l_val_target_zero_shot(self, langs):
return {l: index.val_target for l, index in self.l_index.items() if l in langs}
def l_test_target(self):
return {l: index.test_target for l, index in self.l_index.items()}
def l_test_index(self):
return {l: index.test_index for l, index in self.l_index.items()}
def l_test_target_zero_shot(self, langs):
return {l: index.test_target for l, index in self.l_index.items() if l in langs}
def l_test_index_zero_shot(self, langs):
return {l: index.test_index for l, index in self.l_index.items() if l in langs}
def l_devel_index(self):
return {l: index.devel_index for l, index in self.l_index.items()}
@ -191,15 +218,33 @@ class MultilingualIndex:
def l_test(self):
return self.l_test_index(), self.l_test_target()
def l_test_zero_shot(self, langs):
return self.l_test_index_zero_shot(langs), self.l_test_target_zero_shot(langs)
def l_train_zero_shot(self, langs):
return self.l_train_index_zero_shot(langs), self.l_train_target_zero_shot(langs)
def l_val_zero_shot(self, langs):
return self.l_val_index_zero_shot(langs), self.l_val_target_zero_shot(langs)
def l_train_raw(self):
return self.l_train_raw_index(), self.l_train_target()
def l_train_raw_zero_shot(self, langs):
return self.l_train_raw_index_zero_shot(langs), self.l_train_target_zero_shot(langs)
def l_val_raw(self):
return self.l_val_raw_index(), self.l_val_target()
def l_val_raw_zero_shot(self, langs):
return self.l_val_raw_index_zero_shot(langs), self.l_val_target_zero_shot(langs)
def l_test_raw(self):
return self.l_test_raw_index(), self.l_test_target()
def l_test_raw_zero_shot(self, langs):
return self.l_test_raw_index_zero_shot(langs), self.l_test_target_zero_shot(langs)
def l_devel_raw(self):
return self.l_devel_raw_index(), self.l_devel_target()
@ -317,7 +362,6 @@ def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
unk_count = 0
knw_count = 0
out_count = 0
# pbar = tqdm(data, desc=f'indexing')
for text in data:
words = analyzer(text)
index = []
@ -336,8 +380,6 @@ def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
index.append(idx)
indexes.append(index)
knw_count += len(index)
# pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]'
# f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]')
return indexes

View File

@ -0,0 +1,8 @@
import warnings
def warn(*args, **kwargs):
pass
warnings.warn = warn
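The new disable_sklearn_warnings module works purely by side effect: importing it replaces warnings.warn with a no-op, which is why the other modules in this diff import it without referencing any name from it. A minimal usage sketch:

```python
import src.util.disable_sklearn_warnings  # noqa: F401 -- side-effect import
import warnings

warnings.warn('this warning is silently swallowed')  # prints nothing
```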

View File

@ -118,6 +118,7 @@ def hard_single_metric_statistics(true_labels, predicted_labels):
def macro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics):
true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels)
_tmp = [metric(metric_statistics(true_labels[:, c], predicted_labels[:, c])) for c in range(nC)]
return np.mean(_tmp)

View File

@ -68,7 +68,7 @@ class CustomF1(Metric):
if den > 0:
class_specific.append(num / den)
else:
class_specific.append(1.)
class_specific.append(torch.FloatTensor([1.]))
average = torch.sum(torch.Tensor(class_specific))/self.num_classes
return average.to(self.device)

View File

@ -1,4 +1,5 @@
import numpy as np
import src.util.disable_sklearn_warnings
class StandardizeTransformer:

View File

@ -15,12 +15,12 @@ This module contains the view generators that take care of computing the view sp
- View generator (-b): generates document embeddings via the mBERT model.
"""
import torch
from abc import ABC, abstractmethod
# from time import time
import torch
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.loggers import TensorBoardLogger, CSVLogger
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks.lr_monitor import LearningRateMonitor
@ -57,7 +57,7 @@ class VanillaFunGen(ViewGen):
View Generator (x): original funnelling architecture proposed by Moreo, Esuli and
Sebastiani in DOI: https://doi.org/10.1145/3326065
"""
def __init__(self, base_learner, first_tier_parameters=None, n_jobs=-1):
def __init__(self, base_learner, first_tier_parameters=None, zero_shot=False, train_langs: list = None, n_jobs=-1):
"""
Init Posterior Probabilities embedder (i.e., VanillaFunGen)
:param base_learner: naive monolingual learners to be deployed as first-tier learners. Should be able to
@ -70,13 +70,26 @@ class VanillaFunGen(ViewGen):
self.first_tier_parameters = first_tier_parameters
self.n_jobs = n_jobs
self.doc_projector = NaivePolylingualClassifier(base_learner=self.learners,
parameters=self.first_tier_parameters, n_jobs=self.n_jobs)
parameters=self.first_tier_parameters,
n_jobs=self.n_jobs)
self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
# Zero shot parameters
self.zero_shot = zero_shot
if train_langs is None:
train_langs = ['it']
self.train_langs = train_langs
def fit(self, lX, lY):
print('# Fitting VanillaFunGen (X)...')
lX = self.vectorizer.fit_transform(lX)
self.doc_projector.fit(lX, lY)
def fit(self, lX, ly):
print('\n# Fitting VanillaFunGen (X)...')
if self.zero_shot:
print(f'# Zero-shot setting! Training langs will be set to: {sorted(self.train_langs)}')
self.langs = sorted(self.train_langs)
lX = self.zero_shot_experiments(lX)
ly = self.zero_shot_experiments(ly)
lX = self.vectorizer.fit_transform(lX)
else:
lX = self.vectorizer.fit_transform(lX)
self.doc_projector.fit(lX, ly)
return self
def transform(self, lX):
@ -94,13 +107,27 @@ class VanillaFunGen(ViewGen):
def fit_transform(self, lX, ly):
return self.fit(lX, ly).transform(lX)
def zero_shot_experiments(self, lX):
_lX = {}
for lang in self.langs:
if lang in self.train_langs:
_lX[lang] = lX[lang]
else:
_lX[lang] = None
lX = _lX
return lX
def set_zero_shot(self, val: bool):
self.zero_shot = val
return
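A toy illustration of what zero_shot_experiments (added with the same body to VanillaFunGen, MuseGen and WordClassGen) does to a language-keyed dictionary: languages outside train_langs keep their key but are mapped to None, so downstream code can skip them (cf. the vectorization guard in MuseGen.transform). The helper below is a standalone rendition with placeholder data, not the repository function itself.

```python
def mask_zero_shot(lX, langs, train_langs):
    # keep entries for training languages, map held-out languages to None
    return {lang: (lX[lang] if lang in train_langs else None) for lang in langs}

lX = {'da': 'docs_da', 'en': 'docs_en', 'it': 'docs_it'}
print(mask_zero_shot(lX, langs=['da', 'en', 'it'], train_langs=['da', 'it']))
# {'da': 'docs_da', 'en': None, 'it': 'docs_it'}
```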
class MuseGen(ViewGen):
"""
View Generator (m): generates document representation via MUSE embeddings (Fasttext multilingual word
embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings.
"""
def __init__(self, muse_dir='../embeddings', n_jobs=-1):
def __init__(self, muse_dir='../embeddings', zero_shot=False, train_langs: list = None, n_jobs=-1):
"""
Init the MuseGen.
:param muse_dir: string, path to folder containing muse embeddings
@ -112,6 +139,11 @@ class MuseGen(ViewGen):
self.langs = None
self.lMuse = None
self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
# Zero shot parameters
self.zero_shot = zero_shot
if train_langs is None:
train_langs = ['it']
self.train_langs = train_langs
def fit(self, lX, ly):
"""
@ -120,7 +152,9 @@ class MuseGen(ViewGen):
:param ly: dict {lang: target vectors}
:return: self.
"""
print('# Fitting MuseGen (M)...')
print('\n# Fitting MuseGen (M)...')
if self.zero_shot:
print(f'# Zero-shot setting! Training langs will be set to: {sorted(self.train_langs)}')
self.vectorizer.fit(lX)
self.langs = sorted(lX.keys())
self.lMuse = MuseLoader(langs=self.langs, cache=self.muse_dir)
@ -136,23 +170,42 @@ class MuseGen(ViewGen):
:param lX: dict {lang: indexed documents}
:return: document projection to the common latent space.
"""
lX = self.vectorizer.transform(lX)
# Zero-shot setting: mask languages outside train_langs and vectorize only the remaining ones
if self.zero_shot:
lX = self.zero_shot_experiments(lX)
lX = {l: self.vectorizer.vectorizer[l].transform(lX[l]) for l in self.langs if lX[l] is not None}
else:
lX = self.vectorizer.transform(lX)
XdotMUSE = Parallel(n_jobs=self.n_jobs)(
delayed(XdotM)(lX[lang], self.lMuse[lang], sif=True) for lang in self.langs)
lZ = {lang: XdotMUSE[i] for i, lang in enumerate(self.langs)}
delayed(XdotM)(lX[lang], self.lMuse[lang], sif=True) for lang in sorted(lX.keys()))
lZ = {lang: XdotMUSE[i] for i, lang in enumerate(sorted(lX.keys()))}
lZ = _normalize(lZ, l2=True)
return lZ
def fit_transform(self, lX, ly):
return self.fit(lX, ly).transform(lX)
def zero_shot_experiments(self, lX):
_lX = {}
for lang in self.langs:
if lang in self.train_langs:
_lX[lang] = lX[lang]
else:
_lX[lang] = None
lX = _lX
return lX
def set_zero_shot(self, val: bool):
self.zero_shot = val
return
class WordClassGen(ViewGen):
"""
View Generator (w): generates document representation via Word-Class-Embeddings.
Document embeddings are obtained via weighted sum of document's constituent embeddings.
"""
def __init__(self, n_jobs=-1):
def __init__(self, zero_shot=False, train_langs: list = None, n_jobs=-1):
"""
Init WordClassGen.
:param n_jobs: int, number of concurrent workers
@ -162,6 +215,11 @@ class WordClassGen(ViewGen):
self.langs = None
self.lWce = None
self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
# Zero shot parameters
self.zero_shot = zero_shot
if train_langs is None:
train_langs = ['it']
self.train_langs = train_langs
def fit(self, lX, ly):
"""
@ -170,11 +228,18 @@ class WordClassGen(ViewGen):
:param ly: dict {lang: target vectors}
:return: self.
"""
print('# Fitting WordClassGen (W)...')
lX = self.vectorizer.fit_transform(lX)
self.langs = sorted(lX.keys())
print('\n# Fitting WordClassGen (W)...')
if self.zero_shot:
print(f'# Zero-shot setting! Training langs will be set to: {sorted(self.train_langs)}')
self.langs = sorted(self.train_langs)
lX = self.zero_shot_experiments(lX)
lX = self.vectorizer.fit_transform(lX)
else:
lX = self.vectorizer.fit_transform(lX)
self.langs = sorted(lX.keys())
wce = Parallel(n_jobs=self.n_jobs)(
delayed(wce_matrix)(lX[lang], ly[lang]) for lang in self.langs)
delayed(wce_matrix)(lX[lang], ly[lang]) for lang in self.langs)
self.lWce = {l: wce[i] for i, l in enumerate(self.langs)}
# TODO: featureweight.fit()
return self
@ -188,14 +253,28 @@ class WordClassGen(ViewGen):
"""
lX = self.vectorizer.transform(lX)
XdotWce = Parallel(n_jobs=self.n_jobs)(
delayed(XdotM)(lX[lang], self.lWce[lang], sif=True) for lang in self.langs)
lWce = {l: XdotWce[i] for i, l in enumerate(self.langs)}
delayed(XdotM)(lX[lang], self.lWce[lang], sif=True) for lang in sorted(lX.keys()) if lang in self.lWce.keys())
lWce = {l: XdotWce[i] for i, l in enumerate(sorted(l for l in lX.keys() if l in self.lWce))}  # index only the languages actually transformed
lWce = _normalize(lWce, l2=True)
return lWce
def fit_transform(self, lX, ly):
return self.fit(lX, ly).transform(lX)
def zero_shot_experiments(self, lX):
_lX = {}
for lang in self.langs:
if lang in self.train_langs:
_lX[lang] = lX[lang]
else:
_lX[lang] = None
lX = _lX
return lX
def set_zero_shot(self, val: bool):
self.zero_shot = val
return
class RecurrentGen(ViewGen):
"""
@ -205,7 +284,7 @@ class RecurrentGen(ViewGen):
the network internal state at the second feed-forward layer level. Training metrics are logged via TensorBoard.
"""
def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, nepochs=50,
gpus=0, n_jobs=-1, patience=20, stored_path=None):
gpus=0, n_jobs=-1, patience=20, stored_path=None, zero_shot=False, train_langs: list = None):
"""
Init RecurrentGen.
:param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents
@ -239,15 +318,17 @@ class RecurrentGen(ViewGen):
self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1)
self.multilingualIndex.embedding_matrices(self.pretrained, supervised=self.wce)
self.model = self._init_model()
self.logger = TensorBoardLogger(save_dir='../tb_logs', name='rnn', default_hp_metric=False)
self.logger = TensorBoardLogger(save_dir='tb_logs', name='rnn', default_hp_metric=False)
self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00,
patience=self.patience, verbose=False, mode='max')
# patch EarlyStopping's mode_dict (shared at class level) so that mode 'max' compares with >= (torch.ge) against the best score
self.early_stop_callback.mode_dict['max'] = torch.ge
self.lr_monitor = LearningRateMonitor(logging_interval='epoch')
# Zero shot parameters
self.zero_shot = zero_shot
if train_langs is None:
train_langs = ['it']
self.train_langs = train_langs
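The torch.ge patch a few lines above (and the TODO in commit 612e90a584) targets the same problem: with the default comparison, a validation score that only ties the best one counts as no improvement and consumes patience. A minimal sketch of the difference, assuming pytorch_lightning's EarlyStopping keeps its comparison operators in mode_dict, as the patch relies on:

```python
import torch

best, current = torch.tensor(0.80), torch.tensor(0.80)
print(torch.gt(current, best))  # tensor(False): a tie counts as "no improvement" and eats patience
print(torch.ge(current, best))  # tensor(True):  a tie is treated as an improvement, so the wait counter resets
```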
def _init_model(self):
if self.stored_path:
lpretrained = self.multilingualIndex.l_embeddings()
@ -280,18 +361,16 @@ class RecurrentGen(ViewGen):
:param ly: dict {lang: target vectors}
:return: self.
"""
print('# Fitting RecurrentGen (G)...')
print('\n# Fitting RecurrentGen (G)...')
create_if_not_exist(self.logger.save_dir)
recurrentDataModule = RecurrentDataModule(self.multilingualIndex, batchsize=self.batch_size, n_jobs=self.n_jobs)
recurrentDataModule = RecurrentDataModule(self.multilingualIndex, batchsize=self.batch_size, n_jobs=self.n_jobs,
zero_shot=self.zero_shot, zscl_langs=self.train_langs)
trainer = Trainer(gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger, max_epochs=self.nepochs,
callbacks=[self.early_stop_callback, self.lr_monitor], checkpoint_callback=False)
callbacks=[self.early_stop_callback, self.lr_monitor], checkpoint_callback=False,
overfit_batches=0.01)  # NB: restricts training to 1% of the batches
# vanilla_torch_model = torch.load(
# '../_old_checkpoint/gru_viewgen_-rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle')
# self.model.linear0 = vanilla_torch_model.linear0
# self.model.linear1 = vanilla_torch_model.linear1
# self.model.linear2 = vanilla_torch_model.linear2
# self.model.rnn = vanilla_torch_model.rnn
if self.zero_shot:
print(f'# Zero-shot setting! Training langs will be set to: {sorted(self.train_langs)}')
trainer.fit(self.model, datamodule=recurrentDataModule)
trainer.test(self.model, datamodule=recurrentDataModule)
@ -303,6 +382,8 @@ class RecurrentGen(ViewGen):
:param lX: dict {lang: indexed documents}
:return: documents projected to the common latent space.
"""
if self.zero_shot:
lX = self.zero_shot_experiments(lX)
data = {}
for lang in lX.keys():
indexed = index(data=lX[lang],
@ -321,6 +402,16 @@ class RecurrentGen(ViewGen):
def fit_transform(self, lX, ly):
return self.fit(lX, ly).transform(lX)
def zero_shot_experiments(self, lX):
for lang in sorted(lX.keys()):
if lang not in self.train_langs:
lX.pop(lang)
return lX
def set_zero_shot(self, val: bool):
self.zero_shot = val
return
class BertGen(ViewGen):
"""
@ -328,7 +419,8 @@ class BertGen(ViewGen):
At inference time, the model returns the network internal state at the last original layer (i.e. 12th). Document
embeddings are the state associated with the "start" token. Training metrics are logged via TensorBoard.
"""
def __init__(self, multilingualIndex, batch_size=128, nepochs=50, gpus=0, n_jobs=-1, patience=5, stored_path=None):
def __init__(self, multilingualIndex, batch_size=128, nepochs=50, gpus=0, n_jobs=-1, patience=5, stored_path=None,
zero_shot=False, train_langs: list = None):
"""
Init Bert model
:param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents
@ -349,13 +441,20 @@ class BertGen(ViewGen):
self.stored_path = stored_path
self.model = self._init_model()
self.patience = patience
self.logger = TensorBoardLogger(save_dir='../tb_logs', name='bert', default_hp_metric=False)
# self.logger = TensorBoardLogger(save_dir='tb_logs', name='bert', default_hp_metric=False)
self.logger = CSVLogger(save_dir='csv_logs', name='bert')
self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00,
patience=self.patience, verbose=False, mode='max')
# patch EarlyStopping's mode_dict (shared at class level) so that mode 'max' compares with >= (torch.ge) against the best score
self.early_stop_callback.mode_dict['max'] = torch.ge
# Zero shot parameters
self.zero_shot = zero_shot
if train_langs is None:
train_langs = ['it']
self.train_langs = train_langs
def _init_model(self):
output_size = self.multilingualIndex.get_target_dim()
return BertModel(output_size=output_size, stored_path=self.stored_path, gpus=self.gpus)
@ -369,12 +468,21 @@ class BertGen(ViewGen):
:param ly: dict {lang: target vectors}
:return: self.
"""
print('# Fitting BertGen (B)...')
print('\n# Fitting BertGen (B)...')
create_if_not_exist(self.logger.save_dir)
self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1)
bertDataModule = BertDataModule(self.multilingualIndex, batchsize=self.batch_size, max_len=512)
trainer = Trainer(gradient_clip_val=1e-1, max_epochs=self.nepochs, gpus=self.gpus,
logger=self.logger, callbacks=[self.early_stop_callback], checkpoint_callback=False)
bertDataModule = BertDataModule(self.multilingualIndex, batchsize=self.batch_size, max_len=512,
zero_shot=self.zero_shot, zscl_langs=self.train_langs,
debug=False, max_samples=50)
if self.zero_shot:
print(f'# Zero-shot setting! Training langs will be set to: {sorted(self.train_langs)}')
trainer = Trainer(max_epochs=self.nepochs, gpus=self.gpus,
logger=self.logger,
callbacks=[self.early_stop_callback],
checkpoint_callback=False)
trainer.fit(self.model, datamodule=bertDataModule)
trainer.test(self.model, datamodule=bertDataModule)
return self
@ -385,6 +493,8 @@ class BertGen(ViewGen):
:param lX: dict {lang: indexed documents}
:return: documents projected to the common latent space.
"""
if self.zero_shot:
lX = self.zero_shot_experiments(lX)
data = tokenize(lX, max_len=512)
self.model.to('cuda' if self.gpus else 'cpu')
self.model.eval()
@ -394,3 +504,13 @@ class BertGen(ViewGen):
def fit_transform(self, lX, ly):
# we can assume the data has already been indexed for transform(), since fit() is called first
return self.fit(lX, ly).transform(lX)
def zero_shot_experiments(self, lX):
for lang in sorted(lX.keys()):
if lang not in self.train_langs:
lX.pop(lang)
return lX
def set_zero_shot(self, val: bool):
self.zero_shot = val
return
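Taken together, the zero-shot pieces added on this branch are meant to be used as in main.py: view generators are built with zero_shot=True and a restricted train_langs list, the whole architecture is fitted on those languages only, and the flag is switched off right before inference so that every test language still receives the views its embedders can provide. A condensed sketch (all names come from this diff; lX/ly/lXte are the dictionaries returned by MultilingualDataset, and the chosen languages are placeholders):

```python
train_langs = ['da', 'de', 'en']    # e.g. one of the configurations in run.sh

embedders = DocEmbedderList(embedder_list=[
    VanillaFunGen(base_learner=get_learner(calibrate=True), zero_shot=True, train_langs=train_langs),
    WordClassGen(zero_shot=True, train_langs=train_langs),
], probabilistic=True)
meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'),
                      meta_parameters=get_params(optimc=True), n_jobs=-1)
gfun = Funnelling(first_tier=embedders, meta_classifier=meta, n_jobs=-1)

gfun.fit(lX, ly)                # first-tier learners only ever see train_langs
gfun.set_zero_shot(val=False)   # switch the masking off before inference
ly_pred = gfun.predict(lXte)    # held-out languages are averaged over fewer views
```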