merged devel

This commit is contained in:
commit e78b1f8a30
@@ -0,0 +1,190 @@
from argparse import ArgumentParser

from src.data.dataset_builder import MultilingualDataset
from src.funnelling import *
from src.util.common import MultilingualIndex, get_params, get_method_name
from src.util.evaluation import evaluate
from src.util.results_csv import CSVlog
from src.view_generators import *


def main(args):
    assert args.post_embedder or args.muse_embedder or args.wce_embedder or args.gru_embedder or args.bert_embedder, \
        'empty set of document embeddings is not allowed!'

    print('Running generalized funnelling...')

    data = MultilingualDataset.load(args.dataset)
    # data.set_view(languages=['it', 'da'])
    data.show_dimensions()
    lX, ly = data.training()
    lXte, lyte = data.test()

    # Init multilingualIndex - mandatory when deploying Neural View Generators...
    if args.gru_embedder or args.bert_embedder:
        multilingualIndex = MultilingualIndex()
        lMuse = MuseLoader(langs=sorted(lX.keys()), cache=args.muse_dir)
        multilingualIndex.index(lX, ly, lXte, lyte, l_pretrained_vocabulary=lMuse.vocabulary())

    # Init ViewGenerators and append them to embedder_list
    embedder_list = []
    if args.post_embedder:
        posteriorEmbedder = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=args.n_jobs)
        embedder_list.append(posteriorEmbedder)

    if args.muse_embedder:
        museEmbedder = MuseGen(muse_dir=args.muse_dir, n_jobs=args.n_jobs)
        embedder_list.append(museEmbedder)

    if args.wce_embedder:
        wceEmbedder = WordClassGen(n_jobs=args.n_jobs)
        embedder_list.append(wceEmbedder)

    if args.gru_embedder:
        rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=args.rnn_wce,
                                   batch_size=args.batch_rnn, nepochs=args.nepochs_rnn, patience=args.patience_rnn,
                                   gpus=args.gpus, n_jobs=args.n_jobs)
        embedder_list.append(rnnEmbedder)

    if args.bert_embedder:
        bertEmbedder = BertGen(multilingualIndex, batch_size=args.batch_bert, nepochs=args.nepochs_bert,
                               patience=args.patience_bert, gpus=args.gpus, n_jobs=args.n_jobs)
        bertEmbedder.transform(lX)
        embedder_list.append(bertEmbedder)

    # Init DocEmbedderList (i.e., first-tier learners or view generators) and metaclassifier
    docEmbedders = DocEmbedderList(embedder_list=embedder_list, probabilistic=True)
    meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'),
                          meta_parameters=get_params(optimc=args.optimc))

    # Init Funnelling Architecture
    gfun = Funnelling(first_tier=docEmbedders, meta_classifier=meta)

    # Training ---------------------------------------
    print('\n[Training Generalized Funnelling]')
    time_init = time.time()
    gfun.fit(lX, ly)
    time_tr = round(time.time() - time_init, 3)
    print(f'Training completed in {time_tr} seconds!')

    # Testing ----------------------------------------
    print('\n[Testing Generalized Funnelling]')
    time_te = time.time()
    ly_ = gfun.predict(lXte)
    l_eval = evaluate(ly_true=lyte, ly_pred=ly_)
    time_te = round(time.time() - time_te, 3)
    print(f'Testing completed in {time_te} seconds!')

    # Logging ---------------------------------------
    print('\n[Results]')
    results = CSVlog(args.csv_dir)
    metrics = []
    for lang in lXte.keys():
        macrof1, microf1, macrok, microk = l_eval[lang]
        metrics.append([macrof1, microf1, macrok, microk])
        print(f'Lang {lang}: macro-F1 = {macrof1:.3f} micro-F1 = {microf1:.3f}')
        if results is not None:
            _id, _dataset = get_method_name(args)
            results.add_row(method='gfun',
                            setting=_id,
                            optimc=args.optimc,
                            sif='True',
                            zscore='True',
                            l2='True',
                            dataset=_dataset,
                            time_tr=time_tr,
                            time_te=time_te,
                            lang=lang,
                            macrof1=macrof1,
                            microf1=microf1,
                            macrok=macrok,
                            microk=microk,
                            notes='')
    print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3))

    overall_time = round(time.time() - time_init, 3)
    exit(f'\nExecuted in: {overall_time} seconds!')


if __name__ == '__main__':
    parser = ArgumentParser(description='Run generalized funnelling, A. Moreo, A. Pedrotti and F. Sebastiani')

    parser.add_argument('dataset', help='Path to the dataset')

    parser.add_argument('-o', '--output', dest='csv_dir', metavar='',
                        help='Result file (default csv_logs/gfun/gfun_results.csv)', type=str,
                        default='csv_logs/gfun/gfun_results.csv')

    parser.add_argument('-x', '--post_embedder', dest='post_embedder', action='store_true',
                        help='deploy posterior probabilities embedder to compute document embeddings',
                        default=False)

    parser.add_argument('-w', '--wce_embedder', dest='wce_embedder', action='store_true',
                        help='deploy (supervised) Word-Class embedder to compute document embeddings',
                        default=False)

    parser.add_argument('-m', '--muse_embedder', dest='muse_embedder', action='store_true',
                        help='deploy (pretrained) MUSE embedder to compute document embeddings',
                        default=False)

    parser.add_argument('-b', '--bert_embedder', dest='bert_embedder', action='store_true',
                        help='deploy multilingual Bert to compute document embeddings',
                        default=False)

    parser.add_argument('-g', '--gru_embedder', dest='gru_embedder', action='store_true',
                        help='deploy a GRU in order to compute document embeddings (a.k.a., RecurrentGen)',
                        default=False)

    parser.add_argument('-c', '--c_optimize', dest='optimc', action='store_true',
                        help='Optimize SVMs C hyperparameter at metaclassifier level',
                        default=False)

    parser.add_argument('-j', '--n_jobs', dest='n_jobs', type=int, metavar='',
                        help='number of parallel jobs (default is -1, all)',
                        default=-1)

    parser.add_argument('--nepochs_rnn', dest='nepochs_rnn', type=int, metavar='',
                        help='number of max epochs to train Recurrent embedder (i.e., -g), default 150',
                        default=150)

    parser.add_argument('--nepochs_bert', dest='nepochs_bert', type=int, metavar='',
                        help='number of max epochs to train Bert model (i.e., -b), default 10',
                        default=10)

    parser.add_argument('--patience_rnn', dest='patience_rnn', type=int, metavar='',
                        help='set early stop patience for the RecurrentGen, default 25',
                        default=25)

    parser.add_argument('--patience_bert', dest='patience_bert', type=int, metavar='',
                        help='set early stop patience for the BertGen, default 5',
                        default=5)

    parser.add_argument('--batch_rnn', dest='batch_rnn', type=int, metavar='',
                        help='set batchsize for the RecurrentGen, default 64',
                        default=64)

    parser.add_argument('--batch_bert', dest='batch_bert', type=int, metavar='',
                        help='set batchsize for the BertGen, default 4',
                        default=4)

    parser.add_argument('--muse_dir', dest='muse_dir', type=str, metavar='',
                        help='Path to the MUSE polylingual word embeddings (default embeddings/)',
                        default='embeddings/')

    parser.add_argument('--rnn_wce', dest='rnn_wce', action='store_true',
                        help='Deploy WCE embedding as embedding layer of the RecurrentGen',
                        default=False)

    parser.add_argument('--rnn_dir', dest='rnn_dir', type=str, metavar='',
                        help='Set the path to a pretrained RNN model (i.e., -g view generator)',
                        default=None)

    parser.add_argument('--bert_dir', dest='bert_dir', type=str, metavar='',
                        help='Set the path to a pretrained mBERT model (i.e., -b view generator)',
                        default=None)

    parser.add_argument('--gpus', metavar='', help='specifies how many GPUs to use per node',
                        default=None)

    args = parser.parse_args()
    main(args)

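As a quick orientation to the script added above, the following is a minimal, hypothetical sketch (not part of the commit) of the same pipeline wired by hand with two view generators. It reuses only names that already appear in main.py (MultilingualDataset, VanillaFunGen, MuseGen, DocEmbedderList, MetaClassifier, Funnelling, get_learner, get_params, evaluate) and assumes they are importable exactly as main.py imports them; the dataset path is a placeholder.

```python
# Hypothetical driver mirroring main.py above; assumes the same src.* modules.
from src.data.dataset_builder import MultilingualDataset
from src.funnelling import *          # DocEmbedderList, MetaClassifier, Funnelling, ...
from src.util.common import get_params
from src.util.evaluation import evaluate
from src.view_generators import *     # VanillaFunGen, MuseGen, ...

data = MultilingualDataset.load('path/to/dataset.pickle')  # placeholder path
lX, ly = data.training()
lXte, lyte = data.test()

# First tier: calibrated-SVM posterior probabilities + MUSE document embeddings
views = DocEmbedderList(
    embedder_list=[VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=-1),
                   MuseGen(muse_dir='embeddings/', n_jobs=-1)],
    probabilistic=True)

# Second tier: RBF-kernel meta-learner trained on the stacked views
meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'),
                      meta_parameters=get_params(optimc=True))

gfun = Funnelling(first_tier=views, meta_classifier=meta)
gfun.fit(lX, ly)
l_eval = evaluate(ly_true=lyte, ly_pred=gfun.predict(lXte))  # per-language metrics
```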
@@ -0,0 +1,52 @@
# Generalized Funnelling (gFun)

## Requirements
```commandline
transformers==2.11.0
pandas==0.25.3
numpy==1.17.4
joblib==0.14.0
tqdm==4.50.2
pytorch_lightning==1.1.2
torch==1.3.1
nltk==3.4.5
scipy==1.3.3
rdflib==4.2.2
torchtext==0.4.0
scikit_learn==0.24.1
```

## Usage
```commandline
usage: main.py [-h] [-o CSV_DIR] [-x] [-w] [-m] [-b] [-g] [-c] [-j N_JOBS]
               [--nepochs_rnn NEPOCHS_RNN] [--nepochs_bert NEPOCHS_BERT]
               [--patience_rnn PATIENCE_RNN] [--patience_bert PATIENCE_BERT]
               [--batch_rnn BATCH_RNN] [--batch_bert BATCH_BERT] [--muse_dir MUSE_DIR]
               [--rnn_wce] [--rnn_dir RNN_DIR] [--bert_dir BERT_DIR] [--gpus GPUS]
               dataset

Run generalized funnelling, A. Moreo, A. Pedrotti and F. Sebastiani (2020).

positional arguments:
  dataset               Path to the dataset

optional arguments:
  -h, --help            show this help message and exit
  -o, --output          result file (default csv_logs/gfun/gfun_results.csv)
  -x, --post_embedder   deploy posterior probabilities embedder to compute document embeddings
  -w, --wce_embedder    deploy (supervised) Word-Class embedder to compute document embeddings
  -m, --muse_embedder   deploy (pretrained) MUSE embedder to compute document embeddings
  -b, --bert_embedder   deploy multilingual Bert to compute document embeddings
  -g, --gru_embedder    deploy a GRU in order to compute document embeddings
  -c, --c_optimize      optimize SVMs C hyperparameter
  -j, --n_jobs          number of parallel jobs, default is -1, i.e., all
  --nepochs_rnn         number of max epochs to train Recurrent embedder (i.e., -g), default 150
  --nepochs_bert        number of max epochs to train Bert model (i.e., -b), default 10
  --patience_rnn        set early stop patience for the RecurrentGen, default 25
  --patience_bert       set early stop patience for the BertGen, default 5
  --batch_rnn           set batchsize for the RecurrentGen, default 64
  --batch_bert          set batchsize for the BertGen, default 4
  --muse_dir            path to the MUSE polylingual word embeddings (default embeddings/)
  --rnn_wce             deploy WCE embedding as embedding layer of the RecurrentGen
  --rnn_dir             set the path to a pretrained RNN model (i.e., -g view generator)
  --bert_dir            set the path to a pretrained mBERT model (i.e., -b view generator)
  --gpus                specifies how many GPUs to use per node
```

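For reference, a typical invocation (paths are placeholders) combines several view generators, e.g. `python main.py path/to/rcv1-2_dataset.pickle -x -m -w -c -j -1`, which stacks the posterior-probability, MUSE, and WCE views and optimizes the meta-classifier's C hyperparameter; adding `-g --gpus 1` or `-b --gpus 1` also trains the neural (GRU or mBERT) view generators.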
@@ -0,0 +1,12 @@
transformers==2.11.0
pandas==0.25.3
numpy==1.17.4
joblib==0.14.0
tqdm==4.50.2
pytorch_lightning==1.1.2
torch==1.3.1
nltk==3.4.5
scipy==1.3.3
rdflib==4.2.2
torchtext==0.4.0
scikit_learn==0.24.1

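The pin list above mirrors the Requirements section of the README; assuming it is the repository's `requirements.txt` (this view of the diff does not show the file name), the environment can be reproduced with `pip install -r requirements.txt`.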
@@ -0,0 +1,8 @@
#!/usr/bin/env bash

python main.py /home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -g --gpus 0

#for i in {0..10..1}
#do
# python main.py --gpus 0
#done

@@ -0,0 +1,222 @@
import numpy as np
import pytorch_lightning as pl
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

N_WORKERS = 8


class RecurrentDataset(Dataset):
    def __init__(self, lX, ly, lPad_index):
        """
        :param lX: dict {lang_id : np.ndarray} of indexed documents
        :param ly: dict {lang_id : np.ndarray} of target labels
        :param lPad_index: dict {lang_id : int} mapping each language to its padding index
        """
        self.lX = []
        self.ly = []
        self.lOffset = {}
        self.lPad_index = lPad_index

        for lang, data in lX.items():
            offset = [len(self.lX)]
            self.lX.extend(data)
            offset.append(len(self.lX))
            self.lOffset[lang] = offset

        for lang, target in ly.items():
            self.ly.extend(target)

    def __len__(self):
        return len(self.lX)

    def __getitem__(self, index):
        X = self.lX[index]
        y = self.ly[index]
        return X, y, index, self._get_lang(index)

    def _get_lang(self, index):
        for lang, l_range in self.lOffset.items():
            if index in range(l_range[0], l_range[1]):
                return lang

    def collate_fn(self, data):
        """
        Pads the batch and checks the consistency of the batch languages. Groups the items sampled
        from the Dataset class into a dict {lang : lang_batch}.
        :param data: list of (X, y, index, lang) tuples as returned by __getitem__
        :return: tuple (lX_batch, ly_batch) of dicts indexed by language
        """
        lX_batch = {}
        ly_batch = {}
        current_lang = data[0][-1]
        for d in data:
            if d[-1] == current_lang:
                if current_lang not in lX_batch.keys():
                    lX_batch[current_lang] = []
                    ly_batch[current_lang] = []
                lX_batch[current_lang].append(d[0])
                ly_batch[current_lang].append(d[1])
            else:
                current_lang = d[-1]
                lX_batch[current_lang] = []
                ly_batch[current_lang] = []
                lX_batch[current_lang].append(d[0])
                ly_batch[current_lang].append(d[1])

        for lang in lX_batch.keys():
            lX_batch[lang] = self.pad(lX_batch[lang], pad_index=self.lPad_index[lang],
                                      max_pad_length=self.define_pad_length(lX_batch[lang]))
            lX_batch[lang] = torch.LongTensor(lX_batch[lang])
            ly_batch[lang] = torch.FloatTensor(ly_batch[lang])

        return lX_batch, ly_batch

    @staticmethod
    def define_pad_length(index_list):
        lengths = [len(index) for index in index_list]
        return int(np.mean(lengths) + np.std(lengths))

    @staticmethod
    def pad(index_list, pad_index, max_pad_length=None):
        pad_length = np.max([len(index) for index in index_list])
        if max_pad_length is not None:
            pad_length = min(pad_length, max_pad_length)
        for i, indexes in enumerate(index_list):
            index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length]
        return index_list


class RecurrentDataModule(pl.LightningDataModule):
    """
    Pytorch Lightning Datamodule to be deployed with RecurrentGen.
    https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html
    """
    def __init__(self, multilingualIndex, batchsize=64, n_jobs=-1):
        """
        Init RecurrentDataModule.
        :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents
        indexed by language code.
        :param batchsize: int, number of samples per batch.
        :param n_jobs: int, number of concurrent workers to be deployed (i.e., parallelizing data loading).
        """
        self.multilingualIndex = multilingualIndex
        self.batchsize = batchsize
        self.n_jobs = n_jobs
        super().__init__()

    def prepare_data(self, *args, **kwargs):
        pass

    def setup(self, stage=None):
        if stage == 'fit' or stage is None:
            l_train_index, l_train_target = self.multilingualIndex.l_train()
            # Debug settings: reducing number of samples
            # l_train_index = {l: train[:5] for l, train in l_train_index.items()}
            # l_train_target = {l: target[:5] for l, target in l_train_target.items()}

            self.training_dataset = RecurrentDataset(l_train_index, l_train_target,
                                                     lPad_index=self.multilingualIndex.l_pad())

            l_val_index, l_val_target = self.multilingualIndex.l_val()
            # Debug settings: reducing number of samples
            # l_val_index = {l: train[:5] for l, train in l_val_index.items()}
            # l_val_target = {l: target[:5] for l, target in l_val_target.items()}

            self.val_dataset = RecurrentDataset(l_val_index, l_val_target,
                                                lPad_index=self.multilingualIndex.l_pad())
        if stage == 'test' or stage is None:
            l_test_index, l_test_target = self.multilingualIndex.l_test()
            # Debug settings: reducing number of samples
            # l_test_index = {l: train[:5] for l, train in l_test_index.items()}
            # l_test_target = {l: target[:5] for l, target in l_test_target.items()}

            self.test_dataset = RecurrentDataset(l_test_index, l_test_target,
                                                 lPad_index=self.multilingualIndex.l_pad())

    def train_dataloader(self):
        return DataLoader(self.training_dataset, batch_size=self.batchsize, num_workers=N_WORKERS,
                          collate_fn=self.training_dataset.collate_fn)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batchsize, num_workers=N_WORKERS,
                          collate_fn=self.val_dataset.collate_fn)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batchsize, num_workers=N_WORKERS,
                          collate_fn=self.test_dataset.collate_fn)


def tokenize(l_raw, max_len):
    """
    Run Bert tokenization on a dict {lang: list of samples}.
    :param l_raw: dict {lang: list of raw documents}
    :param max_len: int, maximum number of tokens per document
    :return: dict {lang: list of input_ids}
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    l_tokenized = {}
    for lang in l_raw.keys():
        output_tokenizer = tokenizer(l_raw[lang], truncation=True, max_length=max_len, padding='max_length')
        l_tokenized[lang] = output_tokenizer['input_ids']
    return l_tokenized


class BertDataModule(RecurrentDataModule):
    """
    Pytorch Lightning Datamodule to be deployed with BertGen.
    https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html
    """
    def __init__(self, multilingualIndex, batchsize=64, max_len=512):
        """
        Init BertDataModule.
        :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents
        indexed by language code.
        :param batchsize: int, number of samples per batch.
        :param max_len: int, max number of tokens per document. Absolute cap is 512.
        """
        super().__init__(multilingualIndex, batchsize)
        self.max_len = max_len

    def setup(self, stage=None):
        if stage == 'fit' or stage is None:
            l_train_raw, l_train_target = self.multilingualIndex.l_train_raw()
            # Debug settings: reducing number of samples
            # l_train_raw = {l: train[:5] for l, train in l_train_raw.items()}
            # l_train_target = {l: target[:5] for l, target in l_train_target.items()}

            l_train_index = tokenize(l_train_raw, max_len=self.max_len)
            self.training_dataset = RecurrentDataset(l_train_index, l_train_target,
                                                     lPad_index=self.multilingualIndex.l_pad())

            l_val_raw, l_val_target = self.multilingualIndex.l_val_raw()
            # Debug settings: reducing number of samples
            # l_val_raw = {l: train[:5] for l, train in l_val_raw.items()}
            # l_val_target = {l: target[:5] for l, target in l_val_target.items()}

            l_val_index = tokenize(l_val_raw, max_len=self.max_len)
            self.val_dataset = RecurrentDataset(l_val_index, l_val_target,
                                                lPad_index=self.multilingualIndex.l_pad())

        if stage == 'test' or stage is None:
            l_test_raw, l_test_target = self.multilingualIndex.l_test_raw()
            # Debug settings: reducing number of samples
            # l_test_raw = {l: train[:5] for l, train in l_test_raw.items()}
            # l_test_target = {l: target[:5] for l, target in l_test_target.items()}

            l_test_index = tokenize(l_test_raw, max_len=self.max_len)
            self.test_dataset = RecurrentDataset(l_test_index, l_test_target,
                                                 lPad_index=self.multilingualIndex.l_pad())

    def train_dataloader(self):
        """
        NB: setting num_workers > 0 causes "OSError: [Errno 24] Too many open files"
        :return: DataLoader over the training split
        """
        return DataLoader(self.training_dataset, batch_size=self.batchsize)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batchsize)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batchsize)

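To make the collate logic above concrete, here is a small, self-contained check (not part of the commit) of how RecurrentDataset groups a batch by language and pads it. The import path is an assumption, since this view of the diff does not show the file name; adjust it to wherever the module above lives in the repository.

```python
# Toy check of RecurrentDataset.collate_fn: two languages, variable-length index
# lists, binary multi-label targets.
from src.data.datamodule import RecurrentDataset  # hypothetical module path

lX = {'en': [[5, 7, 9], [4, 2]], 'it': [[11, 3, 8, 6]]}
ly = {'en': [[1, 0], [0, 1]], 'it': [[1, 1]]}
lPad_index = {'en': 0, 'it': 0}

dataset = RecurrentDataset(lX, ly, lPad_index=lPad_index)

# Simulate what the DataLoader hands to collate_fn: a list of __getitem__ outputs
batch = [dataset[i] for i in range(len(dataset))]
lX_batch, ly_batch = dataset.collate_fn(batch)

for lang, X in lX_batch.items():
    print(lang, X.shape, ly_batch[lang].shape)
# 'en' -> LongTensor of shape (2, 3) with the shorter document left-padded with 0,
# 'it' -> LongTensor of shape (1, 4); targets come back as FloatTensors per language.
```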
@@ -1,19 +1,20 @@
from os.path import join, exists
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from data.reader.jrcacquis_reader import *
from data.languages import lang_set, NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING
from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2, fetch_topic_hierarchy
from data.text_preprocessor import NLTKStemTokenizer, preprocess_documents
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.sparse import issparse
import itertools
from tqdm import tqdm
import re
from os.path import exists

import numpy as np
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from scipy.sparse import issparse
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm

from src.data.languages import NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING
from src.data.reader.jrcacquis_reader import *
from src.data.reader.rcv_reader import fetch_RCV1, fetch_RCV2
from src.data.text_preprocessor import NLTKStemTokenizer, preprocess_documents


class MultilingualDataset:

@@ -1,19 +1,22 @@
from __future__ import print_function
import os, sys
from os.path import join

import os
import pickle
import sys
import tarfile
import xml.etree.ElementTree as ET
from sklearn.datasets import get_data_home
import pickle
from util.file import download_file, list_dirs, list_files
import zipfile
from collections import Counter
from os.path import join
from random import shuffle

import rdflib
from rdflib.namespace import RDF, SKOS
from rdflib import URIRef
import zipfile
from data.languages import JRC_LANGS
from collections import Counter
from random import shuffle
from data.languages import lang_set
from sklearn.datasets import get_data_home

from src.data.languages import JRC_LANGS
from src.data.languages import lang_set
from src.util.file import download_file, list_dirs, list_files

"""
JRC Acquis' Nomenclature:

@@ -1,15 +1,12 @@
from zipfile import ZipFile
import xml.etree.ElementTree as ET
from data.languages import RCV2_LANGS_WITH_NLTK_STEMMING, RCV2_LANGS
from util.file import list_files
from sklearn.datasets import get_data_home
import gzip
from os.path import join, exists
from util.file import download_file_if_not_exists
import re
from collections import Counter
import xml.etree.ElementTree as ET
from os.path import join, exists
from zipfile import ZipFile

import numpy as np
import sys

from src.util.file import download_file_if_not_exists
from src.util.file import list_files

"""
RCV2's Nomenclature:

@@ -1,15 +1,17 @@
from __future__ import print_function

# import ijson
# from ijson.common import ObjectBuilder
import os, sys
from os.path import join
from bz2 import BZ2File
import os
import pickle
from util.file import list_dirs, list_files, makedirs_if_not_exist
from itertools import islice
import re
from bz2 import BZ2File
from itertools import islice
from os.path import join
from xml.sax.saxutils import escape

import numpy as np
from util.file import list_dirs, list_files

policies = ["IN_ALL_LANGS", "IN_ANY_LANG"]

@@ -1,8 +1,9 @@
from nltk.corpus import stopwords
from data.languages import NLTK_LANGMAP
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from src.data.languages import NLTK_LANGMAP


def preprocess_documents(documents, lang):
    tokens = NLTKStemTokenizer(lang, verbose=True)

@@ -1,8 +1,9 @@
import math

import numpy as np
from scipy.stats import t
from joblib import Parallel, delayed
from scipy.sparse import csr_matrix, csc_matrix
from scipy.stats import t


def get_probs(tpr, fpr, pc):

@@ -1,66 +0,0 @@
import os
from torchtext.vocab import Vectors
import torch
from abc import ABC, abstractmethod
from util.SIF_embed import *


class PretrainedEmbeddings(ABC):

    def __init__(self):
        super().__init__()

    @abstractmethod
    def vocabulary(self): pass

    @abstractmethod
    def dim(self): pass

    @classmethod
    def reindex(cls, words, word2index):
        if isinstance(words, dict):
            words = list(zip(*sorted(words.items(), key=lambda x: x[1])))[0]

        source_idx, target_idx = [], []
        for i, word in enumerate(words):
            if word not in word2index: continue
            j = word2index[word]
            source_idx.append(i)
            target_idx.append(j)
        source_idx = np.asarray(source_idx)
        target_idx = np.asarray(target_idx)
        return source_idx, target_idx


class FastTextWikiNews(Vectors):

    url_base = 'Cant auto-download MUSE embeddings'
    path = '../embeddings/wiki.multi.{}.vec'
    _name = '/wiki.multi.{}.vec'

    def __init__(self, cache, language="en", **kwargs):
        url = self.url_base.format(language)
        name = cache + self._name.format(language)
        super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)


class FastTextMUSE(PretrainedEmbeddings):
    def __init__(self, path, lang, limit=None):
        super().__init__()
        assert os.path.exists(path), print(f'pre-trained vectors not found in {path}')
        self.embed = FastTextWikiNews(path, lang, max_vectors=limit)

    def vocabulary(self):
        return set(self.embed.stoi.keys())

    def dim(self):
        return self.embed.dim

    def extract(self, words):
        source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
        extraction = torch.zeros((len(words), self.dim()))
        extraction[source_idx] = self.embed.vectors[target_idx]
        return extraction

@ -1,102 +0,0 @@
|
|||
from abc import ABC, abstractmethod
|
||||
import torch, torchtext
|
||||
# import gensim
|
||||
# import os
|
||||
import numpy as np
|
||||
|
||||
|
||||
# class KeyedVectors:
|
||||
#
|
||||
# def __init__(self, word2index, weights):
|
||||
# assert len(word2index)==weights.shape[0], 'wrong number of dimensions'
|
||||
# index2word = {i:w for w,i in word2index.items()}
|
||||
# assert len([i for i in range(len(index2word)) if i not in index2word])==0, 'gaps in indexing not allowed'
|
||||
# self.word2index = word2index
|
||||
# self.index2word = index2word
|
||||
# self.weights = weights
|
||||
#
|
||||
# def extract(self, words):
|
||||
# dim = self.weights.shape[1]
|
||||
# v_size = len(words)
|
||||
#
|
||||
# source_idx, target_idx = [], []
|
||||
# for i,word in enumerate(words):
|
||||
# if word not in self.word2index: continue
|
||||
# j = self.word2index[word]
|
||||
# source_idx.append(i)
|
||||
# target_idx.append(j)
|
||||
#
|
||||
# extraction = np.zeros((v_size, dim))
|
||||
# extraction[np.asarray(source_idx)] = self.weights[np.asarray(target_idx)]
|
||||
#
|
||||
# return extraction
|
||||
|
||||
|
||||
# class PretrainedEmbeddings(ABC):
|
||||
#
|
||||
# def __init__(self):
|
||||
# super().__init__()
|
||||
#
|
||||
# @abstractmethod
|
||||
# def vocabulary(self): pass
|
||||
#
|
||||
# @abstractmethod
|
||||
# def dim(self): pass
|
||||
#
|
||||
# @classmethod
|
||||
# def reindex(cls, words, word2index):
|
||||
# source_idx, target_idx = [], []
|
||||
# for i, word in enumerate(words):
|
||||
# if word not in word2index: continue
|
||||
# j = word2index[word]
|
||||
# source_idx.append(i)
|
||||
# target_idx.append(j)
|
||||
# source_idx = np.asarray(source_idx)
|
||||
# target_idx = np.asarray(target_idx)
|
||||
# return source_idx, target_idx
|
||||
|
||||
|
||||
# class GloVe(PretrainedEmbeddings):
|
||||
#
|
||||
# def __init__(self, setname='840B', path='./vectors_cache', max_vectors=None):
|
||||
# super().__init__()
|
||||
# print(f'Loading GloVe pretrained vectors from torchtext')
|
||||
# self.embed = torchtext.vocab.GloVe(setname, cache=path, max_vectors=max_vectors)
|
||||
# print('Done')
|
||||
#
|
||||
# def vocabulary(self):
|
||||
# return set(self.embed.stoi.keys())
|
||||
#
|
||||
# def dim(self):
|
||||
# return self.embed.dim
|
||||
#
|
||||
# def extract(self, words):
|
||||
# source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
|
||||
# extraction = torch.zeros((len(words), self.dim()))
|
||||
# extraction[source_idx] = self.embed.vectors[target_idx]
|
||||
# return extraction
|
||||
|
||||
|
||||
# class Word2Vec(PretrainedEmbeddings):
|
||||
#
|
||||
# def __init__(self, path, limit=None):
|
||||
# super().__init__()
|
||||
# print(f'Loading word2vec pretrained vectors from {path}')
|
||||
# assert os.path.exists(path), print(f'pre-trained keyed vectors not found in {path}')
|
||||
# self.embed = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True, limit=limit)
|
||||
# self.word2index={w:i for i,w in enumerate(self.embed.index2word)}
|
||||
# print('Done')
|
||||
#
|
||||
# def vocabulary(self):
|
||||
# return set(self.word2index.keys())
|
||||
#
|
||||
# def dim(self):
|
||||
# return self.embed.vector_size
|
||||
#
|
||||
# def extract(self, words):
|
||||
# source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.word2index)
|
||||
# extraction = np.zeros((len(words), self.dim()))
|
||||
# extraction[source_idx] = self.embed.vectors[target_idx]
|
||||
# extraction = torch.from_numpy(extraction).float()
|
||||
# return extraction
|
||||
|
|
@ -1,74 +0,0 @@
|
|||
from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square
|
||||
import numpy as np
|
||||
|
||||
|
||||
def zscores(x, axis=0): #scipy.stats.zscores does not avoid division by 0, which can indeed occur
|
||||
std = np.clip(np.std(x, ddof=1, axis=axis), 1e-5, None)
|
||||
mean = np.mean(x, axis=axis)
|
||||
return (x - mean) / std
|
||||
|
||||
|
||||
def supervised_embeddings_tfidf(X,Y):
|
||||
tfidf_norm = X.sum(axis=0)
|
||||
tfidf_norm[tfidf_norm==0] = 1
|
||||
F = (X.T).dot(Y) / tfidf_norm.T
|
||||
return F
|
||||
|
||||
|
||||
def supervised_embeddings_ppmi(X,Y):
|
||||
Xbin = X>0
|
||||
D = X.shape[0]
|
||||
Pxy = (Xbin.T).dot(Y)/D
|
||||
Px = Xbin.sum(axis=0)/D
|
||||
Py = Y.sum(axis=0)/D
|
||||
F = np.asarray(Pxy/(Px.T*Py))
|
||||
F = np.maximum(F, 1.0)
|
||||
F = np.log(F)
|
||||
return F
|
||||
|
||||
|
||||
def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents=25000):
|
||||
D = X.shape[0]
|
||||
if D>max_documents:
|
||||
print(f'sampling {max_documents}')
|
||||
random_sample = np.random.permutation(D)[:max_documents]
|
||||
X = X[random_sample]
|
||||
Y = Y[random_sample]
|
||||
cell_matrix = get_supervised_matrix(X, Y)
|
||||
F = get_tsr_matrix(cell_matrix, tsr_score_funtion=tsr_function).T
|
||||
return F
|
||||
|
||||
|
||||
def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, lang='None', binary_structural_problems=-1, method='dotn', dozscore=True):
|
||||
if max_label_space != 0:
|
||||
print('computing supervised embeddings...')
|
||||
nC = Y.shape[1]
|
||||
|
||||
if method=='ppmi':
|
||||
F = supervised_embeddings_ppmi(X, Y)
|
||||
elif method == 'dotn':
|
||||
F = supervised_embeddings_tfidf(X, Y)
|
||||
elif method == 'ig':
|
||||
F = supervised_embeddings_tsr(X, Y, information_gain)
|
||||
elif method == 'chi2':
|
||||
F = supervised_embeddings_tsr(X, Y, chi_square)
|
||||
|
||||
if dozscore:
|
||||
F = zscores(F, axis=0)
|
||||
|
||||
# Dumping F-matrix for further studies
|
||||
dump_it = False
|
||||
if dump_it:
|
||||
with open(f'/home/andreapdr/funneling_pdr/src/dumps/WCE_{lang}.tsv', 'w') as outfile:
|
||||
np.savetxt(outfile, F, delimiter='\t')
|
||||
with open(f'/home/andreapdr/funneling_pdr/src/dumps/dict_WCE_{lang}.tsv', 'w') as outfile:
|
||||
for token in voc.keys():
|
||||
outfile.write(token+'\n')
|
||||
|
||||
return F
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,11 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
|
||||
logfile=../log/log10run_dl_jrc.csv
|
||||
|
||||
runs='0 1 2 3 4 5 6 7 8 9'
|
||||
for run in $runs
|
||||
do
|
||||
dataset=$dataset_path$run.pickle
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
|
||||
done
|
|
@ -1,11 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
|
||||
logfile=../log/log10run_dl_rcv.csv
|
||||
|
||||
runs='0 1 2 3 4 5 6 7 8 9'
|
||||
for run in $runs
|
||||
do
|
||||
dataset=$dataset_path$run.pickle
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
|
||||
done
|
|
@ -1,12 +0,0 @@
|
|||
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
|
||||
logfile=./results/10run_jrc_final_results.csv
|
||||
|
||||
runs='0 1 2 3 4 5 6 7 8 9'
|
||||
for run in $runs
|
||||
do
|
||||
dataset=$dataset_path$run.pickle
|
||||
python main_multimodal_cls.py $dataset -o $logfile -P -z -c --l2
|
||||
python main_multimodal_cls.py $dataset -o $logfile -S -z -c --l2
|
||||
python main_multimodal_cls.py $dataset -o $logfile -U -z -c --l2
|
||||
|
||||
done
|
|
@ -1,16 +0,0 @@
|
|||
dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
|
||||
logfile=./results/funnelling_10run_jrc_CIKM.csv
|
||||
|
||||
runs='6 7 8 9' #0 1 2 3 4 5
|
||||
for run in $runs
|
||||
do
|
||||
dataset=$dataset_path$run.pickle
|
||||
#python main_gFun.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 Pr(views) concatenated (done up to run5)
|
||||
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 views concatenated
|
||||
#python main_gFun.py $dataset -o $logfile -P -U -S -c -r -a -z --l2 --allprob
|
||||
#python main_gFun.py $dataset -o $logfile -P -U -c -r -a -z --l2 --allprob
|
||||
#python main_gFun.py $dataset -o $logfile -P -S -c -r -z --l2 --allprob
|
||||
#python main_gFun.py $dataset -o $logfile -P -U -c -r -z --l2 --allprob
|
||||
#python main_gFun.py $dataset -o $logfile -c -P -U -r -z --l2
|
||||
#python main_gFun.py $dataset -o $logfile -c -P -U -S -r -z --l2
|
||||
done
|
|
@ -1,15 +0,0 @@
|
|||
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
|
||||
logfile=./results/10run_rcv_final_results.csv
|
||||
|
||||
runs='0 1 2 3 4 5 6 7 8 9'
|
||||
|
||||
for run in $runs
|
||||
do
|
||||
dataset=$dataset_path$run.pickle
|
||||
python main_multimodal_cls.py $dataset -o $logfile -P -z -c --l2
|
||||
python main_multimodal_cls.py $dataset -o $logfile -S -z -c --l2
|
||||
python main_multimodal_cls.py $dataset -o $logfile -U -z -c --l2
|
||||
|
||||
done
|
||||
|
||||
|
|
@ -1,16 +0,0 @@
|
|||
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
|
||||
logfile=./results/funnelling_10run_rcv_CIKM_allprob_concatenated.csv
|
||||
|
||||
runs='0 1 2 3 4 5 6 7 8 9'
|
||||
for run in $runs
|
||||
do
|
||||
dataset=$dataset_path$run.pickle
|
||||
#python main_gFun.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 Pr(views) concatenated
|
||||
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 views concatenated
|
||||
#python main_gFun.py $dataset -o $logfile -P -U -c -r -a -z --l2 --allprob
|
||||
#python main_gFun.py $dataset -o $logfile -P -U -S -c -r -a -z --l2 --allprob
|
||||
#python main_gFun.py $dataset -o $logfile -P -S -c -r -z --l2 --allprob
|
||||
#python main_gFun.py $dataset -o $logfile -P -U -c -r -z --l2 --allprob
|
||||
#python main_gFun.py $dataset -o $logfile -c -P -U -r -z --l2
|
||||
#python main_gFun.py $dataset -o $logfile -c -P -U -S -r -z --l2
|
||||
done
|
|
@ -1,14 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run#
|
||||
|
||||
runs='1 2 3 4 5 6 7 8 9'
|
||||
for run in $runs
|
||||
do
|
||||
dataset=$dataset_path$run.pickle
|
||||
modelpath=/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run$runs
|
||||
python main_mbert_extractor.py --dataset $dataset --modelpath $modelpath
|
||||
done
|
||||
|
||||
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle
|
||||
python main_mbert_extractor.py --dataset $dataset --modelpath $modelpath
|
|
@ -1,329 +0,0 @@
|
|||
import argparse
|
||||
import torch.nn as nn
|
||||
from torch.optim.lr_scheduler import StepLR
|
||||
from dataset_builder import MultilingualDataset
|
||||
from learning.transformers import load_muse_embeddings
|
||||
from models.lstm_class import RNNMultilingualClassifier
|
||||
from util.csv_log import CSVLog
|
||||
from util.early_stop import EarlyStopping
|
||||
from util.common import *
|
||||
from util.file import create_if_not_exist
|
||||
from time import time
|
||||
from tqdm import tqdm
|
||||
from util.evaluation import evaluate
|
||||
from util.file import get_file_name
|
||||
# import pickle
|
||||
|
||||
allowed_nets = {'rnn'}
|
||||
|
||||
# instantiates the net, initializes the model parameters, and sets embeddings trainable if requested
|
||||
def init_Net(nC, multilingual_index, xavier_uniform=True):
|
||||
net=opt.net
|
||||
assert net in allowed_nets, f'{net} not supported, valid ones are={allowed_nets}'
|
||||
|
||||
# instantiate the required net
|
||||
if net=='rnn':
|
||||
only_post = opt.posteriors and (not opt.pretrained) and (not opt.supervised)
|
||||
if only_post:
|
||||
print('working on ONLY POST mode')
|
||||
model = RNNMultilingualClassifier(
|
||||
output_size=nC,
|
||||
hidden_size=opt.hidden,
|
||||
lvocab_size=multilingual_index.l_vocabsize(),
|
||||
learnable_length=opt.learnable,
|
||||
lpretrained=multilingual_index.l_embeddings(),
|
||||
drop_embedding_range=multilingual_index.sup_range,
|
||||
drop_embedding_prop=opt.sup_drop,
|
||||
post_probabilities=opt.posteriors,
|
||||
only_post=only_post,
|
||||
bert_embeddings=opt.mbert
|
||||
)
|
||||
|
||||
# weight initialization
|
||||
if xavier_uniform:
|
||||
for p in model.parameters():
|
||||
if p.dim() > 1 and p.requires_grad:
|
||||
nn.init.xavier_uniform_(p)
|
||||
|
||||
if opt.tunable:
|
||||
# this has to be performed *after* Xavier initialization is done,
|
||||
# otherwise the pretrained embedding parameters will be overrided
|
||||
model.finetune_pretrained()
|
||||
|
||||
return model.cuda()
|
||||
|
||||
|
||||
def set_method_name():
|
||||
method_name = f'{opt.net}(H{opt.hidden})'
|
||||
if opt.pretrained:
|
||||
method_name += f'-Muse'
|
||||
if opt.supervised:
|
||||
method_name += f'-WCE'
|
||||
if opt.posteriors:
|
||||
method_name += f'-Posteriors'
|
||||
if opt.mbert:
|
||||
method_name += f'-mBert'
|
||||
if (opt.pretrained or opt.supervised) and opt.tunable:
|
||||
method_name += '-(trainable)'
|
||||
else:
|
||||
method_name += '-(static)'
|
||||
if opt.learnable > 0:
|
||||
method_name += f'-Learnable{opt.learnable}'
|
||||
return method_name
|
||||
|
||||
|
||||
def init_optimizer(model, lr):
|
||||
return torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr, weight_decay=opt.weight_decay)
|
||||
|
||||
|
||||
def init_logfile(method_name, opt):
|
||||
logfile = CSVLog(opt.log_file, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse'])
|
||||
logfile.set_default('dataset', opt.dataset)
|
||||
logfile.set_default('run', opt.seed)
|
||||
logfile.set_default('method', method_name)
|
||||
assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} ' \
|
||||
f'and run {opt.seed} already calculated'
|
||||
return logfile
|
||||
|
||||
|
||||
# loads the MUSE embeddings if requested, or returns empty dictionaries otherwise
|
||||
def load_pretrained_embeddings(we_path, langs):
|
||||
lpretrained = lpretrained_vocabulary = none_dict(langs)
|
||||
if opt.pretrained:
|
||||
lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1)
|
||||
lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs}
|
||||
return lpretrained, lpretrained_vocabulary
|
||||
|
||||
|
||||
def get_lr(optimizer):
|
||||
for param_group in optimizer.param_groups:
|
||||
return param_group['lr']
|
||||
|
||||
|
||||
def train(model, batcher, ltrain_index, ltrain_posteriors, ltrain_bert, lytr, tinit, logfile, criterion, optim, epoch, method_name):
|
||||
_dataset_path = opt.dataset.split('/')[-1].split('_')
|
||||
dataset_id = _dataset_path[0] + _dataset_path[-1]
|
||||
|
||||
loss_history = []
|
||||
model.train()
|
||||
for idx, (batch, post, bert_emb, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, ltrain_bert, lytr)):
|
||||
optim.zero_grad()
|
||||
# _out = model(batch, post, bert_emb, lang)
|
||||
loss = criterion(model(batch, post, bert_emb, lang), target)
|
||||
loss.backward()
|
||||
clip_gradient(model)
|
||||
optim.step()
|
||||
loss_history.append(loss.item())
|
||||
|
||||
if idx % opt.log_interval == 0:
|
||||
interval_loss = np.mean(loss_history[-opt.log_interval:])
|
||||
print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
|
||||
|
||||
mean_loss = np.mean(interval_loss)
|
||||
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
|
||||
return mean_loss
|
||||
|
||||
|
||||
def test(model, batcher, ltest_index, ltest_posteriors, lte_bert, lyte, tinit, epoch, logfile, criterion, measure_prefix):
|
||||
|
||||
loss_history = []
|
||||
model.eval()
|
||||
langs = sorted(ltest_index.keys())
|
||||
predictions = {l:[] for l in langs}
|
||||
yte_stacked = {l:[] for l in langs}
|
||||
batcher.init_offset()
|
||||
for batch, post, bert_emb, target, lang in tqdm(batcher.batchify(ltest_index, ltest_posteriors, lte_bert, lyte), desc='evaluation: '):
|
||||
logits = model(batch, post, bert_emb, lang)
|
||||
loss = criterion(logits, target).item()
|
||||
prediction = predict(logits)
|
||||
predictions[lang].append(prediction)
|
||||
yte_stacked[lang].append(target.detach().cpu().numpy())
|
||||
loss_history.append(loss)
|
||||
|
||||
ly = {l:np.vstack(yte_stacked[l]) for l in langs}
|
||||
ly_ = {l:np.vstack(predictions[l]) for l in langs}
|
||||
l_eval = evaluate(ly, ly_)
|
||||
metrics = []
|
||||
for lang in langs:
|
||||
macrof1, microf1, macrok, microk = l_eval[lang]
|
||||
metrics.append([macrof1, microf1, macrok, microk])
|
||||
if measure_prefix == 'te':
|
||||
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
|
||||
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
|
||||
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
|
||||
|
||||
mean_loss = np.mean(loss_history)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit)
|
||||
|
||||
return Mf1
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------------------------
|
||||
def main():
|
||||
DEBUGGING = False
|
||||
|
||||
method_name = set_method_name()
|
||||
logfile = init_logfile(method_name, opt)
|
||||
|
||||
# Loading the dataset
|
||||
data = MultilingualDataset.load(opt.dataset)
|
||||
# data.set_view(languages=['it', 'fr']) # Testing with less langs
|
||||
data.show_dimensions()
|
||||
langs = data.langs()
|
||||
l_devel_raw, l_devel_target = data.training(target_as_csr=True)
|
||||
l_test_raw, l_test_target = data.test(target_as_csr=True)
|
||||
|
||||
# Loading the MUSE pretrained embeddings (only if requested)
|
||||
lpretrained, lpretrained_vocabulary = load_pretrained_embeddings(opt.we_path, langs)
|
||||
# lpretrained_vocabulary = none_dict(langs) # do not keep track of words known in pretrained embeddings vocabulary that are also present in test set
|
||||
|
||||
# Data preparation: indexing / splitting / embedding matrices (pretrained + supervised) / posterior probs
|
||||
multilingual_index = MultilingualIndex()
|
||||
multilingual_index.index(l_devel_raw, l_devel_target, l_test_raw, lpretrained_vocabulary)
|
||||
multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=opt.seed)
|
||||
multilingual_index.embedding_matrices(lpretrained, opt.supervised)
|
||||
if opt.posteriors:
|
||||
if DEBUGGING:
|
||||
import pickle
|
||||
with open('/home/andreapdr/funneling_pdr/dumps/posteriors_jrc_run0.pickle', 'rb') as infile:
|
||||
data_post = pickle.load(infile)
|
||||
lPtr = data_post[0]
|
||||
lPva = data_post[1]
|
||||
lPte = data_post[2]
|
||||
print('## DEBUGGING MODE: loaded dumped posteriors for jrc run0')
|
||||
else:
|
||||
lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=5000)
|
||||
else:
|
||||
lPtr, lPva, lPte = None, None, None
|
||||
|
||||
if opt.mbert:
|
||||
_dataset_path = opt.dataset.split('/')[-1].split('_')
|
||||
_model_folder = _dataset_path[0] + '_' + _dataset_path[-1].replace('.pickle', '')
|
||||
# print(f'Model Folder: {_model_folder}')
|
||||
|
||||
if DEBUGGING:
|
||||
with open('/home/andreapdr/funneling_pdr/dumps/mBert_jrc_run0.pickle', 'rb') as infile:
|
||||
data_embed = pickle.load(infile)
|
||||
tr_bert_embeddings = data_embed[0]
|
||||
va_bert_embeddings = data_embed[1]
|
||||
te_bert_embeddings = data_embed[2]
|
||||
print('## DEBUGGING MODE: loaded dumped mBert embeddings for jrc run0')
|
||||
else:
|
||||
tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings \
|
||||
= multilingual_index.bert_embeddings(f'/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-{_model_folder}/')
|
||||
else:
|
||||
tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings = None, None, None
|
||||
|
||||
# Model initialization
|
||||
model = init_Net(data.num_categories(), multilingual_index)
|
||||
|
||||
optim = init_optimizer(model, lr=opt.lr)
|
||||
criterion = torch.nn.BCEWithLogitsLoss().cuda()
|
||||
lr_scheduler = StepLR(optim, step_size=25, gamma=0.5)
|
||||
batcher_train = Batch(opt.batch_size, batches_per_epoch=10, languages=langs, lpad=multilingual_index.l_pad())
|
||||
batcher_eval = Batch(opt.batch_size, batches_per_epoch=-1, languages=langs, lpad=multilingual_index.l_pad())
|
||||
|
||||
tinit = time()
|
||||
create_if_not_exist(opt.checkpoint_dir)
|
||||
early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience,
|
||||
checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}')
|
||||
|
||||
l_train_index, l_train_target = multilingual_index.l_train()
|
||||
l_val_index, l_val_target = multilingual_index.l_val()
|
||||
l_test_index = multilingual_index.l_test_index()
|
||||
|
||||
print('-'*80)
|
||||
print('Start training')
|
||||
for epoch in range(1, opt.nepochs + 1):
|
||||
train(model, batcher_train, l_train_index, lPtr, tr_bert_embeddings, l_train_target, tinit, logfile, criterion, optim, epoch, method_name)
|
||||
lr_scheduler.step() # reduces the learning rate
|
||||
|
||||
# validation
|
||||
macrof1 = test(model, batcher_eval, l_val_index, lPva, va_bert_embeddings, l_val_target, tinit, epoch, logfile, criterion, 'va')
|
||||
early_stop(macrof1, epoch)
|
||||
if opt.test_each>0:
|
||||
if (opt.plotmode and (epoch==1 or epoch%opt.test_each==0)) or (not opt.plotmode and epoch%opt.test_each==0 and epoch<opt.nepochs):
|
||||
test(model, batcher_eval, l_test_index, lPte, l_test_target, tinit, epoch, logfile, criterion, 'te')
|
||||
|
||||
if early_stop.STOP:
|
||||
print('[early-stop] STOP')
|
||||
if not opt.plotmode: # with plotmode activated, early-stop is ignored
|
||||
break
|
||||
|
||||
# training is over
|
||||
# restores the best model according to the Mf1 of the validation set (only when plotmode==False)
|
||||
# stoptime = early_stop.stop_time - tinit
|
||||
# stopepoch = early_stop.best_epoch
|
||||
# logfile.add_row(epoch=stopepoch, measure=f'early-stop', value=early_stop.best_score, timelapse=stoptime)
|
||||
|
||||
if opt.plotmode==False:
|
||||
print('-' * 80)
|
||||
print('Training over. Performing final evaluation')
|
||||
|
||||
# torch.cuda.empty_cache()
|
||||
model = early_stop.restore_checkpoint()
|
||||
|
||||
if opt.val_epochs>0:
|
||||
print(f'running last {opt.val_epochs} training epochs on the validation set')
|
||||
for val_epoch in range(1, opt.val_epochs + 1):
|
||||
batcher_train.init_offset()
|
||||
train(model, batcher_train, l_val_index, lPva, va_bert_embeddings, l_val_target, tinit, logfile, criterion, optim, epoch+val_epoch, method_name)
|
||||
|
||||
# final test
|
||||
print('Training complete: testing')
|
||||
test(model, batcher_eval, l_test_index, lPte, te_bert_embeddings, l_test_target, tinit, epoch, logfile, criterion, 'te')
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------------------------
|
||||
if __name__ == '__main__':
|
||||
|
||||
parser = argparse.ArgumentParser(description='Neural text classification with Word-Class Embeddings')
|
||||
parser.add_argument('dataset', type=str, metavar='datasetpath', help=f'path to the pickled dataset')
|
||||
parser.add_argument('--batch-size', type=int, default=50, metavar='int', help='input batch size (default: 100)')
|
||||
parser.add_argument('--batch-size-test', type=int, default=250, metavar='int', help='batch size for testing (default: 250)')
|
||||
parser.add_argument('--nepochs', type=int, default=200, metavar='int', help='number of epochs (default: 200)')
|
||||
parser.add_argument('--patience', type=int, default=10, metavar='int', help='patience for early-stop (default: 10)')
|
||||
parser.add_argument('--plotmode', action='store_true', default=False, help='in plot mode executes a long run in order '
|
||||
'to generate enough data to produce trend plots (test-each should be >0. This mode is '
|
||||
'used to produce plots, and does not perform an evaluation on the test set.')
|
||||
parser.add_argument('--hidden', type=int, default=512, metavar='int', help='hidden lstm size (default: 512)')
|
||||
parser.add_argument('--lr', type=float, default=1e-3, metavar='float', help='learning rate (default: 1e-3)')
|
||||
parser.add_argument('--weight_decay', type=float, default=0, metavar='float', help='weight decay (default: 0)')
|
||||
parser.add_argument('--sup-drop', type=float, default=0.5, metavar='[0.0, 1.0]', help='dropout probability for the supervised matrix (default: 0.5)')
|
||||
parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)')
|
||||
parser.add_argument('--svm-max-docs', type=int, default=1000, metavar='int', help='maximum number of documents by '
|
||||
'language used to train the calibrated SVMs (only used if --posteriors is active)')
|
||||
parser.add_argument('--log-interval', type=int, default=10, metavar='int', help='how many batches to wait before printing training status')
|
||||
parser.add_argument('--log-file', type=str, default='../log/log.csv', metavar='str', help='path to the log csv file')
|
||||
parser.add_argument('--test-each', type=int, default=0, metavar='int', help='how many epochs to wait before invoking test (default: 0, only at the end)')
|
||||
parser.add_argument('--checkpoint-dir', type=str, default='../checkpoint', metavar='str', help='path to the directory containing checkpoints')
|
||||
parser.add_argument('--net', type=str, default='rnn', metavar='str', help=f'net, one in {allowed_nets}')
|
||||
parser.add_argument('--pretrained', action='store_true', default=False, help='use MUSE pretrained embeddings')
|
||||
parser.add_argument('--supervised', action='store_true', default=False, help='use supervised embeddings')
|
||||
parser.add_argument('--posteriors', action='store_true', default=False, help='concatenate posterior probabilities to doc embeddings')
|
||||
parser.add_argument('--learnable', type=int, default=0, metavar='int', help='dimension of the learnable embeddings (default 0)')
|
||||
parser.add_argument('--val-epochs', type=int, default=1, metavar='int', help='number of training epochs to perform on the '
|
||||
'validation set once training is over (default 1)')
|
||||
parser.add_argument('--we-path', type=str, default='../embeddings', metavar='str',
|
||||
help=f'path to MUSE pretrained embeddings')
|
||||
parser.add_argument('--max-label-space', type=int, default=300, metavar='int', help='larger dimension allowed for the '
|
||||
'feature-label embedding (if larger, then PCA with this number of components is applied '
|
||||
'(default 300)')
|
||||
parser.add_argument('--force', action='store_true', default=False, help='do not check if this experiment has already been run')
|
||||
parser.add_argument('--tunable', action='store_true', default=False,
|
||||
help='pretrained embeddings are tunable from the beginning (default False, i.e., static)')
|
||||
parser.add_argument('--mbert', action='store_true', default=False,
|
||||
help='use mBert embeddings')
|
||||
|
||||
opt = parser.parse_args()
|
||||
|
||||
assert torch.cuda.is_available(), 'CUDA not available'
|
||||
assert not opt.plotmode or opt.test_each > 0, 'plot mode implies --test-each>0'
|
||||
# if opt.pickle_dir: opt.pickle_path = join(opt.pickle_dir, f'{opt.dataset}.pickle')
|
||||
torch.manual_seed(opt.seed)
|
||||
|
||||
main()
|
|
@ -1,127 +0,0 @@
|
|||
import os
|
||||
from dataset_builder import MultilingualDataset
|
||||
from util.evaluation import *
|
||||
from optparse import OptionParser
|
||||
from util.file import exists
|
||||
from util.results import PolylingualClassificationResults
|
||||
from util.util import get_learner, get_params
|
||||
|
||||
parser = OptionParser()
|
||||
|
||||
parser.add_option("-d", "--dataset", dest="dataset",
|
||||
help="Path to the multilingual dataset processed and stored in .pickle format",
|
||||
default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
|
||||
|
||||
parser.add_option("-o", "--output", dest="output",
|
||||
help="Result file", type=str, default='./results/results.csv')
|
||||
|
||||
parser.add_option("-e", "--mode-embed", dest="mode_embed",
|
||||
help="Set the embedding to be used [none, unsupervised, supervised, both]", type=str, default='none')
|
||||
|
||||
parser.add_option("-w", "--we-path", dest="we_path",
|
||||
help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/')
|
||||
|
||||
parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str,
|
||||
default='MUSE')
|
||||
|
||||
parser.add_option("-s", "--set_c", dest="set_c",type=float,
|
||||
help="Set the C parameter", default=1)
|
||||
|
||||
parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
|
||||
help="Optimize hyperparameters", default=False)
|
||||
|
||||
parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,
|
||||
help="Number of parallel jobs (default is -1, all)", default=-1)
|
||||
|
||||
parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
|
||||
help="If smaller than number of target classes, PCA will be applied to supervised matrix. "
|
||||
"If set to 0 it will automatically search for the best number of components. "
|
||||
"If set to -1 it will apply PCA to the vstacked supervised matrix (PCA dim set to 50 atm)",
|
||||
default=300)
|
||||
|
||||
parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
|
||||
help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix."
|
||||
" If set to 0 it will automatically search for the best number of components", default=300)
|
||||
|
||||
parser.add_option("-l", dest="lang", type=str)
|
||||
|
||||
if __name__ == '__main__':
|
||||
(op, args) = parser.parse_args()
|
||||
|
||||
assert exists(op.dataset), 'Unable to find file '+str(op.dataset)
|
||||
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
|
||||
|
||||
dataset_file = os.path.basename(op.dataset)
|
||||
|
||||
results = PolylingualClassificationResults('./results/PLE_results.csv')
|
||||
|
||||
data = MultilingualDataset.load(op.dataset)
|
||||
data.show_dimensions()
|
||||
|
||||
# data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10)))
|
||||
# data.set_view(languages=[op.lang])
|
||||
# data.set_view(categories=list(range(10)))
|
||||
lXtr, lytr = data.training()
|
||||
lXte, lyte = data.test()
|
||||
|
||||
if op.set_c != -1:
|
||||
meta_parameters = None
|
||||
else:
|
||||
meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}]
|
||||
|
||||
# Embeddings and WCE config
|
||||
_available_mode = ['none', 'unsupervised', 'supervised', 'both']
|
||||
_available_type = ['MUSE', 'FastText']
|
||||
assert op.mode_embed in _available_mode, f'{op.mode_embed} not in {_available_mode}'
|
||||
assert op.we_type in _available_type, f'{op.we_type} not in {_available_type}'
|
||||
|
||||
if op.mode_embed == 'none':
|
||||
config = {'unsupervised': False,
|
||||
'supervised': False,
|
||||
'we_type': None}
|
||||
_config_id = 'None'
|
||||
elif op.mode_embed == 'unsupervised':
|
||||
config = {'unsupervised': True,
|
||||
'supervised': False,
|
||||
'we_type': op.we_type}
|
||||
_config_id = 'M'
|
||||
elif op.mode_embed == 'supervised':
|
||||
config = {'unsupervised': False,
|
||||
'supervised': True,
|
||||
'we_type': None}
|
||||
_config_id = 'F'
|
||||
elif op.mode_embed == 'both':
|
||||
config = {'unsupervised': True,
|
||||
'supervised': True,
|
||||
'we_type': op.we_type}
|
||||
_config_id = 'M+F'
|
||||
|
||||
config['reduction'] = 'PCA'
|
||||
config['max_label_space'] = op.max_labels_S
|
||||
config['dim_reduction_unsupervised'] = op.max_labels_U
|
||||
# config['post_pca'] = op.post_pca
|
||||
# config['plot_covariance_matrices'] = True
|
||||
|
||||
result_id = dataset_file + 'MLE_andrea' + _config_id + ('_optimC' if op.optimc else '')
|
||||
|
||||
ple = PolylingualEmbeddingsClassifier(wordembeddings_path='/home/andreapdr/CLESA/',
|
||||
config = config,
|
||||
learner=get_learner(calibrate=False),
|
||||
c_parameters=get_params(dense=False),
|
||||
n_jobs=op.n_jobs)
|
||||
|
||||
print('# Fitting ...')
|
||||
ple.fit(lXtr, lytr)
|
||||
|
||||
print('# Evaluating ...')
|
||||
ple_eval = evaluate_method(ple, lXte, lyte)
|
||||
|
||||
metrics = []
|
||||
for lang in lXte.keys():
|
||||
macrof1, microf1, macrok, microk = ple_eval[lang]
|
||||
metrics.append([macrof1, microf1, macrok, microk])
|
||||
print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
|
||||
results.add_row('MLE', 'svm', _config_id, config['we_type'],
|
||||
'no','no', op.optimc, op.dataset.split('/')[-1], ple.time,
|
||||
lang, macrof1, microf1, macrok, microk, '')
|
||||
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
|
|
@ -1,155 +0,0 @@
|
|||
import os
|
||||
from dataset_builder import MultilingualDataset
|
||||
# from learning.learners import *
|
||||
# from learning.learners import FunnellingMultimodal
|
||||
from learning.transformers import PosteriorProbabilitiesEmbedder, TfidfVectorizerMultilingual, WordClassEmbedder, MuseEmbedder, FeatureSet2Posteriors, Voting
|
||||
from util.evaluation import *
|
||||
from optparse import OptionParser
|
||||
from util.file import exists
|
||||
from util.results import PolylingualClassificationResults
|
||||
from sklearn.svm import SVC
|
||||
|
||||
parser = OptionParser()
|
||||
|
||||
# parser.add_option("-d", "--dataset", dest="dataset",
|
||||
# help="Path to the multilingual dataset processed and stored in .pickle format",
|
||||
# default="../rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
|
||||
|
||||
parser.add_option("-o", "--output", dest="output",
|
||||
help="Result file", type=str, default='./results/results.csv')
|
||||
|
||||
parser.add_option("-P", "--probs", dest="posteriors", action='store_true',
|
||||
help="Add posterior probabilities to the document embedding representation", default=False)
|
||||
|
||||
parser.add_option("-S", "--supervised", dest="supervised", action='store_true',
|
||||
help="Add supervised (Word-Class Embeddings) to the document embedding representation", default=False)
|
||||
|
||||
parser.add_option("-U", "--pretrained", dest="pretrained", action='store_true',
|
||||
help="Add pretrained MUSE embeddings to the document embedding representation", default=False)
|
||||
|
||||
parser.add_option("-w", "--we-path", dest="we_path",
|
||||
help="Path to the MUSE polylingual word embeddings", default='../embeddings')
|
||||
|
||||
parser.add_option("-s", "--set_c", dest="set_c",type=float,
|
||||
help="Set the C parameter", default=1)
|
||||
|
||||
parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
|
||||
help="Optimize hyperparameters", default=False)
|
||||
|
||||
parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,
|
||||
help="Number of parallel jobs (default is -1, all)", default=-1)
|
||||
|
||||
parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
|
||||
help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
|
||||
default=300)
|
||||
|
||||
parser.add_option("-r", "--remove-pc", dest="sif", action='store_true',
|
||||
help="Remove common component when computing dot product of word embedding matrices", default=False)
|
||||
|
||||
# parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
|
||||
# help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix."
|
||||
# " If set to 0 it will automatically search for the best number of components", default=300)
|
||||
|
||||
# parser.add_option("-a", dest="post_pca",
|
||||
# help="If set to True, will apply PCA to the z-space (posterior probabilities stacked along with "
|
||||
# "embedding space", default=False)
|
||||
|
||||
|
||||
def get_learner(calibrate=False, kernel='linear'):
|
||||
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, gamma='auto')
|
||||
|
||||
|
||||
def get_params(dense=False):
|
||||
if not op.optimc:
|
||||
return None
|
||||
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
|
||||
kernel = 'rbf' if dense else 'linear'
|
||||
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
|
||||
|
||||
#######################################################################################################################
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
(op, args) = parser.parse_args()
|
||||
|
||||
assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)'
|
||||
dataset = args[0]
|
||||
|
||||
assert exists(dataset), 'Unable to find file '+str(dataset)
|
||||
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
|
||||
assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
|
||||
|
||||
dataset_file = os.path.basename(dataset)
|
||||
|
||||
results = PolylingualClassificationResults(op.output)
|
||||
|
||||
data = MultilingualDataset.load(dataset)
|
||||
data.show_dimensions()
|
||||
|
||||
lXtr, lytr = data.training()
|
||||
lXte, lyte = data.test()
|
||||
|
||||
meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
|
||||
|
||||
# result_id = f'{dataset_file}_Prob{op.posteriors}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}'
|
||||
result_id = f'{dataset_file}_ProbPost={op.posteriors}_WCE={op.supervised}(PCA={op.max_labels_S})_' \
|
||||
f'MUSE={op.pretrained}_weight={"todo"}_l2={"todo"}_zscore={"todo"}{"_optimC" if op.optimc else ""}'
|
||||
print(f'{result_id}')
|
||||
|
||||
# text preprocessing
|
||||
tfidfvectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
|
||||
|
||||
lXtr = tfidfvectorizer.fit_transform(lXtr, lytr)
|
||||
lXte = tfidfvectorizer.transform(lXte)
|
||||
lV = tfidfvectorizer.vocabulary()
|
||||
|
||||
classifiers = []
|
||||
if op.posteriors:
|
||||
classifiers.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None))
|
||||
if op.supervised:
|
||||
classifiers.append(FeatureSet2Posteriors(WordClassEmbedder(max_label_space=op.max_labels_S)))
|
||||
if op.pretrained:
|
||||
classifiers.append(FeatureSet2Posteriors(MuseEmbedder(op.we_path, lV=lV)))
|
||||
|
||||
classifier = Voting(*classifiers)
|
||||
|
||||
print('# Fitting ...')
|
||||
classifier.fit(lXtr, lytr)
|
||||
|
||||
print('\n# Evaluating ...')
|
||||
l_eval = evaluate_method(classifier, lXte, lyte)
|
||||
|
||||
# renaming arguments to be printed on log
|
||||
_id = ''
|
||||
_id_conf = [op.posteriors, op.supervised, op.pretrained]
|
||||
_id_name = ['+P', '+W', '+M']
|
||||
for i, conf in enumerate(_id_conf):
|
||||
if conf:
|
||||
_id += _id_name[i]
|
||||
_id = _id.lstrip('+')
|
||||
_dataset_path = dataset.split('/')[-1].split('_')
|
||||
dataset_id = _dataset_path[0] + _dataset_path[-1]
|
||||
|
||||
metrics = []
|
||||
for lang in lXte.keys():
|
||||
macrof1, microf1, macrok, microk = l_eval[lang]
|
||||
metrics.append([macrof1, microf1, macrok, microk])
|
||||
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
|
||||
results.add_row(method='Voting',
|
||||
learner='svm',
|
||||
optimp=op.optimc,
|
||||
sif=op.sif,
|
||||
zscore='todo',
|
||||
l2='todo',
|
||||
wescaler='todo',
|
||||
pca=op.max_labels_S,
|
||||
id=_id,
|
||||
dataset=dataset_id,
|
||||
time='todo',
|
||||
lang=lang,
|
||||
macrof1=macrof1,
|
||||
microf1=microf1,
|
||||
macrok=macrok,
|
||||
microk=microk,
|
||||
notes='')
|
||||
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
|
|
@ -1,390 +0,0 @@
|
|||
from dataset_builder import MultilingualDataset
|
||||
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
import numpy as np
|
||||
import torch
|
||||
from util.common import predict
|
||||
from time import time
|
||||
from util.csv_log import CSVLog
|
||||
from util.evaluation import evaluate
|
||||
from util.early_stop import EarlyStopping
|
||||
from torch.optim.lr_scheduler import StepLR
|
||||
from sklearn.model_selection import train_test_split
|
||||
from copy import deepcopy
|
||||
import argparse
|
||||
# from torch.utils.tensorboard import SummaryWriter
|
||||
|
||||
|
||||
def check_sentences(sentences):
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
|
||||
for sentence in sentences:
|
||||
converted = [tokenizer._convert_id_to_token(token) for token in sentence.numpy() if token != 0]
|
||||
print(converted)
|
||||
return
|
||||
|
||||
|
||||
def get_model(n_out):
|
||||
print('# Initializing model ...')
|
||||
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=n_out)
|
||||
return model
|
||||
|
||||
|
||||
def set_method_name():
|
||||
return 'mBERT'
|
||||
|
||||
|
||||
def init_optimizer(model, lr):
    # return AdamW(model.parameters(), lr=lr, weight_decay=opt.weight_decay)
    # Standard BERT fine-tuning recipe: biases and LayerNorm weights are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': opt.weight_decay},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
    return optimizer
|
||||
|
||||
|
||||
def init_logfile(method_name, opt):
|
||||
logfile = CSVLog(opt.log_file, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse'])
|
||||
logfile.set_default('dataset', opt.dataset)
|
||||
logfile.set_default('run', opt.seed)
|
||||
logfile.set_default('method', method_name)
|
||||
assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} ' \
|
||||
f'and run {opt.seed} already calculated'
|
||||
return logfile
|
||||
|
||||
|
||||
def get_lr(optimizer):
|
||||
for param_group in optimizer.param_groups:
|
||||
return param_group['lr']
|
||||
|
||||
|
||||
def get_dataset_name(datapath):
|
||||
possible_splits = [str(i) for i in range(10)]
|
||||
splitted = datapath.split('_')
|
||||
id_split = splitted[-1].split('.')[0][-1]
|
||||
if id_split in possible_splits:
|
||||
dataset_name = splitted[0].split('/')[-1]
|
||||
return f'{dataset_name}_run{id_split}'
|
||||
elif splitted[-2].split('.')[0] == 'full':
|
||||
dataset_name = splitted[0].split('/')[-1]
|
||||
return f'{dataset_name}_fullrun'
|
||||
|
||||
|
||||
def load_datasets(datapath):
|
||||
data = MultilingualDataset.load(datapath)
|
||||
# data.set_view(languages=['it']) #, categories=[0, 1, 2, 3, 4]) # Testing with less langs
|
||||
data.show_dimensions()
|
||||
|
||||
l_devel_raw, l_devel_target = data.training(target_as_csr=False)
|
||||
l_test_raw, l_test_target = data.test(target_as_csr=False)
|
||||
|
||||
return l_devel_raw, l_devel_target, l_test_raw, l_test_target
|
||||
|
||||
|
||||
def do_tokenization(l_dataset, max_len=512, verbose=True):
|
||||
if verbose:
|
||||
print('# Starting Tokenization ...')
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
|
||||
langs = l_dataset.keys()
|
||||
l_tokenized = {}
|
||||
for lang in langs:
|
||||
l_tokenized[lang] = tokenizer(l_dataset[lang],
|
||||
truncation=True,
|
||||
max_length=max_len,
|
||||
padding='max_length')
|
||||
return l_tokenized
|
||||
|
||||
|
||||
class TrainingDataset(Dataset):
|
||||
"""
|
||||
data: dict of lang specific tokenized data
|
||||
labels: dict of lang specific targets
|
||||
"""
|
||||
|
||||
def __init__(self, data, labels):
|
||||
self.langs = data.keys()
|
||||
self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)}
|
||||
|
||||
for i, lang in enumerate(self.langs):
|
||||
_data = data[lang]['input_ids']
|
||||
_data = np.array(_data)
|
||||
_labels = labels[lang]
|
||||
_lang_value = np.full(len(_data), self.lang_ids[lang])
|
||||
|
||||
if i == 0:
|
||||
self.data = _data
|
||||
self.labels = _labels
|
||||
self.lang_index = _lang_value
|
||||
else:
|
||||
self.data = np.vstack((self.data, _data))
|
||||
self.labels = np.vstack((self.labels, _labels))
|
||||
self.lang_index = np.concatenate((self.lang_index, _lang_value))
|
||||
|
||||
def __len__(self):
|
||||
return len(self.data)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
x = self.data[idx]
|
||||
y = self.labels[idx]
|
||||
lang = self.lang_index[idx]
|
||||
|
||||
return x, torch.tensor(y, dtype=torch.float), lang
|
||||
|
||||
def get_lang_ids(self):
|
||||
return self.lang_ids
|
||||
|
||||
def get_nclasses(self):
|
||||
if hasattr(self, 'labels'):
|
||||
return len(self.labels[0])
|
||||
else:
|
||||
print('Method called before init!')
|
||||
|
||||
|
||||
def freeze_encoder(model):
|
||||
for param in model.base_model.parameters():
|
||||
param.requires_grad = False
|
||||
return model
|
||||
|
||||
|
||||
def check_param_grad_status(model):
|
||||
print('#' * 50)
|
||||
print('Model paramater status:')
|
||||
for name, child in model.named_children():
|
||||
trainable = False
|
||||
for param in child.parameters():
|
||||
if param.requires_grad:
|
||||
trainable = True
|
||||
if not trainable:
|
||||
print(f'{name} is frozen')
|
||||
else:
|
||||
print(f'{name} is not frozen')
|
||||
print('#' * 50)
|
||||
|
||||
|
||||
def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile, writer):
|
||||
_dataset_path = opt.dataset.split('/')[-1].split('_')
|
||||
dataset_id = _dataset_path[0] + _dataset_path[-1]
|
||||
|
||||
loss_history = []
|
||||
model.train()
|
||||
|
||||
for idx, (batch, target, lang_idx) in enumerate(train_dataloader):
|
||||
optim.zero_grad()
|
||||
out = model(batch.cuda())
|
||||
logits = out[0]
|
||||
loss = criterion(logits, target.cuda())
|
||||
loss.backward()
|
||||
# clip_gradient(model)
|
||||
optim.step()
|
||||
loss_history.append(loss.item())
|
||||
|
||||
if writer is not None:
|
||||
_n_step = (epoch - 1) * (len(train_dataloader)) + idx
|
||||
writer.add_scalar('Loss_step/Train', loss, _n_step)
|
||||
|
||||
# Check tokenized sentences consistency
|
||||
# check_sentences(batch.cpu())
|
||||
|
||||
if idx % opt.log_interval == 0:
|
||||
interval_loss = np.mean(loss_history[-opt.log_interval:])
|
||||
print(
|
||||
f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
|
||||
|
||||
mean_loss = np.mean(interval_loss)
|
||||
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
|
||||
return mean_loss
|
||||
|
||||
|
||||
def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, measure_prefix, writer):
|
||||
print('# Validating model ...')
|
||||
loss_history = []
|
||||
model.eval()
|
||||
langs = lang_ids.keys()
|
||||
id_2_lang = {v: k for k, v in lang_ids.items()}
|
||||
predictions = {l: [] for l in langs}
|
||||
yte_stacked = {l: [] for l in langs}
|
||||
|
||||
for batch, target, lang_idx in test_dataloader:
|
||||
out = model(batch.cuda())
|
||||
logits = out[0]
|
||||
loss = criterion(logits, target.cuda()).item()
|
||||
prediction = predict(logits)
|
||||
loss_history.append(loss)
|
||||
|
||||
# Assigning prediction to dict in predictions and yte_stacked according to lang_idx
|
||||
for i, pred in enumerate(prediction):
|
||||
lang_pred = id_2_lang[lang_idx.numpy()[i]]
|
||||
predictions[lang_pred].append(pred)
|
||||
yte_stacked[lang_pred].append(target[i].detach().cpu().numpy())
|
||||
|
||||
ly = {l: np.vstack(yte_stacked[l]) for l in langs}
|
||||
ly_ = {l: np.vstack(predictions[l]) for l in langs}
|
||||
l_eval = evaluate(ly, ly_)
|
||||
metrics = []
|
||||
for lang in langs:
|
||||
macrof1, microf1, macrok, microk = l_eval[lang]
|
||||
metrics.append([macrof1, microf1, macrok, microk])
|
||||
if measure_prefix == 'te':
|
||||
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
|
||||
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
|
||||
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
|
||||
if writer is not None:
|
||||
writer.add_scalars('Eval Metrics', {'Mf1': Mf1, 'mF1': mF1, 'MK': MK, 'mk':mk}, epoch)
|
||||
|
||||
mean_loss = np.mean(loss_history)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit)
|
||||
|
||||
return Mf1
|
||||
|
||||
|
||||
def get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop, max_val, seed):
|
||||
l_split_va = deepcopy(l_tokenized_tr)
|
||||
l_split_val_target = {l: [] for l in l_tokenized_tr.keys()}
|
||||
l_split_tr = deepcopy(l_tokenized_tr)
|
||||
l_split_tr_target = {l: [] for l in l_tokenized_tr.keys()}
|
||||
|
||||
for lang in l_tokenized_tr.keys():
|
||||
val_size = int(min(len(l_tokenized_tr[lang]['input_ids']) * val_prop, max_val))
|
||||
l_split_tr[lang]['input_ids'], l_split_va[lang]['input_ids'], l_split_tr_target[lang], l_split_val_target[
|
||||
lang] = \
|
||||
train_test_split(l_tokenized_tr[lang]['input_ids'], l_devel_target[lang], test_size=val_size,
|
||||
random_state=seed, shuffle=True)
|
||||
|
||||
return l_split_tr, l_split_tr_target, l_split_va, l_split_val_target
|
||||
|
||||
|
||||
def main():
|
||||
print('Running main ...')
|
||||
|
||||
DATAPATH = opt.dataset
|
||||
MAX_LEN = 512
|
||||
method_name = set_method_name()
|
||||
logfile = init_logfile(method_name, opt)
|
||||
|
||||
l_devel_raw, l_devel_target, l_test_raw, l_test_target = load_datasets(DATAPATH)
|
||||
l_tokenized_tr = do_tokenization(l_devel_raw, max_len=MAX_LEN)
|
||||
|
||||
l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = get_tr_val_split(l_tokenized_tr, l_devel_target,
|
||||
val_prop=0.2, max_val=2000,
|
||||
seed=opt.seed)
|
||||
|
||||
l_tokenized_te = do_tokenization(l_test_raw, max_len=MAX_LEN)
|
||||
|
||||
tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target)
|
||||
va_dataset = TrainingDataset(l_split_va, l_split_val_target)
|
||||
te_dataset = TrainingDataset(l_tokenized_te, l_test_target)
|
||||
|
||||
tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True)
|
||||
va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=True)
|
||||
te_dataloader = DataLoader(te_dataset, batch_size=2, shuffle=False)
|
||||
|
||||
|
||||
# Initializing model
|
||||
nC = tr_dataset.get_nclasses()
|
||||
model = get_model(nC)
|
||||
model = model.cuda()
|
||||
criterion = torch.nn.BCEWithLogitsLoss().cuda()
|
||||
optim = init_optimizer(model, lr=opt.lr)
|
||||
lr_scheduler = StepLR(optim, step_size=25, gamma=0.1)
|
||||
early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience,
|
||||
checkpoint=f'/home/andreapdr/funneling_pdr/hug_checkpoint/{method_name}-{get_dataset_name(opt.dataset)}',
|
||||
is_bert=True)
|
||||
|
||||
# Freezing encoder
|
||||
# model = freeze_encoder(model)
|
||||
check_param_grad_status(model)
|
||||
|
||||
# Tensorboard logger
|
||||
# writer = SummaryWriter('../log/tensorboard_logs/')
|
||||
|
||||
# Training loop
|
||||
tinit = time()
|
||||
lang_ids = va_dataset.lang_ids
|
||||
for epoch in range(1, opt.nepochs + 1):
|
||||
print('# Start Training ...')
|
||||
train(model, tr_dataloader, epoch, criterion, optim, method_name, tinit, logfile, writer=None)
|
||||
lr_scheduler.step() # reduces the learning rate
|
||||
|
||||
# Validation
|
||||
macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va', writer=None)
|
||||
early_stop(macrof1, epoch)
|
||||
if opt.test_each > 0:
|
||||
if (opt.plotmode and (epoch == 1 or epoch % opt.test_each == 0)) or (
|
||||
not opt.plotmode and epoch % opt.test_each == 0 and epoch < opt.nepochs):
|
||||
test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te', writer=None)
|
||||
|
||||
if early_stop.STOP:
|
||||
print('[early-stop] STOP')
|
||||
if not opt.plotmode:
|
||||
break
|
||||
|
||||
if not opt.plotmode:
|
||||
print('-' * 80)
|
||||
print('Training over. Performing final evaluation')
|
||||
|
||||
model = early_stop.restore_checkpoint()
|
||||
model = model.cuda()
|
||||
|
||||
if opt.val_epochs > 0:
|
||||
print(f'running last {opt.val_epochs} training epochs on the validation set')
|
||||
for val_epoch in range(1, opt.val_epochs + 1):
|
||||
train(model, va_dataloader, epoch + val_epoch, criterion, optim, method_name, tinit, logfile, writer=None)
|
||||
|
||||
# final test
|
||||
print('Training complete: testing')
|
||||
test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te', writer=None)
|
||||
|
||||
# writer.flush()
|
||||
# writer.close()
|
||||
exit('Code Executed!')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description='Neural text classification with Word-Class Embeddings - mBert model')
|
||||
|
||||
parser.add_argument('--dataset', type=str,
|
||||
default='/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle',
|
||||
metavar='datasetpath', help=f'path to the pickled dataset')
|
||||
parser.add_argument('--nepochs', type=int, default=200, metavar='int',
|
||||
help='number of epochs (default: 200)')
|
||||
parser.add_argument('--lr', type=float, default=2e-5, metavar='float',
|
||||
help='learning rate (default: 2e-5)')
|
||||
parser.add_argument('--weight_decay', type=float, default=0, metavar='float',
|
||||
help='weight decay (default: 0)')
|
||||
parser.add_argument('--patience', type=int, default=10, metavar='int',
|
||||
help='patience for early-stop (default: 10)')
|
||||
parser.add_argument('--log-interval', type=int, default=20, metavar='int',
|
||||
help='how many batches to wait before printing training status')
|
||||
parser.add_argument('--log-file', type=str, default='../log/log_mBert.csv', metavar='str',
|
||||
help='path to the log csv file')
|
||||
parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)')
|
||||
parser.add_argument('--force', action='store_true', default=False,
|
||||
help='do not check if this experiment has already been run')
|
||||
parser.add_argument('--checkpoint-dir', type=str, default='../checkpoint', metavar='str',
|
||||
help='path to the directory containing checkpoints')
|
||||
parser.add_argument('--plotmode', action='store_true', default=False,
|
||||
help='in plot mode executes a long run in order '
|
||||
'to generate enough data to produce trend plots (test-each should be >0. This mode is '
|
||||
'used to produce plots, and does not perform an evaluation on the test set.')
|
||||
parser.add_argument('--test-each', type=int, default=0, metavar='int',
|
||||
help='how many epochs to wait before invoking test (default: 0, only at the end)')
|
||||
parser.add_argument('--val-epochs', type=int, default=1, metavar='int',
|
||||
help='number of training epochs to perform on the validation set once training is over (default 1)')
|
||||
opt = parser.parse_args()
|
||||
|
||||
# Testing different parameters ...
|
||||
opt.weight_decay = 0.01
|
||||
opt.lr = 1e-5
|
||||
opt.patience = 5
|
||||
|
||||
main()
|
||||
# TODO: refactor .cuda() -> .to(device) in order to check if the process is faster on CPU given the bigger batch size
|
|
@ -1,110 +0,0 @@
|
|||
from experiment_scripts.main_mbert import *
|
||||
import pickle
|
||||
|
||||
|
||||
class ExtractorDataset(Dataset):
|
||||
"""
|
||||
data: dict of lang specific tokenized data
|
||||
labels: dict of lang specific targets
|
||||
"""
|
||||
|
||||
def __init__(self, data):
|
||||
self.langs = data.keys()
|
||||
self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)}
|
||||
|
||||
for i, lang in enumerate(self.langs):
|
||||
_data = data[lang]['input_ids']
|
||||
_data = np.array(_data)
|
||||
_lang_value = np.full(len(_data), self.lang_ids[lang])
|
||||
|
||||
if i == 0:
|
||||
self.data = _data
|
||||
self.lang_index = _lang_value
|
||||
else:
|
||||
self.data = np.vstack((self.data, _data))
|
||||
self.lang_index = np.concatenate((self.lang_index, _lang_value))
|
||||
|
||||
def __len__(self):
|
||||
return len(self.data)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
x = self.data[idx]
|
||||
lang = self.lang_index[idx]
|
||||
|
||||
return x, lang
|
||||
|
||||
def get_lang_ids(self):
|
||||
return self.lang_ids
|
||||
|
||||
|
||||
def feature_extractor(data, lang_ids, model_path='/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run0/'):
|
||||
print('# Feature Extractor Mode...')
|
||||
from transformers import BertConfig
|
||||
config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True, num_labels=300)
|
||||
model = BertForSequenceClassification.from_pretrained(model_path,
|
||||
config=config).cuda()
|
||||
|
||||
"""
|
||||
Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for
|
||||
the output of each layer) of shape (batch_size, sequence_length, hidden_size)
|
||||
"""
|
||||
all_batch_embeddings = {}
|
||||
id2lang = {v:k for k,v in lang_ids.items()}
|
||||
with torch.no_grad():
|
||||
for batch, target, lang_idx in data:
|
||||
out = model(batch.cuda())
|
||||
last_hidden_state = out[1][-1]
|
||||
batch_embeddings = last_hidden_state[:, 0, :]
|
||||
for i, l_idx in enumerate(lang_idx.numpy()):
|
||||
if id2lang[l_idx] not in all_batch_embeddings.keys():
|
||||
all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy()
|
||||
else:
|
||||
all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]],
|
||||
batch_embeddings[i].detach().cpu().numpy()))
|
||||
|
||||
return all_batch_embeddings, id2lang
|
||||
|
||||
|
||||
def main():
|
||||
print('Running main ...')
|
||||
print(f'Model path: {opt.modelpath}\nDataset path: {opt.dataset}')
|
||||
DATAPATH = opt.dataset
|
||||
MAX_LEN = 512
|
||||
|
||||
l_devel_raw, l_devel_target, l_test_raw, l_test_target = load_datasets(DATAPATH)
|
||||
l_tokenized_tr = do_tokenization(l_devel_raw, max_len=MAX_LEN)
|
||||
l_tokenized_te = do_tokenization(l_test_raw, max_len=MAX_LEN)
|
||||
|
||||
tr_dataset = TrainingDataset(l_tokenized_tr, l_devel_target)
|
||||
tr_lang_ids = tr_dataset.lang_ids
|
||||
|
||||
te_dataset = TrainingDataset(l_tokenized_te, l_test_target)
|
||||
te_lang_ids = te_dataset.lang_ids
|
||||
|
||||
tr_dataloader = DataLoader(tr_dataset, batch_size=64, shuffle=False) # Shuffle False to extract doc embeddings
|
||||
te_dataloader = DataLoader(te_dataset, batch_size=64, shuffle=False) # Shuffle False to extract doc
|
||||
|
||||
tr_all_batch_embeddings, id2lang_tr = feature_extractor(tr_dataloader, tr_lang_ids, opt.modelpath) # Extracting doc embed for devel
|
||||
with open(f'{opt.modelpath}/TR_embed_{get_dataset_name(opt.dataset)}.pkl', 'wb') as outfile:
|
||||
pickle.dump((tr_all_batch_embeddings, id2lang_tr), outfile)
|
||||
|
||||
te_all_batch_embeddings, id2lang_te = feature_extractor(te_dataloader, te_lang_ids, opt.modelpath) # Extracting doc embed for test
|
||||
with open(f'{opt.modelpath}/TE_embed_{get_dataset_name(opt.dataset)}.pkl', 'wb') as outfile:
|
||||
pickle.dump((te_all_batch_embeddings, id2lang_te), outfile)
|
||||
|
||||
exit('Extraction completed!')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description='mBert model document embedding extractor')
|
||||
|
||||
parser.add_argument('--dataset', type=str,
|
||||
default='/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle',
|
||||
metavar='datasetpath', help=f'path to the pickled dataset')
|
||||
parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)')
|
||||
parser.add_argument('--modelpath', type=str, default='/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run0',
|
||||
metavar='modelpath', help=f'path to pre-trained mBert model')
|
||||
opt = parser.parse_args()
|
||||
|
||||
main()
|
||||
|
|
@ -1,49 +0,0 @@
|
|||
import os
from dataset_builder import MultilingualDataset
from optparse import OptionParser
from util.file import exists
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

parser = OptionParser(usage="usage: %prog datapath [options]")

(op, args) = parser.parse_args()
assert len(args) == 1, 'required argument "datapath" missing (path to the pickled dataset)'
dataset = args[0]
assert exists(dataset), 'Unable to find file ' + str(dataset)

dataset_file = os.path.basename(dataset)

data = MultilingualDataset.load(dataset)
data.set_view(languages=['it'])
data.show_dimensions()
lXtr, lytr = data.training()
lXte, lyte = data.test()

vect_lXtr = dict()
vectorizer = CountVectorizer()
vect_lXtr['it'] = vectorizer.fit_transform(lXtr['it'])
# print(type(vect_lXtr['it']))

corr = vect_lXtr['it'].T.dot(lytr['it'])
# print(corr.shape)
sum_correlated_class = corr.sum(axis=0)
print(len(sum_correlated_class))
print(sum_correlated_class.max())

w2idx = vectorizer.vocabulary_
idx2w = {v: k for k, v in w2idx.items()}

word_tot_corr = corr.sum(axis=1)
print(word_tot_corr.shape)
dict_word_tot_corr = {v: k for k, v in enumerate(word_tot_corr)}

sorted_word_tot_corr = np.sort(word_tot_corr)
sorted_word_tot_corr = sorted_word_tot_corr[len(sorted_word_tot_corr) - 200:]

top_idx = [dict_word_tot_corr[k] for k in sorted_word_tot_corr]
print([idx2w[idx] for idx in top_idx])
print([elem for elem in top_idx])
print(corr[8709])
print('Finished...')
|
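For reference, the correlation matrix computed by the script above is just the product of the transposed document-term count matrix and the document-label matrix, so each entry counts how often a vocabulary term co-occurs with a class. A minimal NumPy sketch with made-up counts (toy shapes, not the actual RCV/JRC data):

import numpy as np

X = np.array([[2, 0, 1],    # 3 documents x 3 terms (raw counts)
              [0, 1, 0],
              [1, 1, 0]])
Y = np.array([[1, 0],       # 3 documents x 2 classes (binary labels)
              [0, 1],
              [1, 1]])

corr = X.T.dot(Y)           # 3 terms x 2 classes: term/class co-occurrence counts
print(corr)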
|
@ -1,34 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
|
||||
logfile=./results/final_combinations_jrc.csv
|
||||
#A.2: ensembling feature sets (combinations of posteriors, wce, muse):
|
||||
# - exploring different ways of putting different feature sets together: concatenation, FeatureSetToPosteriors, averaging, voting, etc...
|
||||
# (no one seems to improve over standard funnelling [the improved version after A.1] with posteriors probabilities...)
|
||||
|
||||
# aggregation=concatenation
|
||||
#python main_gFun.py $dataset -o $logfile -P -U -r -z --l2
|
||||
#python main_gFun.py $dataset -o $logfile -P -S -r -z --l2
|
||||
#python main_gFun.py $dataset -o $logfile -U -S -r -z --l2
|
||||
#python main_gFun.py $dataset -o $logfile -P -U -S -r -z --l2
|
||||
#
|
||||
|
||||
##FeatureSetToPosteriors (aggregation mean)
|
||||
python main_multimodal_cls.py $dataset -o $logfile -P -U -r -a -z --l2 --allprob
|
||||
python main_multimodal_cls.py $dataset -o $logfile -P -S -r -a -z --l2 --allprob
|
||||
python main_multimodal_cls.py $dataset -o $logfile -U -S -r -a -z --l2 --allprob
|
||||
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -a -z --l2 --allprob
|
||||
|
||||
##FeatureSetToPosteriors
|
||||
#python main_gFun.py $dataset -o $logfile -P -U -r -z --l2 --allprob
|
||||
#python main_gFun.py $dataset -o $logfile -P -S -r -z --l2 --allprob
|
||||
#python main_gFun.py $dataset -o $logfile -U -S -r -z --l2 --allprob
|
||||
#python main_gFun.py $dataset -o $logfile -P -U -S -r -z --l2 --allprob
|
||||
|
||||
#MajorityVoting
|
||||
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -r
|
||||
#python main_majorityvoting_cls.py $dataset -o $logfile -P -S -r
|
||||
#python main_majorityvoting_cls.py $dataset -o $logfile -U -S -r
|
||||
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -S -r
|
||||
|
||||
|
|
@ -1,31 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
|
||||
logfile=./results/final_combinations_rcv.csv
|
||||
#A.2: ensembling feature sets (combinations of posteriors, wce, muse):
|
||||
# - exploring different ways of putting different feature sets together: concatenation, FeatureSetToPosteriors, averaging, voting, etc...
|
||||
# (no one seems to improve over standard funnelling [the improved version after A.1] with posteriors probabilities...)
|
||||
|
||||
# aggregation=concatenation
|
||||
#python main_gFun.py $dataset -o $logfile -P -U -r -z --l2
|
||||
#python main_gFun.py $dataset -o $logfile -P -S -r -z --l2
|
||||
#python main_gFun.py $dataset -o $logfile -U -S -r -z --l2
|
||||
#python main_gFun.py $dataset -o $logfile -P -U -S -r -z --l2
|
||||
#
|
||||
##FeatureSetToPosteriors (aggregation mean)
|
||||
python main_multimodal_cls.py $dataset -o $logfile -P -U -r -a -z --l2 --allprob
|
||||
python main_multimodal_cls.py $dataset -o $logfile -P -S -r -a -z --l2 --allprob
|
||||
python main_multimodal_cls.py $dataset -o $logfile -U -S -r -a -z --l2 --allprob
|
||||
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -a -z --l2 --allprob
|
||||
|
||||
##FeatureSetToPosteriors
|
||||
#python main_gFun.py $dataset -o $logfile -P -U -r -z --l2 --allprob
|
||||
#python main_gFun.py $dataset -o $logfile -P -S -r -z --l2 --allprob
|
||||
#python main_gFun.py $dataset -o $logfile -U -S -r -z --l2 --allprob
|
||||
#python main_gFun.py $dataset -o $logfile -P -U -S -r -z --l2 --allprob
|
||||
|
||||
#MajorityVoting
|
||||
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -r
|
||||
#python main_majorityvoting_cls.py $dataset -o $logfile -P -S -r
|
||||
#python main_majorityvoting_cls.py $dataset -o $logfile -U -S -r
|
||||
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -S -r
|
|
@ -1,31 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
logfile=../log/log_pre_jrc.csv
|
||||
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 128 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 128 --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 256 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 256 --tunable --plotmode --test-each 20
|
||||
|
||||
python main_deep_learning.py $dataset --log-file $logfile --supervised --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --supervised --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 128 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 128 --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 256 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 256 --tunable --plotmode --test-each 20
|
||||
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 128 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 128 --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 256 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 256 --tunable --plotmode --test-each 20
|
||||
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 128 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 128 --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 256 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 256 --tunable --plotmode --test-each 20
|
|
@ -1,30 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
|
||||
python main_deep_learning.py $dataset --pretrained --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --hidden 128 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --hidden 128 --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --hidden 256 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --hidden 256 --tunable --plotmode --test-each 20
|
||||
|
||||
python main_deep_learning.py $dataset --supervised --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --supervised --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --supervised --hidden 128 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --supervised --hidden 128 --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --supervised --hidden 256 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --supervised --hidden 256 --tunable --plotmode --test-each 20
|
||||
|
||||
python main_deep_learning.py $dataset --pretrained --supervised --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --supervised --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --supervised --hidden 128 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --supervised --hidden 128 --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --supervised --hidden 256 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --supervised --hidden 256 --tunable --plotmode --test-each 20
|
||||
|
||||
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 128 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 128 --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 256 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 256 --tunable --plotmode --test-each 20
|
|
@ -1,16 +0,0 @@
|
|||
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle
|
||||
seeds='5' #2 3 4 5 6 7 8 9 10'
|
||||
for seed in $seeds
|
||||
do
|
||||
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce.csv --supervised --seed $seed
|
||||
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_trainable.csv --supervised --tunable --seed $seed
|
||||
python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_post_wce_muse_static.csv --posteriors --supervised --pretrained --seed $seed --force
|
||||
|
||||
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_muse.csv --pretrained --seed $seed
|
||||
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_muse_trainable.csv --pretrained --tunable --seed $seed
|
||||
|
||||
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_muse.csv --supervised --pretrained --seed $seed
|
||||
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_muse_trainable40000.csv --supervised --pretrained --tunable --seed $seed
|
||||
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_post_wce_muse_trainable.csv --posteriors --supervised --pretrained --tunable --seed $seed --force
|
||||
|
||||
done
|
|
@ -1,20 +0,0 @@
|
|||
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
|
||||
seeds='1 ' #2 3 4 5' # 6 7 8 9 10'
|
||||
for seed in $seeds
|
||||
do
|
||||
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce.csv --supervised --seed $seed
|
||||
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_trainable.csv --supervised --tunable --seed $seed
|
||||
python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_static_plotmode.csv --posteriors --supervised --pretrained --seed $seed --plotmode --test-each 200
|
||||
|
||||
|
||||
|
||||
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_muse.csv --pretrained --seed $seed
|
||||
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_muse_trainable.csv --pretrained --tunable --seed $seed
|
||||
|
||||
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_muse.csv --supervised --pretrained --seed $seed
|
||||
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_muse_trainable.csv --supervised --pretrained --tunable --seed $seed
|
||||
|
||||
# python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_static.csv --posteriors --supervised --pretrained --seed $seed
|
||||
# python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_trainable_plotmode.csv --posteriors --supervised --pretrained --tunable --seed $seed --plotmode --test-each 200
|
||||
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_trainable.csv --posteriors --supervised --pretrained --tunable --seed $seed
|
||||
done
|
|
@ -1,16 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
#dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
|
||||
#logfile=../log/log_FunBert_jrc.csv
|
||||
#
|
||||
#runs='0 1 2 3 4'
|
||||
#for run in $runs
|
||||
#do
|
||||
# dataset=$dataset_path$run.pickle
|
||||
# python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile #--tunable
|
||||
#done
|
||||
|
||||
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle
|
||||
logfile=../log/log_FunBert_fulljrc_static.csv
|
||||
|
||||
python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile
|
|
@ -1,16 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
#dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
|
||||
#logfile=../log/log_FunBert_rcv_static.csv
|
||||
#
|
||||
#runs='0 1 2 3 4'
|
||||
#for run in $runs
|
||||
#do
|
||||
# dataset=$dataset_path$run.pickle
|
||||
# python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile
|
||||
#done
|
||||
|
||||
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
|
||||
logfile=../log/log_FunBert_fullrcv_static.csv
|
||||
|
||||
python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile
|
|
@ -1,15 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
#dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
|
||||
#logfile=../log/log_mBert_jrc_NEW.csv
|
||||
#
|
||||
#runs='0 1 2 3 4'
|
||||
#for run in $runs
|
||||
#do
|
||||
# dataset=$dataset_path$run.pickle
|
||||
# python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50
|
||||
#done
|
||||
|
||||
logfile=../log/log_mBert_fulljrc.csv
|
||||
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle
|
||||
python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50
|
|
@ -1,15 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
#dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
|
||||
#logfile=../log/log_mBert_rcv_NEW.csv
|
||||
#
|
||||
#runs='0 1 2 3 4'
|
||||
#for run in $runs
|
||||
#do
|
||||
# dataset=$dataset_path$run.pickle
|
||||
# python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50
|
||||
#done
|
||||
|
||||
logfile=../log/log_mBert_fullrcv.csv
|
||||
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
|
||||
python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=30 --patience 3
|
|
@ -1,45 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
|
||||
|
||||
######################################## POSTERIORS
|
||||
# Posteriors
|
||||
python main_multimodal_cls.py $dataset -P # + zscore
|
||||
python main_multimodal_cls.py $dataset -P -z # +l2norm
|
||||
python main_multimodal_cls.py $dataset -P -z --l2 # +feature weight
|
||||
|
||||
|
||||
######################################### WCE
|
||||
#WCE supervised
|
||||
python main_multimodal_cls.py $dataset -S # + zscore
|
||||
python main_multimodal_cls.py $dataset -S -z # +l2norm
|
||||
python main_multimodal_cls.py $dataset -S -z --l2 # +feature weight
|
||||
python main_multimodal_cls.py $dataset -S -z -r --l2 # + SIF - PCA
|
||||
|
||||
python main_multimodal_cls.py $dataset -S -z -p 250 --l2 # +feature weight + pca
|
||||
python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 # + SIF
|
||||
|
||||
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight ig # -feature weight
|
||||
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight ig
|
||||
python main_multimodal_cls.py $dataset -S -z -p 250 --l2 --feat-weight ig # + pca
|
||||
python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 --feat-weight ig
|
||||
|
||||
|
||||
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight pmi
|
||||
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight pmi
|
||||
python main_multimodal_cls.py $dataset -S -z -p 250 --l2 --feat-weight pmi
|
||||
python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 --feat-weight pmi
|
||||
|
||||
################################# MUSE
|
||||
|
||||
# MUSE unsupervised
|
||||
python main_multimodal_cls.py $dataset -U # + zscore
|
||||
python main_multimodal_cls.py $dataset -U -z # +l2norm
|
||||
python main_multimodal_cls.py $dataset -U -z --l2 # +feature weight
|
||||
python main_multimodal_cls.py $dataset -U -z -r --l2 # + SIF - PCA
|
||||
|
||||
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight ig # -feature weight + pca
|
||||
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight ig
|
||||
|
||||
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight pmi
|
||||
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight pmi
|
|
@ -1,45 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
|
||||
|
||||
######################################## POSTERIORS
|
||||
# Posteriors
|
||||
python main_multimodal_cls.py $dataset -P # + zscore
|
||||
python main_multimodal_cls.py $dataset -P -z # +l2norm
|
||||
python main_multimodal_cls.py $dataset -P -z --l2 # +feature weight
|
||||
|
||||
|
||||
######################################### WCE
|
||||
#WCE supervised
|
||||
python main_multimodal_cls.py $dataset -S # + zscore
|
||||
python main_multimodal_cls.py $dataset -S -z # +l2norm
|
||||
python main_multimodal_cls.py $dataset -S -z --l2 # +feature weight
|
||||
python main_multimodal_cls.py $dataset -S -z -r --l2 # + SIF - PCA
|
||||
|
||||
python main_multimodal_cls.py $dataset -S -z -p 50 --l2 # +feature weight + pca
|
||||
python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 # + SIF
|
||||
|
||||
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight ig # -feature weight
|
||||
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight ig
|
||||
python main_multimodal_cls.py $dataset -S -z -p 50 --l2 --feat-weight ig # + pca
|
||||
python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 --feat-weight ig
|
||||
|
||||
|
||||
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight pmi
|
||||
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight pmi
|
||||
python main_multimodal_cls.py $dataset -S -z -p 50 --l2 --feat-weight pmi
|
||||
python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 --feat-weight pmi
|
||||
|
||||
################################# MUSE
|
||||
|
||||
# MUSE unsupervised
|
||||
python main_multimodal_cls.py $dataset -U # + zscore
|
||||
python main_multimodal_cls.py $dataset -U -z # +l2norm
|
||||
python main_multimodal_cls.py $dataset -U -z --l2 # +feature weight
|
||||
python main_multimodal_cls.py $dataset -U -z -r --l2 # + SIF - PCA
|
||||
|
||||
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight ig # -feature weight + pca
|
||||
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight ig
|
||||
|
||||
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight pmi
|
||||
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight pmi
|
|
@ -1,6 +0,0 @@
|
|||
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
seeds='1 2 3 4 5 6 7 8 9 10'
for seed in $seeds
do
    python main_deep_learning.py $dataset --log-file ../log/time_GRU.csv --supervised --nepochs 50 --seed $seed
done
|
@ -0,0 +1,124 @@
|
|||
from src.models.learners import *
from src.util.common import _normalize
from src.view_generators import VanillaFunGen


class DocEmbedderList:
    """
    Takes care of calling fit and transform on every initialized embedder. Every ViewGenerator should be
    wrapped by this class so that the overall architecture can be trained seamlessly.
    """
    def __init__(self, embedder_list, probabilistic=True):
        """
        Init the DocEmbedderList.
        :param embedder_list: list of embedders (view generators) to be deployed
        :param probabilistic: whether or not to recast the view generators' outputs into vectors of posterior probabilities
        """
        assert len(embedder_list) != 0, 'Embedder list cannot be empty!'
        self.embedders = embedder_list
        self.probabilistic = probabilistic
        if probabilistic:
            _tmp = []
            for embedder in self.embedders:
                if isinstance(embedder, VanillaFunGen):
                    _tmp.append(embedder)
                else:
                    _tmp.append(FeatureSet2Posteriors(embedder))
            self.embedders = _tmp

    def fit(self, lX, ly):
        """
        Fit all the ViewGenerators contained by DocEmbedderList.
        :param lX: dict {lang: documents}
        :param ly: dict {lang: targets}
        :return: self
        """
        for embedder in self.embedders:
            embedder.fit(lX, ly)
        return self

    def transform(self, lX):
        """
        Project documents by means of every ViewGenerator. The projections are then averaged together and returned.
        :param lX: dict {lang: documents}
        :return: common latent space (averaged).
        """
        langs = sorted(lX.keys())
        lZparts = {lang: None for lang in langs}

        for embedder in self.embedders:
            lZ = embedder.transform(lX)
            for lang in langs:
                Z = lZ[lang]
                if lZparts[lang] is None:
                    lZparts[lang] = Z
                else:
                    lZparts[lang] += Z
        n_embedders = len(self.embedders)
        return {lang: lZparts[lang]/n_embedders for lang in langs}  # averaging feature spaces

    def fit_transform(self, lX, ly):
        return self.fit(lX, ly).transform(lX)
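The transform method above aggregates the view-specific projections by an element-wise mean over the per-language matrices. A minimal NumPy sketch of the same aggregation, using two hypothetical views and random matrices in place of real view-generator outputs:

import numpy as np

# Two hypothetical view generators have already projected the same 3 English and 2 Italian
# documents into a shared 4-dimensional space of posterior probabilities.
lZ_view1 = {'en': np.random.rand(3, 4), 'it': np.random.rand(2, 4)}
lZ_view2 = {'en': np.random.rand(3, 4), 'it': np.random.rand(2, 4)}

views = [lZ_view1, lZ_view2]
langs = sorted(lZ_view1.keys())

# accumulate the per-language projections, then divide by the number of views
lZparts = {lang: np.zeros_like(lZ_view1[lang]) for lang in langs}
for lZ in views:
    for lang in langs:
        lZparts[lang] += lZ[lang]
lZ_avg = {lang: lZparts[lang] / len(views) for lang in langs}

print(lZ_avg['en'].shape)  # (3, 4): one averaged vector per English document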
class FeatureSet2Posteriors:
|
||||
"""
|
||||
Takes care of recasting features outputted by the embedders to vecotrs of posterior probabilities by means of
|
||||
a multiclass SVM.
|
||||
"""
|
||||
def __init__(self, embedder, l2=True, n_jobs=-1):
|
||||
"""
|
||||
Init the class.
|
||||
        :param embedder: ViewGen, a view generator that does not natively output posterior probabilities
        :param l2: bool, whether to apply L2 normalization to the projection
|
||||
:param n_jobs: int, number of concurrent workers.
|
||||
"""
|
||||
self.embedder = embedder
|
||||
self.l2 = l2
|
||||
self.n_jobs = n_jobs
|
||||
self.prob_classifier = MetaClassifier(
|
||||
SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
|
||||
|
||||
def fit(self, lX, ly):
|
||||
lZ = self.embedder.fit_transform(lX, ly)
|
||||
self.prob_classifier.fit(lZ, ly)
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
lP = self.predict_proba(lX)
|
||||
lP = _normalize(lP, self.l2)
|
||||
return lP
|
||||
|
||||
def fit_transform(self, lX, ly):
|
||||
return self.fit(lX, ly).transform(lX)
|
||||
|
||||
def predict(self, lX):
|
||||
lZ = self.embedder.transform(lX)
|
||||
return self.prob_classifier.predict(lZ)
|
||||
|
||||
def predict_proba(self, lX):
|
||||
lZ = self.embedder.transform(lX)
|
||||
return self.prob_classifier.predict_proba(lZ)
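
# --- Illustrative sketch (not part of this module): wrapping a hypothetical, non-probabilistic
# view generator with FeatureSet2Posteriors so that its projections are recast into posterior
# probabilities; ToyViewGen and its 2-dimensional random projection are made up for illustration ---
import numpy as np

class ToyViewGen:
    def fit(self, lX, ly):
        return self

    def transform(self, lX):
        # project every document of every language onto a (random) 2-dimensional space
        return {lang: np.random.rand(len(X), 2) for lang, X in lX.items()}

    def fit_transform(self, lX, ly):
        return self.fit(lX, ly).transform(lX)

# wrapped = FeatureSet2Posteriors(ToyViewGen(), l2=True)
# wrapped.fit(lX, ly).transform(lX)  # -> {lang: l2-normalized posterior-probability vectors}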
|
||||
|
||||
|
||||
class Funnelling:
|
||||
"""
|
||||
    Funnelling Architecture. It is composed of two tiers. The first tier is a set of heterogeneous document embedders.
    The second tier (i.e., the meta-classifier) performs the classification on the common latent space computed by
    the first-tier learners.
|
||||
"""
|
||||
def __init__(self, first_tier: DocEmbedderList, meta_classifier: MetaClassifier, n_jobs=-1):
|
||||
self.first_tier = first_tier
|
||||
self.meta = meta_classifier
|
||||
self.n_jobs = n_jobs
|
||||
|
||||
def fit(self, lX, ly):
|
||||
print('## Fitting first-tier learners!')
|
||||
lZ = self.first_tier.fit_transform(lX, ly)
|
||||
print('## Fitting meta-learner!')
|
||||
self.meta.fit(lZ, ly)
|
||||
|
||||
def predict(self, lX):
|
||||
lZ = self.first_tier.transform(lX)
|
||||
ly = self.meta.predict(lZ)
|
||||
return ly
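
# --- Illustrative sketch (not part of this module): the language-indexed data layout that
# Funnelling.fit/predict expect; the documents and labels below are toy values, real data
# is provided by MultilingualDataset ---
lX_toy = {'en': ['a short document', 'another document'],
          'it': ['un breve documento']}
ly_toy = {'en': [[1, 0], [0, 1]],       # one binary label vector per document
          'it': [[1, 0]]}
# funnelling.fit(lX_toy, ly_toy) would fit every view generator on the raw documents and then
# train the meta-classifier on their averaged (language-independent) projections.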
|
|
@ -1,849 +0,0 @@
|
|||
from torch.optim.lr_scheduler import StepLR
|
||||
from torch.utils.data import DataLoader
|
||||
from data.tsr_function__ import get_tsr_matrix, get_supervised_matrix, pointwise_mutual_information, information_gain
|
||||
from embeddings.embeddings import FastTextMUSE
|
||||
from embeddings.supervised import supervised_embeddings_tfidf, zscores
|
||||
from learning.learners import NaivePolylingualClassifier, MonolingualClassifier, _joblib_transform_multiling
|
||||
from sklearn.decomposition import PCA
|
||||
from scipy.sparse import hstack
|
||||
from util_transformers.StandardizeTransformer import StandardizeTransformer
|
||||
from util.SIF_embed import remove_pc
|
||||
from sklearn.preprocessing import normalize
|
||||
from scipy.sparse import csr_matrix
|
||||
from models.mBert import *
|
||||
from models.lstm_class import *
|
||||
from util.csv_log import CSVLog
|
||||
from util.file import get_file_name
|
||||
from util.early_stop import EarlyStopping
|
||||
from util.common import *
|
||||
import time
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Data Processing
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
class FeatureWeight:
|
||||
|
||||
def __init__(self, weight='tfidf', agg='mean'):
|
||||
        assert weight in ['tfidf', 'pmi', 'ig'] or callable(weight), \
            'weight should be one of "tfidf", "pmi", "ig", or a callable function'
|
||||
assert agg in ['mean', 'max'], 'aggregation function should either be "mean" or "max"'
|
||||
self.weight = weight
|
||||
self.agg = agg
|
||||
self.fitted = False
|
||||
if weight == 'pmi':
|
||||
self.weight = pointwise_mutual_information
|
||||
elif weight == 'ig':
|
||||
self.weight = information_gain
|
||||
|
||||
def fit(self, lX, ly):
|
||||
if not self.fitted:
|
||||
if self.weight == 'tfidf':
|
||||
self.lF = {l: np.ones(X.shape[1]) for l, X in lX.items()}
|
||||
else:
|
||||
self.lF = {}
|
||||
for l in lX.keys():
|
||||
X, y = lX[l], ly[l]
|
||||
|
||||
print(f'getting supervised cell-matrix lang {l}')
|
||||
tsr_matrix = get_tsr_matrix(get_supervised_matrix(X, y), tsr_score_funtion=self.weight)
|
||||
if self.agg == 'max':
|
||||
F = tsr_matrix.max(axis=0)
|
||||
elif self.agg == 'mean':
|
||||
F = tsr_matrix.mean(axis=0)
|
||||
self.lF[l] = F
|
||||
|
||||
self.fitted = True
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
return {lang: csr_matrix.multiply(lX[lang], self.lF[lang]) for lang in lX.keys()}
|
||||
|
||||
def fit_transform(self, lX, ly):
|
||||
return self.fit(lX, ly).transform(lX)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# View Generators (aka first-tier learners)
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
class PosteriorProbabilitiesEmbedder:
|
||||
|
||||
def __init__(self, first_tier_learner, first_tier_parameters=None, l2=True, n_jobs=-1):
|
||||
self.fist_tier_learner = first_tier_learner
|
||||
self.fist_tier_parameters = first_tier_parameters
|
||||
self.l2 = l2
|
||||
self.n_jobs = n_jobs
|
||||
self.doc_projector = NaivePolylingualClassifier(
|
||||
self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs
|
||||
)
|
||||
self.requires_tfidf = True
|
||||
|
||||
def fit(self, lX, lY, lV=None, called_by_viewgen=False):
|
||||
if not called_by_viewgen:
|
||||
# Avoid printing if method is called by another View Gen (e.g., GRU ViewGen)
|
||||
print('### Posterior Probabilities View Generator (X)')
|
||||
print('fitting the projectors... {}'.format(lX.keys()))
|
||||
self.doc_projector.fit(lX, lY)
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
lZ = self.predict_proba(lX)
|
||||
lZ = _normalize(lZ, self.l2)
|
||||
return lZ
|
||||
|
||||
def fit_transform(self, lX, ly=None, lV=None):
|
||||
return self.fit(lX, ly).transform(lX)
|
||||
|
||||
def best_params(self):
|
||||
return self.doc_projector.best_params()
|
||||
|
||||
def predict(self, lX, ly=None):
|
||||
return self.doc_projector.predict(lX)
|
||||
|
||||
def predict_proba(self, lX, ly=None):
|
||||
print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} documents')
|
||||
return self.doc_projector.predict_proba(lX)
|
||||
|
||||
def _get_output_dim(self):
|
||||
return len(self.doc_projector.model['da'].model.classes_)
|
||||
|
||||
|
||||
class MuseEmbedder:
|
||||
|
||||
def __init__(self, path, lV=None, l2=True, n_jobs=-1, featureweight=FeatureWeight(), sif=False):
|
||||
self.path = path
|
||||
self.lV = lV
|
||||
self.l2 = l2
|
||||
self.n_jobs = n_jobs
|
||||
self.featureweight = featureweight
|
||||
self.sif = sif
|
||||
self.requires_tfidf = True
|
||||
|
||||
def fit(self, lX, ly, lV=None):
|
||||
assert lV is not None or self.lV is not None, 'lV not specified'
|
||||
print('### MUSE View Generator (M)')
|
||||
print(f'Loading fastText pretrained vectors for languages {list(lX.keys())}...')
|
||||
self.langs = sorted(lX.keys())
|
||||
self.MUSE = load_muse_embeddings(self.path, self.langs, self.n_jobs)
|
||||
lWordList = {l: self._get_wordlist_from_word2index(lV[l]) for l in self.langs}
|
||||
self.MUSE = {l: Muse.extract(lWordList[l]).numpy() for l, Muse in self.MUSE.items()}
|
||||
self.featureweight.fit(lX, ly)
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
MUSE = self.MUSE
|
||||
lX = self.featureweight.transform(lX)
|
||||
XdotMUSE = Parallel(n_jobs=self.n_jobs)(
|
||||
delayed(XdotM)(lX[lang], MUSE[lang], self.sif) for lang in self.langs
|
||||
)
|
||||
lMuse = {l: XdotMUSE[i] for i, l in enumerate(self.langs)}
|
||||
lMuse = _normalize(lMuse, self.l2)
|
||||
return lMuse
|
||||
|
||||
def fit_transform(self, lX, ly, lV):
|
||||
return self.fit(lX, ly, lV).transform(lX)
|
||||
|
||||
def _get_wordlist_from_word2index(self, word2index):
|
||||
return list(zip(*sorted(word2index.items(), key=lambda x: x[1])))[0]
|
||||
|
||||
def _get_output_dim(self):
|
||||
return self.MUSE['da'].shape[1]
|
||||
|
||||
|
||||
class WordClassEmbedder:
|
||||
|
||||
def __init__(self, l2=True, n_jobs=-1, max_label_space=300, featureweight=FeatureWeight(), sif=False):
|
||||
self.n_jobs = n_jobs
|
||||
self.l2 = l2
|
||||
self.max_label_space = max_label_space
|
||||
self.featureweight = featureweight
|
||||
self.sif = sif
|
||||
self.requires_tfidf = True
|
||||
|
||||
def fit(self, lX, ly, lV=None):
|
||||
print('### WCE View Generator (M)')
|
||||
print('Computing supervised embeddings...')
|
||||
self.langs = sorted(lX.keys())
|
||||
WCE = Parallel(n_jobs=self.n_jobs)(
|
||||
delayed(word_class_embedding_matrix)(lX[lang], ly[lang], self.max_label_space) for lang in self.langs
|
||||
)
|
||||
self.lWCE = {l: WCE[i] for i, l in enumerate(self.langs)}
|
||||
self.featureweight.fit(lX, ly)
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
lWCE = self.lWCE
|
||||
lX = self.featureweight.transform(lX)
|
||||
XdotWCE = Parallel(n_jobs=self.n_jobs)(
|
||||
delayed(XdotM)(lX[lang], lWCE[lang], self.sif) for lang in self.langs
|
||||
)
|
||||
lwce = {l: XdotWCE[i] for i, l in enumerate(self.langs)}
|
||||
lwce = _normalize(lwce, self.l2)
|
||||
return lwce
|
||||
|
||||
def fit_transform(self, lX, ly, lV=None):
|
||||
return self.fit(lX, ly).transform(lX)
|
||||
|
||||
def _get_output_dim(self):
|
||||
return 73 # TODO !
|
||||
|
||||
|
||||
class MBertEmbedder:
|
||||
|
||||
def __init__(self, doc_embed_path=None, patience=10, checkpoint_dir='../hug_checkpoint/', path_to_model=None,
|
||||
nC=None):
|
||||
self.doc_embed_path = doc_embed_path
|
||||
self.patience = patience
|
||||
self.checkpoint_dir = checkpoint_dir
|
||||
self.fitted = False
|
||||
self.requires_tfidf = False
|
||||
if path_to_model is None and nC is not None:
|
||||
self.model = None
|
||||
else:
|
||||
config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True,
|
||||
num_labels=nC)
|
||||
self.model = BertForSequenceClassification.from_pretrained(path_to_model, config=config).cuda()
|
||||
self.fitted = True
|
||||
|
||||
def fit(self, lX, ly, lV=None, seed=0, nepochs=200, lr=1e-5, val_epochs=1):
|
||||
print('### mBERT View Generator (B)')
|
||||
if self.fitted is True:
|
||||
print('Bert model already fitted!')
|
||||
return self
|
||||
|
||||
print('Fine-tune mBert on the given dataset.')
|
||||
l_tokenized_tr = do_tokenization(lX, max_len=512)
|
||||
l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = get_tr_val_split(l_tokenized_tr, ly,
|
||||
val_prop=0.2, max_val=2000,
|
||||
seed=seed) # TODO: seed
|
||||
|
||||
tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target)
|
||||
va_dataset = TrainingDataset(l_split_va, l_split_val_target)
|
||||
tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True)
|
||||
va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=True)
|
||||
|
||||
nC = tr_dataset.get_nclasses()
|
||||
model = get_model(nC)
|
||||
model = model.cuda()
|
||||
criterion = torch.nn.BCEWithLogitsLoss().cuda()
|
||||
optim = init_optimizer(model, lr=lr, weight_decay=0.01)
|
||||
lr_scheduler = StepLR(optim, step_size=25, gamma=0.1)
|
||||
early_stop = EarlyStopping(model, optimizer=optim, patience=self.patience,
|
||||
checkpoint=self.checkpoint_dir,
|
||||
is_bert=True)
|
||||
|
||||
# Training loop
|
||||
logfile = '../log/log_mBert_extractor.csv'
|
||||
method_name = 'mBert_feature_extractor'
|
||||
|
||||
tinit = time()
|
||||
lang_ids = va_dataset.lang_ids
|
||||
for epoch in range(1, nepochs + 1):
|
||||
print('# Start Training ...')
|
||||
train(model, tr_dataloader, epoch, criterion, optim, method_name, tinit, logfile)
|
||||
lr_scheduler.step() # reduces the learning rate # TODO arg epoch?
|
||||
|
||||
# Validation
|
||||
macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va')
|
||||
early_stop(macrof1, epoch)
|
||||
|
||||
if early_stop.STOP:
|
||||
print('[early-stop] STOP')
|
||||
break
|
||||
|
||||
model = early_stop.restore_checkpoint()
|
||||
self.model = model.cuda()
|
||||
|
||||
if val_epochs > 0:
|
||||
print(f'running last {val_epochs} training epochs on the validation set')
|
||||
for val_epoch in range(1, val_epochs + 1):
|
||||
train(self.model, va_dataloader, epoch + val_epoch, criterion, optim, method_name, tinit, logfile)
|
||||
|
||||
self.fitted = True
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
        assert self.fitted is True, 'Calling transform without an initialized model! Either call fit first or ' \
                                    'pass the "path_to_model" arg at init.'
|
||||
print('Obtaining document embeddings from pretrained mBert ')
|
||||
l_tokenized_X = do_tokenization(lX, max_len=512, verbose=True)
|
||||
feat_dataset = ExtractorDataset(l_tokenized_X)
|
||||
feat_lang_ids = feat_dataset.lang_ids
|
||||
dataloader = DataLoader(feat_dataset, batch_size=64)
|
||||
all_batch_embeddings, id2lang = feature_extractor(dataloader, feat_lang_ids, self.model)
|
||||
return all_batch_embeddings
|
||||
|
||||
def fit_transform(self, lX, ly, lV=None):
|
||||
return self.fit(lX, ly).transform(lX)
|
||||
|
||||
|
||||
class RecurrentEmbedder:
|
||||
|
||||
def __init__(self, pretrained, supervised, multilingual_dataset, options, concat=False, lr=1e-3,
|
||||
we_path='../embeddings', hidden_size=512, sup_drop=0.5, posteriors=False, patience=10,
|
||||
test_each=0, checkpoint_dir='../checkpoint', model_path=None):
|
||||
self.pretrained = pretrained
|
||||
self.supervised = supervised
|
||||
self.concat = concat
|
||||
self.requires_tfidf = False
|
||||
self.multilingual_dataset = multilingual_dataset
|
||||
self.model = None
|
||||
self.we_path = we_path
|
||||
self.langs = multilingual_dataset.langs()
|
||||
self.hidden_size = hidden_size
|
||||
self.sup_drop = sup_drop
|
||||
self.posteriors = posteriors
|
||||
self.patience = patience
|
||||
self.checkpoint_dir = checkpoint_dir
|
||||
self.test_each = test_each
|
||||
self.options = options
|
||||
self.seed = options.seed
|
||||
self.is_trained = False
|
||||
|
||||
## INIT MODEL for training
|
||||
self.lXtr, self.lytr = self.multilingual_dataset.training(target_as_csr=True)
|
||||
self.lXte, self.lyte = self.multilingual_dataset.test(target_as_csr=True)
|
||||
self.nC = self.lyte[self.langs[0]].shape[1]
|
||||
lpretrained, lpretrained_vocabulary = self._load_pretrained_embeddings(self.we_path, self.langs)
|
||||
self.multilingual_index = MultilingualIndex()
|
||||
self.multilingual_index.index(self.lXtr, self.lytr, self.lXte, lpretrained_vocabulary)
|
||||
self.multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=self.seed)
|
||||
self.multilingual_index.embedding_matrices(lpretrained, self.supervised)
|
||||
|
||||
if model_path is not None:
|
||||
self.is_trained = True
|
||||
self.model = torch.load(model_path)
|
||||
else:
|
||||
self.model = self._init_Net()
|
||||
|
||||
self.optim = init_optimizer(self.model, lr=lr)
|
||||
self.criterion = torch.nn.BCEWithLogitsLoss().cuda()
|
||||
self.lr_scheduler = StepLR(self.optim, step_size=25, gamma=0.5)
|
||||
self.early_stop = EarlyStopping(self.model, optimizer=self.optim, patience=self.patience,
|
||||
checkpoint=f'{self.checkpoint_dir}/gru_viewgen_-{get_file_name(self.options.dataset)}')
|
||||
# Init SVM in order to recast (vstacked) document embeddings to vectors of Posterior Probabilities
|
||||
self.posteriorEmbedder = MetaClassifier(
|
||||
SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=options.n_jobs)
|
||||
|
||||
def fit(self, lX, ly, lV=None, batch_size=64, nepochs=200, val_epochs=1):
|
||||
print('### Gated Recurrent Unit View Generator (G)')
|
||||
# could be better to init model here at first .fit() call!
|
||||
if self.model is None:
|
||||
print('TODO: Init model!')
|
||||
if not self.is_trained:
|
||||
# Batchify input
|
||||
self.multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=self.seed)
|
||||
l_train_index, l_train_target = self.multilingual_index.l_train()
|
||||
l_val_index, l_val_target = self.multilingual_index.l_val()
|
||||
l_test_index = self.multilingual_index.l_test_index()
|
||||
batcher_train = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs,
|
||||
lpad=self.multilingual_index.l_pad())
|
||||
batcher_eval = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs,
|
||||
lpad=self.multilingual_index.l_pad())
|
||||
|
||||
# Train loop
|
||||
print('Start training')
|
||||
method_name = 'gru_view_generator'
|
||||
logfile = init_logfile_nn(method_name, self.options)
|
||||
tinit = time.time()
|
||||
for epoch in range(1, nepochs + 1):
|
||||
train_gru(model=self.model, batcher=batcher_train, ltrain_index=l_train_index, lytr=l_train_target,
|
||||
tinit=tinit, logfile=logfile, criterion=self.criterion, optim=self.optim,
|
||||
epoch=epoch, method_name=method_name, opt=self.options, ltrain_posteriors=None,
|
||||
ltrain_bert=None)
|
||||
self.lr_scheduler.step() # reduces the learning rate # TODO arg epoch?
|
||||
|
||||
# validation step
|
||||
macrof1 = test_gru(self.model, batcher_eval, l_val_index, None, None, l_val_target, tinit, epoch,
|
||||
logfile, self.criterion, 'va')
|
||||
|
||||
self.early_stop(macrof1, epoch)
|
||||
if self.test_each > 0:
|
||||
test_gru(self.model, batcher_eval, l_test_index, None, None, self.lyte, tinit, epoch,
|
||||
logfile, self.criterion, 'te')
|
||||
|
||||
if self.early_stop.STOP:
|
||||
print('[early-stop] STOP')
|
||||
print('Restoring best model...')
|
||||
break
|
||||
|
||||
self.model = self.early_stop.restore_checkpoint()
|
||||
print(f'running last {val_epochs} training epochs on the validation set')
|
||||
for val_epoch in range(1, val_epochs+1):
|
||||
batcher_train.init_offset()
|
||||
train_gru(model=self.model, batcher=batcher_train, ltrain_index=l_train_index, lytr=l_train_target,
|
||||
tinit=tinit, logfile=logfile, criterion=self.criterion, optim=self.optim,
|
||||
epoch=epoch, method_name=method_name, opt=self.options, ltrain_posteriors=None,
|
||||
ltrain_bert=None)
|
||||
self.is_trained = True
|
||||
|
||||
        # Generate document embeddings in order to fit an SVM that recasts them as vectors of posterior probabilities
|
||||
lX = self._get_doc_embeddings(lX)
|
||||
# Fit a ''multi-lingual'' SVM on the generated doc embeddings
|
||||
self.posteriorEmbedder.fit(lX, ly)
|
||||
return self
|
||||
|
||||
def transform(self, lX, batch_size=64):
|
||||
lX = self._get_doc_embeddings(lX)
|
||||
return self.posteriorEmbedder.predict_proba(lX)
|
||||
|
||||
def fit_transform(self, lX, ly, lV=None):
|
||||
# TODO
|
||||
return 0
|
||||
|
||||
def _get_doc_embeddings(self, lX, batch_size=64):
|
||||
assert self.is_trained, 'Model is not trained, cannot call transform before fitting the model!'
|
||||
print('Generating document embeddings via GRU')
|
||||
data = {}
|
||||
for lang in lX.keys():
|
||||
indexed = index(data=lX[lang],
|
||||
vocab=self.multilingual_index.l_index[lang].word2index,
|
||||
known_words=set(self.multilingual_index.l_index[lang].word2index.keys()),
|
||||
analyzer=self.multilingual_index.l_vectorizer.get_analyzer(lang),
|
||||
unk_index=self.multilingual_index.l_index[lang].unk_index,
|
||||
out_of_vocabulary=self.multilingual_index.l_index[lang].out_of_vocabulary)
|
||||
data[lang] = indexed
|
||||
|
||||
lX = {}
|
||||
ly = {}
|
||||
batcher_transform = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs,
|
||||
lpad=self.multilingual_index.l_pad())
|
||||
|
||||
# l_devel_index = self.multilingual_index.l_devel_index()
|
||||
|
||||
l_devel_target = self.multilingual_index.l_devel_target()
|
||||
l_devel_target = {k: v[:len(data[k])] for k, v in l_devel_target.items()} # todo -> debug
|
||||
for batch, _, target, lang, in batchify(l_index=data,
|
||||
l_post=None,
|
||||
llabels=l_devel_target,
|
||||
batchsize=batch_size,
|
||||
lpad=self.multilingual_index.l_pad()):
|
||||
# for idx, (batch, post, bert_emb, target, lang) in enumerate(
|
||||
# batcher_transform.batchify(l_devel_index, None, None, l_devel_target)):
|
||||
# for idx, (batch, post, bert_emb, target, lang) in enumerate(
|
||||
# batcher_transform.batchify(data, None, None, l_devel_target)):
|
||||
if lang not in lX.keys():
|
||||
lX[lang] = self.model.get_embeddings(batch, lang)
|
||||
ly[lang] = target.cpu().detach().numpy()
|
||||
else:
|
||||
lX[lang] = np.concatenate((lX[lang], self.model.get_embeddings(batch, lang)), axis=0)
|
||||
ly[lang] = np.concatenate((ly[lang], target.cpu().detach().numpy()), axis=0)
|
||||
|
||||
return lX
|
||||
|
||||
# loads the MUSE embeddings if requested, or returns empty dictionaries otherwise
|
||||
def _load_pretrained_embeddings(self, we_path, langs):
|
||||
lpretrained = lpretrained_vocabulary = self._none_dict(langs) # TODO ?
|
||||
lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1)
|
||||
lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs}
|
||||
return lpretrained, lpretrained_vocabulary
|
||||
|
||||
def _none_dict(self, langs):
|
||||
return {l:None for l in langs}
|
||||
|
||||
# instantiates the net, initializes the model parameters, and sets embeddings trainable if requested
|
||||
def _init_Net(self, xavier_uniform=True):
|
||||
model = RNNMultilingualClassifier(
|
||||
output_size=self.nC,
|
||||
hidden_size=self.hidden_size,
|
||||
lvocab_size=self.multilingual_index.l_vocabsize(),
|
||||
learnable_length=0,
|
||||
lpretrained=self.multilingual_index.l_embeddings(),
|
||||
drop_embedding_range=self.multilingual_index.sup_range,
|
||||
drop_embedding_prop=self.sup_drop,
|
||||
post_probabilities=self.posteriors
|
||||
)
|
||||
return model.cuda()
|
||||
|
||||
|
||||
class DocEmbedderList:
|
||||
|
||||
def __init__(self, *embedder_list, aggregation='concat'):
|
||||
assert aggregation in {'concat', 'mean'}, 'unknown aggregation mode, valid are "concat" and "mean"'
|
||||
if len(embedder_list) == 0:
|
||||
embedder_list = []
|
||||
self.embedders = embedder_list
|
||||
self.aggregation = aggregation
|
||||
print(f'Aggregation mode: {self.aggregation}')
|
||||
|
||||
def fit(self, lX, ly, lV=None, tfidf=None):
|
||||
for transformer in self.embedders:
|
||||
_lX = lX
|
||||
if transformer.requires_tfidf:
|
||||
_lX = tfidf
|
||||
transformer.fit(_lX, ly, lV)
|
||||
return self
|
||||
|
||||
def transform(self, lX, tfidf=None):
|
||||
if self.aggregation == 'concat':
|
||||
return self.transform_concat(lX, tfidf)
|
||||
elif self.aggregation == 'mean':
|
||||
return self.transform_mean(lX, tfidf)
|
||||
|
||||
def transform_concat(self, lX, tfidf):
|
||||
if len(self.embedders) == 1:
|
||||
if self.embedders[0].requires_tfidf:
|
||||
lX = tfidf
|
||||
return self.embedders[0].transform(lX)
|
||||
|
||||
some_sparse = False
|
||||
langs = sorted(lX.keys())
|
||||
|
||||
lZparts = {l: [] for l in langs}
|
||||
for transformer in self.embedders:
|
||||
_lX = lX
|
||||
if transformer.requires_tfidf:
|
||||
_lX = tfidf
|
||||
lZ = transformer.transform(_lX)
|
||||
for l in langs:
|
||||
Z = lZ[l]
|
||||
some_sparse = some_sparse or issparse(Z)
|
||||
lZparts[l].append(Z)
|
||||
|
||||
hstacker = hstack if some_sparse else np.hstack
|
||||
return {l: hstacker(lZparts[l]) for l in langs}
|
||||
|
||||
def transform_mean(self, lX, tfidf):
|
||||
if len(self.embedders) == 1:
|
||||
return self.embedders[0].transform(lX)
|
||||
|
||||
langs = sorted(lX.keys())
|
||||
|
||||
lZparts = {l: None for l in langs}
|
||||
|
||||
# min_dim = min([transformer._get_output_dim() for transformer in self.embedders])
|
||||
min_dim = 73 # TODO <---- this should be the number of target classes
|
||||
|
||||
for transformer in self.embedders:
|
||||
_lX = lX
|
||||
if transformer.requires_tfidf:
|
||||
_lX = tfidf
|
||||
lZ = transformer.transform(_lX)
|
||||
nC = min([lZ[lang].shape[1] for lang in langs])
|
||||
for l in langs:
|
||||
Z = lZ[l]
|
||||
if Z.shape[1] > min_dim:
|
||||
print(
|
||||
f'Space Z matrix has more dimensions ({Z.shape[1]}) than the smallest representation {min_dim}.'
|
||||
f'Applying PCA(n_components={min_dim})')
|
||||
pca = PCA(n_components=min_dim)
|
||||
Z = pca.fit(Z).transform(Z)
|
||||
if lZparts[l] is None:
|
||||
lZparts[l] = Z
|
||||
else:
|
||||
lZparts[l] += Z
|
||||
|
||||
n_transformers = len(self.embedders)
|
||||
|
||||
return {l: lZparts[l] / n_transformers for l in langs}
|
||||
|
||||
def fit_transform(self, lX, ly, lV=None, tfidf=None):
|
||||
return self.fit(lX, ly, lV, tfidf).transform(lX, tfidf)
|
||||
|
||||
def best_params(self):
|
||||
return {'todo'}
|
||||
|
||||
def append(self, embedder):
|
||||
self.embedders.append(embedder)
|
||||
|
||||
|
||||
class FeatureSet2Posteriors:
|
||||
def __init__(self, transformer, requires_tfidf=False, l2=True, n_jobs=-1):
|
||||
self.transformer = transformer
|
||||
self.l2 = l2
|
||||
self.n_jobs = n_jobs
|
||||
self.prob_classifier = MetaClassifier(
|
||||
SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
|
||||
self.requires_tfidf = requires_tfidf
|
||||
|
||||
def fit(self, lX, ly, lV=None):
|
||||
if lV is None and hasattr(self.transformer, 'lV'):
|
||||
lV = self.transformer.lV
|
||||
lZ = self.transformer.fit_transform(lX, ly, lV)
|
||||
self.prob_classifier.fit(lZ, ly)
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
lP = self.predict_proba(lX)
|
||||
lP = _normalize(lP, self.l2)
|
||||
return lP
|
||||
|
||||
def fit_transform(self, lX, ly, lV):
|
||||
return self.fit(lX, ly, lV).transform(lX)
|
||||
|
||||
def predict(self, lX, ly=None):
|
||||
lZ = self.transformer.transform(lX)
|
||||
return self.prob_classifier.predict(lZ)
|
||||
|
||||
def predict_proba(self, lX, ly=None):
|
||||
lZ = self.transformer.transform(lX)
|
||||
return self.prob_classifier.predict_proba(lZ)
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Meta-Classifier (aka second-tier learner)
|
||||
# ------------------------------------------------------------------
|
||||
class MetaClassifier:
|
||||
|
||||
def __init__(self, meta_learner, meta_parameters=None, n_jobs=-1, standardize_range=None):
|
||||
self.n_jobs = n_jobs
|
||||
self.model = MonolingualClassifier(base_learner=meta_learner, parameters=meta_parameters, n_jobs=n_jobs)
|
||||
self.standardize_range = standardize_range
|
||||
|
||||
def fit(self, lZ, ly):
|
||||
tinit = time.time()
|
||||
Z, y = self.stack(lZ, ly)
|
||||
|
||||
self.standardizer = StandardizeTransformer(range=self.standardize_range)
|
||||
Z = self.standardizer.fit_transform(Z)
|
||||
|
||||
print('fitting the Z-space of shape={}'.format(Z.shape))
|
||||
self.model.fit(Z, y)
|
||||
self.time = time.time() - tinit
|
||||
|
||||
def stack(self, lZ, ly=None):
|
||||
langs = list(lZ.keys())
|
||||
Z = np.vstack([lZ[lang] for lang in langs]) # Z is the language independent space
|
||||
if ly is not None:
|
||||
y = np.vstack([ly[lang] for lang in langs])
|
||||
return Z, y
|
||||
else:
|
||||
return Z
|
||||
|
||||
def predict(self, lZ, ly=None):
|
||||
lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs)
|
||||
return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
|
||||
|
||||
def predict_proba(self, lZ, ly=None):
|
||||
lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs)
|
||||
return _joblib_transform_multiling(self.model.predict_proba, lZ, n_jobs=self.n_jobs)
|
||||
|
||||
def best_params(self):
|
||||
return self.model.best_params()
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Ensembling (aka Funnelling)
|
||||
# ------------------------------------------------------------------
|
||||
class Funnelling:
|
||||
def __init__(self,
|
||||
vectorizer: TfidfVectorizerMultilingual,
|
||||
first_tier: DocEmbedderList,
|
||||
meta: MetaClassifier):
|
||||
self.vectorizer = vectorizer
|
||||
self.first_tier = first_tier
|
||||
self.meta = meta
|
||||
self.n_jobs = meta.n_jobs
|
||||
|
||||
def fit(self, lX, ly):
|
||||
tfidf_lX = self.vectorizer.fit_transform(lX, ly)
|
||||
lV = self.vectorizer.vocabulary()
|
||||
print('## Fitting first-tier learners!')
|
||||
lZ = self.first_tier.fit_transform(lX, ly, lV, tfidf=tfidf_lX)
|
||||
print('## Fitting meta-learner!')
|
||||
self.meta.fit(lZ, ly)
|
||||
|
||||
def predict(self, lX, ly=None):
|
||||
tfidf_lX = self.vectorizer.transform(lX)
|
||||
lZ = self.first_tier.transform(lX, tfidf=tfidf_lX)
|
||||
ly_ = self.meta.predict(lZ)
|
||||
return ly_
|
||||
|
||||
def best_params(self):
|
||||
return {'1st-tier': self.first_tier.best_params(),
|
||||
'meta': self.meta.best_params()}
|
||||
|
||||
|
||||
class Voting:
|
||||
def __init__(self, *prob_classifiers):
|
||||
assert all([hasattr(p, 'predict_proba') for p in prob_classifiers]), 'not all classifiers are probabilistic'
|
||||
self.prob_classifiers = prob_classifiers
|
||||
|
||||
def fit(self, lX, ly, lV=None):
|
||||
for classifier in self.prob_classifiers:
|
||||
classifier.fit(lX, ly, lV)
|
||||
|
||||
def predict(self, lX, ly=None):
|
||||
lP = {l: [] for l in lX.keys()}
|
||||
for classifier in self.prob_classifiers:
|
||||
lPi = classifier.predict_proba(lX)
|
||||
for l in lX.keys():
|
||||
lP[l].append(lPi[l])
|
||||
|
||||
lP = {l: np.stack(Plist).mean(axis=0) for l, Plist in lP.items()}
|
||||
ly = {l: P > 0.5 for l, P in lP.items()}
|
||||
|
||||
return ly
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# HELPERS
|
||||
# ------------------------------------------------------------------------------
|
||||
|
||||
def load_muse_embeddings(we_path, langs, n_jobs=-1):
|
||||
MUSE = Parallel(n_jobs=n_jobs)(
|
||||
delayed(FastTextMUSE)(we_path, lang) for lang in langs
|
||||
)
|
||||
return {l: MUSE[i] for i, l in enumerate(langs)}
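
# --- Illustrative usage (not part of the original module): loading aligned MUSE vectors for two
# languages from a local directory (the path below is hypothetical) ---
# lMuse = load_muse_embeddings('../embeddings', langs=['en', 'it'], n_jobs=2)
# lMuse['en']  # -> FastTextMUSE wrapper holding the English vectors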
|
||||
|
||||
|
||||
def word_class_embedding_matrix(X, Y, max_label_space=300):
|
||||
WCE = supervised_embeddings_tfidf(X, Y)
|
||||
WCE = zscores(WCE, axis=0)
|
||||
|
||||
nC = Y.shape[1]
|
||||
if nC > max_label_space:
|
||||
print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
|
||||
f'Applying PCA(n_components={max_label_space})')
|
||||
pca = PCA(n_components=max_label_space)
|
||||
WCE = pca.fit(WCE).transform(WCE)
|
||||
|
||||
return WCE
|
||||
|
||||
|
||||
def XdotM(X, M, sif):
|
||||
E = X.dot(M)
|
||||
if sif:
|
||||
print("removing pc...")
|
||||
E = remove_pc(E, npc=1)
|
||||
return E
|
||||
|
||||
|
||||
def _normalize(lX, l2=True):
|
||||
return {l: normalize(X) for l, X in lX.items()} if l2 else lX
|
||||
|
||||
|
||||
class BatchGRU:
|
||||
def __init__(self, batchsize, batches_per_epoch, languages, lpad, max_pad_length=500):
|
||||
self.batchsize = batchsize
|
||||
self.batches_per_epoch = batches_per_epoch
|
||||
self.languages = languages
|
||||
self.lpad=lpad
|
||||
self.max_pad_length=max_pad_length
|
||||
self.init_offset()
|
||||
|
||||
def init_offset(self):
|
||||
self.offset = {lang: 0 for lang in self.languages}
|
||||
|
||||
def batchify(self, l_index, l_post, l_bert, llabels):
|
||||
langs = self.languages
|
||||
l_num_samples = {l:len(l_index[l]) for l in langs}
|
||||
|
||||
max_samples = max(l_num_samples.values())
|
||||
n_batches = max_samples // self.batchsize + 1 * (max_samples % self.batchsize > 0)
|
||||
if self.batches_per_epoch != -1 and self.batches_per_epoch < n_batches:
|
||||
n_batches = self.batches_per_epoch
|
||||
|
||||
for b in range(n_batches):
|
||||
for lang in langs:
|
||||
index, labels = l_index[lang], llabels[lang]
|
||||
offset = self.offset[lang]
|
||||
if offset >= l_num_samples[lang]:
|
||||
offset = 0
|
||||
limit = offset+self.batchsize
|
||||
|
||||
batch_slice = slice(offset, limit)
|
||||
batch = index[batch_slice]
|
||||
batch_labels = labels[batch_slice].toarray()
|
||||
|
||||
post = None
|
||||
bert_emb = None
|
||||
|
||||
batch = pad(batch, pad_index=self.lpad[lang], max_pad_length=self.max_pad_length)
|
||||
batch = torch.LongTensor(batch).cuda()
|
||||
target = torch.FloatTensor(batch_labels).cuda()
|
||||
|
||||
self.offset[lang] = limit
|
||||
|
||||
yield batch, post, bert_emb, target, lang
|
||||
|
||||
|
||||
def pad(index_list, pad_index, max_pad_length=None):
|
||||
pad_length = np.max([len(index) for index in index_list])
|
||||
if max_pad_length is not None:
|
||||
pad_length = min(pad_length, max_pad_length)
|
||||
for i,indexes in enumerate(index_list):
|
||||
index_list[i] = [pad_index]*(pad_length-len(indexes)) + indexes[:pad_length]
|
||||
return index_list
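
# --- Illustrative sketch (not part of the original module): behaviour of pad() on a toy batch;
# shorter sequences are left-padded with pad_index, longer ones truncated to max_pad_length ---
assert pad([[5, 6, 7], [8]], pad_index=0) == [[5, 6, 7], [0, 0, 8]]
assert pad([[5, 6, 7], [8]], pad_index=0, max_pad_length=2) == [[5, 6], [0, 8]]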
|
||||
|
||||
|
||||
def train_gru(model, batcher, ltrain_index, lytr, tinit, logfile, criterion, optim, epoch, method_name, opt,
|
||||
ltrain_posteriors=None, ltrain_bert=None, log_interval=10):
|
||||
_dataset_path = opt.dataset.split('/')[-1].split('_')
|
||||
dataset_id = _dataset_path[0] + _dataset_path[-1]
|
||||
|
||||
loss_history = []
|
||||
model.train()
|
||||
for idx, (batch, post, bert_emb, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, ltrain_bert, lytr)):
|
||||
optim.zero_grad()
|
||||
loss = criterion(model(batch, post, bert_emb, lang), target)
|
||||
loss.backward()
|
||||
clip_gradient(model)
|
||||
optim.step()
|
||||
loss_history.append(loss.item())
|
||||
|
||||
if idx % log_interval == 0:
|
||||
interval_loss = np.mean(loss_history[-log_interval:])
|
||||
print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, '
|
||||
f'Training Loss: {interval_loss:.6f}')
|
||||
|
||||
mean_loss = np.mean(interval_loss)
|
||||
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time.time() - tinit)
|
||||
return mean_loss
|
||||
|
||||
|
||||
def test_gru(model, batcher, ltest_index, ltest_posteriors, lte_bert, lyte, tinit, epoch, logfile, criterion, measure_prefix):
|
||||
loss_history = []
|
||||
model.eval()
|
||||
langs = sorted(ltest_index.keys())
|
||||
predictions = {l: [] for l in langs}
|
||||
yte_stacked = {l: [] for l in langs}
|
||||
batcher.init_offset()
|
||||
for batch, post, bert_emb, target, lang in tqdm(batcher.batchify(ltest_index, ltest_posteriors, lte_bert, lyte),
|
||||
desc='evaluation: '):
|
||||
logits = model(batch, post, bert_emb, lang)
|
||||
loss = criterion(logits, target).item()
|
||||
prediction = predict(logits)
|
||||
predictions[lang].append(prediction)
|
||||
yte_stacked[lang].append(target.detach().cpu().numpy())
|
||||
loss_history.append(loss)
|
||||
|
||||
ly = {l:np.vstack(yte_stacked[l]) for l in langs}
|
||||
ly_ = {l:np.vstack(predictions[l]) for l in langs}
|
||||
l_eval = evaluate(ly, ly_)
|
||||
metrics = []
|
||||
for lang in langs:
|
||||
macrof1, microf1, macrok, microk = l_eval[lang]
|
||||
metrics.append([macrof1, microf1, macrok, microk])
|
||||
if measure_prefix == 'te':
|
||||
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
|
||||
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
|
||||
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
|
||||
|
||||
mean_loss = np.mean(loss_history)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time.time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time.time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time.time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time.time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time.time() - tinit)
|
||||
|
||||
return Mf1
|
||||
|
||||
|
||||
def clip_gradient(model, clip_value=1e-1):
|
||||
params = list(filter(lambda p: p.grad is not None, model.parameters()))
|
||||
for p in params:
|
||||
p.grad.data.clamp_(-clip_value, clip_value)
|
||||
|
||||
|
||||
def init_logfile_nn(method_name, opt):
|
||||
logfile = CSVLog(opt.logfile_gru, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse'])
|
||||
logfile.set_default('dataset', opt.dataset)
|
||||
logfile.set_default('run', opt.seed)
|
||||
logfile.set_default('method', method_name)
|
||||
assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} ' \
|
||||
f'and run {opt.seed} already calculated'
|
||||
return logfile
|
144
src/main_gFun.py
|
@ -1,144 +0,0 @@
|
|||
import os
|
||||
from dataset_builder import MultilingualDataset
|
||||
from learning.transformers import *
|
||||
from util.evaluation import *
|
||||
from util.file import exists
|
||||
from util.results import PolylingualClassificationResults
|
||||
from util.common import *
|
||||
from util.parser_options import *
|
||||
|
||||
if __name__ == '__main__':
|
||||
(op, args) = parser.parse_args()
|
||||
dataset = op.dataset
|
||||
assert exists(dataset), 'Unable to find file '+str(dataset)
|
||||
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
|
||||
assert op.posteriors or op.supervised or op.pretrained or op.mbert or op.gruViewGenerator, \
|
||||
'empty set of document embeddings is not allowed'
|
||||
assert (op.gruWCE or op.gruMUSE) and op.gruViewGenerator, 'Initializing Gated Recurrent embedding layer without ' \
|
||||
'explicit initialization of GRU View Generator'
|
||||
|
||||
l2 = op.l2
|
||||
dataset_file = os.path.basename(dataset)
|
||||
results = PolylingualClassificationResults('../log/' + op.output)
|
||||
allprob = 'Prob' if op.allprob else ''
|
||||
|
||||
# renaming arguments to be printed on log
|
||||
method_name, dataset_name = get_method_name(dataset, op.posteriors, op.supervised, op.pretrained, op.mbert,
|
||||
op.gruViewGenerator, op.gruMUSE, op.gruWCE, op.agg, op.allprob)
|
||||
print(f'Method: gFun{method_name}\nDataset: {dataset_name}')
|
||||
print('-'*50)
|
||||
|
||||
    # set z-score range - with slice(0, 0) the mean is set to 0 and the std to 1, so standardization has no effect
|
||||
standardize_range = slice(0, 0)
|
||||
if op.zscore:
|
||||
standardize_range = None
|
||||
|
||||
# load dataset
|
||||
data = MultilingualDataset.load(dataset)
|
||||
data.set_view(languages=['nl', 'it']) # TODO: DEBUG SETTING
|
||||
data.show_dimensions()
|
||||
lXtr, lytr = data.training()
|
||||
lXte, lyte = data.test()
|
||||
|
||||
# text preprocessing
|
||||
tfidfvectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
|
||||
|
||||
# feature weighting (for word embeddings average)
|
||||
feat_weighting = FeatureWeight(op.feat_weight, agg='mean')
|
||||
|
||||
# document embedding modules aka View Generators
|
||||
doc_embedder = DocEmbedderList(aggregation='mean' if op.agg else 'concat')
|
||||
|
||||
# init View Generators
|
||||
if op.posteriors:
|
||||
"""
|
||||
        View Generator (-X): casts TF-IDF-encoded document representations into posterior probabilities by means
        of a set of SVMs.
|
||||
"""
|
||||
doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True,
|
||||
kernel='linear',
|
||||
C=op.set_c), l2=l2))
|
||||
|
||||
if op.supervised:
|
||||
"""
|
||||
        View Generator (-W): generates document representations via Word-Class Embeddings (WCE).
        Document embeddings are obtained as a weighted sum of the document's constituent word embeddings.
|
||||
"""
|
||||
wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, sif=op.sif)
|
||||
if op.allprob:
|
||||
wce = FeatureSet2Posteriors(wce, requires_tfidf=True, l2=l2)
|
||||
doc_embedder.append(wce)
|
||||
|
||||
if op.pretrained:
|
||||
"""
|
||||
        View Generator (-M): generates document representations via MUSE embeddings (fastText multilingual word
        embeddings). Document embeddings are obtained as a weighted sum of the document's constituent word embeddings.
|
||||
"""
|
||||
muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif)
|
||||
if op.allprob:
|
||||
muse = FeatureSet2Posteriors(muse, requires_tfidf=True, l2=l2)
|
||||
doc_embedder.append(muse)
|
||||
|
||||
if op.gruViewGenerator:
|
||||
"""
|
||||
        View Generator (-G): generates document embeddings by means of a Gated Recurrent Unit. The model can be
        initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, etc.). Such
        document embeddings are then cast into vectors of posterior probabilities via a set of SVMs.
        NB: --allprob has no effect on this View Generator, since its output is already encoded as posterior probabilities
|
||||
"""
|
||||
op.gru_path = '/home/andreapdr/funneling_pdr/checkpoint/gru_viewgen_-rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle' # TODO DEBUG
|
||||
op.gru_path = None
|
||||
rnn_embedder = RecurrentEmbedder(pretrained=op.gruMUSE, supervised=op.gruWCE, multilingual_dataset=data,
|
||||
options=op, model_path=op.gru_path, we_path=op.we_path)
|
||||
doc_embedder.append(rnn_embedder)
|
||||
|
||||
if op.mbert:
|
||||
"""
|
||||
        View Generator (-B): generates document embeddings via the mBERT model.
|
||||
"""
|
||||
op.bert_path = '/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-rcv1-2_run0' # TODO DEBUG
|
||||
mbert = MBertEmbedder(path_to_model=op.bert_path,
|
||||
nC=data.num_categories())
|
||||
if op.allprob:
|
||||
mbert = FeatureSet2Posteriors(mbert, l2=l2)
|
||||
doc_embedder.append(mbert)
|
||||
|
||||
# metaclassifier
|
||||
meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
|
||||
meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf', C=op.set_c),
|
||||
meta_parameters=get_params(op.optimc), standardize_range=standardize_range)
|
||||
|
||||
# ensembling the modules
|
||||
classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)
|
||||
|
||||
print('\n# Fitting Funnelling Architecture...')
|
||||
tinit = time.time()
|
||||
classifier.fit(lXtr, lytr)
|
||||
time = time.time()-tinit
|
||||
|
||||
print('\n# Evaluating ...')
|
||||
l_eval = evaluate_method(classifier, lXte, lyte)
|
||||
|
||||
metrics = []
|
||||
for lang in lXte.keys():
|
||||
macrof1, microf1, macrok, microk = l_eval[lang]
|
||||
metrics.append([macrof1, microf1, macrok, microk])
|
||||
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
|
||||
results.add_row(method='MultiModal',
|
||||
learner='SVM',
|
||||
optimp=op.optimc,
|
||||
sif=op.sif,
|
||||
zscore=op.zscore,
|
||||
l2=op.l2,
|
||||
wescaler=op.feat_weight,
|
||||
pca=op.max_labels_S,
|
||||
id=method_name,
|
||||
dataset=dataset_name,
|
||||
time=time,
|
||||
lang=lang,
|
||||
macrof1=macrof1,
|
||||
microf1=microf1,
|
||||
macrok=macrok,
|
||||
microk=microk,
|
||||
notes='')
|
||||
print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3))
|
|
@ -1,42 +0,0 @@
|
|||
import torch.nn as nn
|
||||
from torch.nn import functional as F
|
||||
import torch
|
||||
|
||||
class CNN_pdr(nn.Module):
|
||||
|
||||
def __init__(self, output_size, out_channels, compositional_dim, vocab_size, emb_dim, embeddings=None, drop_embedding_range=None,
|
||||
drop_embedding_prop=0, drop_prob=0.5):
|
||||
super(CNN_pdr, self).__init__()
|
||||
self.vocab_size = vocab_size
|
||||
self.emb_dim = emb_dim
|
||||
self.embeddings = torch.FloatTensor(embeddings)
|
||||
self.embedding_layer = nn.Embedding(vocab_size, emb_dim, _weight=self.embeddings)
|
||||
self.kernel_heights = kernel_heights=[3,5,7]
|
||||
self.stride = 1
|
||||
self.padding = 0
|
||||
self.drop_embedding_range = drop_embedding_range
|
||||
self.drop_embedding_prop = drop_embedding_prop
|
||||
assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: wrong range'
|
||||
self.nC = 73
|
||||
|
||||
self.conv1 = nn.Conv2d(1, compositional_dim, (self.kernel_heights[0], self.emb_dim), self.stride, self.padding)
|
||||
self.dropout = nn.Dropout(drop_prob)
|
||||
self.label = nn.Linear(len(kernel_heights) * out_channels, output_size)
|
||||
self.fC = nn.Linear(compositional_dim + self.nC, self.nC)
|
||||
|
||||
|
||||
def forward(self, x, svm_output):
|
||||
x = torch.LongTensor(x)
|
||||
svm_output = torch.FloatTensor(svm_output)
|
||||
x = self.embedding_layer(x)
|
||||
x = self.conv1(x.unsqueeze(1))
|
||||
x = F.relu(x.squeeze(3))
|
||||
x = F.max_pool1d(x, x.size()[2]).squeeze(2)
|
||||
x = torch.cat((x, svm_output), 1)
|
||||
x = F.sigmoid(self.fC(x))
|
||||
return x #.detach().numpy()
|
||||
|
||||
# logits = self.label(x)
|
||||
# return logits
|
||||
|
||||
|
|
@ -3,25 +3,29 @@ import torch.nn as nn
|
|||
from torch.nn import functional as F
|
||||
|
||||
|
||||
|
||||
def init_embeddings(pretrained, vocab_size, learnable_length, device='cuda'):
|
||||
def init_embeddings(pretrained, vocab_size, learnable_length):
|
||||
"""
|
||||
    Compute the embedding matrix
    :param pretrained: tensor of pre-trained word embeddings (kept frozen), or None
    :param vocab_size: size of the vocabulary
    :param learnable_length: number of additional, trainable embedding dimensions
    :return: (pretrained_embeddings, learnable_embeddings, embedding_length)
|
||||
"""
|
||||
pretrained_embeddings = None
|
||||
pretrained_length = 0
|
||||
if pretrained is not None:
|
||||
pretrained_length = pretrained.shape[1]
|
||||
assert pretrained.shape[0] == vocab_size, 'pre-trained matrix does not match with the vocabulary size'
|
||||
pretrained_embeddings = nn.Embedding(vocab_size, pretrained_length)
|
||||
# requires_grad=False sets the embedding layer as NOT trainable
|
||||
pretrained_embeddings.weight = nn.Parameter(pretrained, requires_grad=False)
|
||||
# pretrained_embeddings.to(device)
|
||||
|
||||
learnable_embeddings = None
|
||||
if learnable_length > 0:
|
||||
learnable_embeddings = nn.Embedding(vocab_size, learnable_length)
|
||||
# learnable_embeddings.to(device)
|
||||
|
||||
embedding_length = learnable_length + pretrained_length
|
||||
assert embedding_length > 0, '0-size embeddings'
|
||||
|
||||
return pretrained_embeddings, learnable_embeddings, embedding_length
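
# --- Illustrative sketch (not part of the original module): combining a frozen pre-trained matrix
# with a small trainable one; the sizes below are toy values ---
import torch

pretrained_toy = torch.randn(1000, 300)   # stand-in for a 300-d aligned (e.g., MUSE-like) matrix
pre_emb, learn_emb, emb_len = init_embeddings(pretrained_toy, vocab_size=1000, learnable_length=50)
assert emb_len == 300 + 50                # frozen pre-trained dimensions + trainable dimensions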
|
||||
|
||||
|
||||
|
|
|
@ -1,9 +1,24 @@
|
|||
import numpy as np
|
||||
import time
|
||||
from scipy.sparse import issparse
|
||||
from sklearn.multiclass import OneVsRestClassifier
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
|
||||
import numpy as np
|
||||
from joblib import Parallel, delayed
|
||||
from scipy.sparse import issparse
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.multiclass import OneVsRestClassifier
|
||||
from sklearn.svm import SVC
|
||||
|
||||
from src.util.standardizer import StandardizeTransformer
|
||||
|
||||
|
||||
def get_learner(calibrate=False, kernel='linear', C=1):
|
||||
"""
|
||||
instantiate scikit Support Vector Classifier
|
||||
:param calibrate: boolean, whether to return posterior probabilities or not
|
||||
:param kernel: string,kernel to be applied to the SVC
|
||||
:param C: int or dict {'C': list of integer}, Regularization parameter
|
||||
:return: Support Vector Classifier
|
||||
"""
|
||||
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=C, random_state=1, gamma='auto', verbose=False)
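
# --- Illustrative usage (not part of the original module): the two configurations used elsewhere in
# the code base, a calibrated linear SVC for the first tier and an RBF SVC for the meta-classifier;
# C is left at a toy default of 1 here ---
first_tier_svm = get_learner(calibrate=True, kernel='linear', C=1)
meta_svm = get_learner(calibrate=False, kernel='rbf', C=1)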
|
||||
|
||||
|
||||
def _sort_if_sparse(X):
|
||||
|
@ -13,7 +28,7 @@ def _sort_if_sparse(X):
|
|||
|
||||
def _joblib_transform_multiling(transformer, lX, n_jobs=-1):
|
||||
if n_jobs == 1:
|
||||
return {lang:transformer(lX[lang]) for lang in lX.keys()}
|
||||
return {lang: transformer(lX[lang]) for lang in lX.keys()}
|
||||
else:
|
||||
langs = list(lX.keys())
|
||||
transformations = Parallel(n_jobs=n_jobs)(delayed(transformer)(lX[lang]) for lang in langs)
|
||||
|
@ -25,11 +40,11 @@ class TrivialRejector:
|
|||
self.cats = y.shape[1]
|
||||
return self
|
||||
|
||||
def decision_function(self, X): return np.zeros((X.shape[0],self.cats))
|
||||
def decision_function(self, X): return np.zeros((X.shape[0], self.cats))
|
||||
|
||||
def predict(self, X): return np.zeros((X.shape[0],self.cats))
|
||||
def predict(self, X): return np.zeros((X.shape[0], self.cats))
|
||||
|
||||
def predict_proba(self, X): return np.zeros((X.shape[0],self.cats))
|
||||
def predict_proba(self, X): return np.zeros((X.shape[0], self.cats))
|
||||
|
||||
def best_params(self): return {}
|
||||
|
||||
|
@ -38,6 +53,7 @@ class NaivePolylingualClassifier:
|
|||
"""
|
||||
    A mere set of independent MonolingualClassifiers
|
||||
"""
|
||||
|
||||
def __init__(self, base_learner, parameters=None, n_jobs=-1):
|
||||
self.base_learner = base_learner
|
||||
self.parameters = parameters
|
||||
|
@ -58,10 +74,11 @@ class NaivePolylingualClassifier:
|
|||
_sort_if_sparse(lX[lang])
|
||||
|
||||
models = Parallel(n_jobs=self.n_jobs)\
|
||||
(delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]),ly[lang]) for lang in langs)
|
||||
(delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]), ly[lang]) for
|
||||
lang in langs)
|
||||
|
||||
self.model = {lang: models[i] for i, lang in enumerate(langs)}
|
||||
self.empty_categories = {lang:self.model[lang].empty_categories for lang in langs}
|
||||
self.empty_categories = {lang: self.model[lang].empty_categories for lang in langs}
|
||||
self.time = time.time() - tinit
|
||||
return self
|
||||
|
||||
|
@ -72,9 +89,9 @@ class NaivePolylingualClassifier:
|
|||
"""
|
||||
assert self.model is not None, 'predict called before fit'
|
||||
assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function'
|
||||
langs=list(lX.keys())
|
||||
langs = list(lX.keys())
|
||||
scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].decision_function)(lX[lang]) for lang in langs)
|
||||
return {lang:scores[i] for i,lang in enumerate(langs)}
|
||||
return {lang: scores[i] for i, lang in enumerate(langs)}
|
||||
|
||||
def predict_proba(self, lX):
|
||||
"""
|
||||
|
@ -83,9 +100,10 @@ class NaivePolylingualClassifier:
|
|||
"""
|
||||
assert self.model is not None, 'predict called before fit'
|
||||
assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function'
|
||||
langs=list(lX.keys())
|
||||
scores = Parallel(n_jobs=self.n_jobs, max_nbytes=None)(delayed(self.model[lang].predict_proba)(lX[lang]) for lang in langs)
|
||||
return {lang:scores[i] for i,lang in enumerate(langs)}
|
||||
langs = list(lX.keys())
|
||||
scores = Parallel(n_jobs=self.n_jobs, max_nbytes=None)(
|
||||
delayed(self.model[lang].predict_proba)(lX[lang]) for lang in langs)
|
||||
return {lang: scores[i] for i, lang in enumerate(langs)}
|
||||
|
||||
def predict(self, lX):
|
||||
"""
|
||||
|
@ -95,14 +113,14 @@ class NaivePolylingualClassifier:
|
|||
assert self.model is not None, 'predict called before fit'
|
||||
assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in predict'
|
||||
if self.n_jobs == 1:
|
||||
return {lang:self.model[lang].transform(lX[lang]) for lang in lX.keys()}
|
||||
return {lang: self.model[lang].transform(lX[lang]) for lang in lX.keys()}
|
||||
else:
|
||||
langs = list(lX.keys())
|
||||
scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].predict)(lX[lang]) for lang in langs)
|
||||
return {lang: scores[i] for i, lang in enumerate(langs)}
|
||||
|
||||
def best_params(self):
|
||||
return {l:model.best_params() for l,model in self.model.items()}
|
||||
return {lang: model.best_params() for lang, model in self.model.items()}
|
||||
|
||||
|
||||
class MonolingualClassifier:
|
||||
|
@ -117,14 +135,13 @@ class MonolingualClassifier:
|
|||
def fit(self, X, y):
|
||||
if X.shape[0] == 0:
|
||||
print('Warning: X has 0 elements, a trivial rejector will be created')
|
||||
self.model = TrivialRejector().fit(X,y)
|
||||
self.model = TrivialRejector().fit(X, y)
|
||||
self.empty_categories = np.arange(y.shape[1])
|
||||
return self
|
||||
|
||||
tinit = time.time()
|
||||
_sort_if_sparse(X)
|
||||
self.empty_categories = np.argwhere(np.sum(y, axis=0)==0).flatten()
|
||||
|
||||
self.empty_categories = np.argwhere(np.sum(y, axis=0) == 0).flatten()
|
||||
# multi-class format
|
||||
if len(y.shape) == 2:
|
||||
if self.parameters is not None:
|
||||
|
@ -142,13 +159,12 @@ class MonolingualClassifier:
|
|||
self.model = GridSearchCV(self.model, param_grid=self.parameters, refit=True, cv=5, n_jobs=self.n_jobs,
|
||||
error_score=0, verbose=10)
|
||||
|
||||
# print(f'fitting: {self.model} on matrices of shape X={X.shape} Y={y.shape}')
|
||||
print(f'fitting: Mono-lingual Classifier on matrices of shape X={X.shape} Y={y.shape}')
|
||||
self.model.fit(X, y)
|
||||
if isinstance(self.model, GridSearchCV):
|
||||
self.best_params_ = self.model.best_params_
|
||||
print('best parameters: ', self.best_params_)
|
||||
self.time=time.time()-tinit
|
||||
self.time = time.time() - tinit
|
||||
return self
|
||||
|
||||
def decision_function(self, X):
|
||||
|
@ -168,4 +184,41 @@ class MonolingualClassifier:
|
|||
return self.model.predict(X)
|
||||
|
||||
def best_params(self):
|
||||
return self.best_params_
|
||||
return self.best_params_
|
||||
|
||||
|
||||
class MetaClassifier:
|
||||
|
||||
def __init__(self, meta_learner, meta_parameters=None, n_jobs=-1, standardize_range=None):
|
||||
self.n_jobs = n_jobs
|
||||
self.model = MonolingualClassifier(base_learner=meta_learner, parameters=meta_parameters, n_jobs=n_jobs)
|
||||
self.standardize_range = standardize_range
|
||||
|
||||
def fit(self, lZ, ly):
|
||||
tinit = time.time()
|
||||
Z, y = self.stack(lZ, ly)
|
||||
|
||||
self.standardizer = StandardizeTransformer(range=self.standardize_range)
|
||||
Z = self.standardizer.fit_transform(Z)
|
||||
|
||||
print('fitting the Z-space of shape={}'.format(Z.shape))
|
||||
self.model.fit(Z, y)
|
||||
self.time = time.time() - tinit
|
||||
|
||||
def stack(self, lZ, ly=None):
|
||||
langs = list(lZ.keys())
|
||||
Z = np.vstack([lZ[lang] for lang in langs])
|
||||
if ly is not None:
|
||||
y = np.vstack([ly[lang] for lang in langs])
|
||||
return Z, y
|
||||
else:
|
||||
return Z
|
||||
|
||||
def predict(self, lZ):
|
||||
lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs)
|
||||
return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
|
||||
|
||||
def predict_proba(self, lZ):
|
||||
lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs)
|
||||
return _joblib_transform_multiling(self.model.predict_proba, lZ, n_jobs=self.n_jobs)
|
||||
|
|
@ -1,8 +1,6 @@
|
|||
#taken from https://github.com/prakashpandey9/Text-Classification-Pytorch/blob/master/models/LSTM.py
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.autograd import Variable
|
||||
from models.helpers import *
|
||||
from torch.autograd import Variable
|
||||
|
||||
|
||||
class RNNMultilingualClassifier(nn.Module):
|
||||
|
|
|
@ -1,249 +0,0 @@
|
|||
from copy import deepcopy
|
||||
import torch
|
||||
from torch.utils.data import Dataset
|
||||
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, BertConfig
|
||||
from sklearn.model_selection import train_test_split
|
||||
from util.evaluation import *
|
||||
from time import time
|
||||
|
||||
|
||||
def predict(logits, classification_type='multilabel'):
|
||||
if classification_type == 'multilabel':
|
||||
prediction = torch.sigmoid(logits) > 0.5
|
||||
elif classification_type == 'singlelabel':
|
||||
prediction = torch.argmax(logits, dim=1).view(-1, 1)
|
||||
else:
|
||||
print('unknown classification type')
|
||||
|
||||
return prediction.detach().cpu().numpy()
|
||||
|
||||
|
||||
class TrainingDataset(Dataset):
|
||||
"""
|
||||
data: dict of lang specific tokenized data
|
||||
labels: dict of lang specific targets
|
||||
"""
|
||||
|
||||
def __init__(self, data, labels):
|
||||
self.langs = data.keys()
|
||||
self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)}
|
||||
|
||||
for i, lang in enumerate(self.langs):
|
||||
_data = data[lang]['input_ids']
|
||||
_data = np.array(_data)
|
||||
_labels = labels[lang]
|
||||
_lang_value = np.full(len(_data), self.lang_ids[lang])
|
||||
|
||||
if i == 0:
|
||||
self.data = _data
|
||||
self.labels = _labels
|
||||
self.lang_index = _lang_value
|
||||
else:
|
||||
self.data = np.vstack((self.data, _data))
|
||||
self.labels = np.vstack((self.labels, _labels))
|
||||
self.lang_index = np.concatenate((self.lang_index, _lang_value))
|
||||
|
||||
def __len__(self):
|
||||
return len(self.data)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
x = self.data[idx]
|
||||
y = self.labels[idx]
|
||||
lang = self.lang_index[idx]
|
||||
|
||||
return x, torch.tensor(y, dtype=torch.float), lang
|
||||
|
||||
def get_lang_ids(self):
|
||||
return self.lang_ids
|
||||
|
||||
def get_nclasses(self):
|
||||
if hasattr(self, 'labels'):
|
||||
return len(self.labels[0])
|
||||
else:
|
||||
print('Method called before init!')
|
||||
|
||||
|
||||
class ExtractorDataset(Dataset):
|
||||
"""
|
||||
data: dict of lang specific tokenized data
|
||||
labels: dict of lang specific targets
|
||||
"""
|
||||
|
||||
def __init__(self, data):
|
||||
self.langs = data.keys()
|
||||
self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)}
|
||||
|
||||
for i, lang in enumerate(self.langs):
|
||||
_data = data[lang]['input_ids']
|
||||
_data = np.array(_data)
|
||||
_lang_value = np.full(len(_data), self.lang_ids[lang])
|
||||
|
||||
if i == 0:
|
||||
self.data = _data
|
||||
self.lang_index = _lang_value
|
||||
else:
|
||||
self.data = np.vstack((self.data, _data))
|
||||
self.lang_index = np.concatenate((self.lang_index, _lang_value))
|
||||
|
||||
def __len__(self):
|
||||
return len(self.data)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
x = self.data[idx]
|
||||
lang = self.lang_index[idx]
|
||||
|
||||
return x, lang
|
||||
|
||||
def get_lang_ids(self):
|
||||
return self.lang_ids
|
||||
|
||||
|
||||
def get_model(n_out):
|
||||
print('# Initializing model ...')
|
||||
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=n_out)
|
||||
return model
|
||||
|
||||
|
||||
def init_optimizer(model, lr, weight_decay=0):
|
||||
no_decay = ['bias', 'LayerNorm.weight']
|
||||
optimizer_grouped_parameters = [
|
||||
{'params': [p for n, p in model.named_parameters()
|
||||
if not any(nd in n for nd in no_decay)],
|
||||
'weight_decay': weight_decay},
|
||||
{'params': [p for n, p in model.named_parameters()
|
||||
if any(nd in n for nd in no_decay)],
|
||||
'weight_decay': 0.0}  # no weight decay for bias and LayerNorm parameters
|
||||
]
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
|
||||
return optimizer
|
||||
|
||||
|
||||
def get_lr(optimizer):
|
||||
for param_group in optimizer.param_groups:
|
||||
return param_group['lr']
|
||||
|
||||
|
||||
def get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop, max_val, seed):
|
||||
l_split_va = deepcopy(l_tokenized_tr)
|
||||
l_split_val_target = {l: [] for l in l_tokenized_tr.keys()}
|
||||
l_split_tr = deepcopy(l_tokenized_tr)
|
||||
l_split_tr_target = {l: [] for l in l_tokenized_tr.keys()}
|
||||
|
||||
for lang in l_tokenized_tr.keys():
|
||||
val_size = int(min(len(l_tokenized_tr[lang]['input_ids']) * val_prop, max_val))
|
||||
l_split_tr[lang]['input_ids'], l_split_va[lang]['input_ids'], l_split_tr_target[lang], l_split_val_target[lang] = \
|
||||
train_test_split(l_tokenized_tr[lang]['input_ids'], l_devel_target[lang], test_size=val_size,
|
||||
random_state=seed, shuffle=True)
|
||||
|
||||
return l_split_tr, l_split_tr_target, l_split_va, l_split_val_target
|
||||
|
||||
|
||||
def do_tokenization(l_dataset, max_len=512, verbose=True):
|
||||
if verbose:
|
||||
print('# Starting Tokenization ...')
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
|
||||
langs = l_dataset.keys()
|
||||
l_tokenized = {}
|
||||
for lang in langs:
|
||||
l_tokenized[lang] = tokenizer(l_dataset[lang],
|
||||
truncation=True,
|
||||
max_length=max_len,
|
||||
padding='max_length')
|
||||
return l_tokenized
|
||||
|
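# --- Illustrative sketch (not part of the diff): how the helpers above are meant to be chained. The toy
# documents and labels are invented; do_tokenization downloads the mBERT tokenizer from HuggingFace.
import numpy as np
from torch.utils.data import DataLoader

l_dataset_demo = {'en': ['first document', 'second document'], 'it': ['primo documento', 'secondo documento']}
l_labels_demo = {'en': np.array([[1, 0], [0, 1]]), 'it': np.array([[1, 1], [0, 1]])}
l_tokenized_demo = do_tokenization(l_dataset_demo, max_len=128)     # dict lang -> encodings with 'input_ids'
dataset_demo = TrainingDataset(l_tokenized_demo, l_labels_demo)     # flattens languages, keeps a language id per sample
loader_demo = DataLoader(dataset_demo, batch_size=2, shuffle=True)
for input_ids_demo, target_demo, lang_idx_demo in loader_demo:
    pass  # lang_idx allows regrouping predictions per language at evaluation time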
||||
|
||||
def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile, log_interval=10):
|
||||
# _dataset_path = opt.dataset.split('/')[-1].split('_')
|
||||
# dataset_id = _dataset_path[0] + _dataset_path[-1]
|
||||
dataset_id = 'TODO fix this!'
|
||||
|
||||
loss_history = []
|
||||
model.train()
|
||||
|
||||
for idx, (batch, target, lang_idx) in enumerate(train_dataloader):
|
||||
optim.zero_grad()
|
||||
out = model(batch.cuda())
|
||||
logits = out[0]
|
||||
loss = criterion(logits, target.cuda())
|
||||
loss.backward()
|
||||
# clip_gradient(model)
|
||||
optim.step()
|
||||
loss_history.append(loss.item())
|
||||
|
||||
if idx % log_interval == 0:
|
||||
interval_loss = np.mean(loss_history[-log_interval:])
|
||||
print(
|
||||
f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
|
||||
|
||||
mean_loss = np.mean(loss_history)
|
||||
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
|
||||
return mean_loss
|
||||
|
||||
|
||||
def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, measure_prefix):
|
||||
print('# Validating model ...')
|
||||
loss_history = []
|
||||
model.eval()
|
||||
langs = lang_ids.keys()
|
||||
id_2_lang = {v: k for k, v in lang_ids.items()}
|
||||
predictions = {l: [] for l in langs}
|
||||
yte_stacked = {l: [] for l in langs}
|
||||
|
||||
for batch, target, lang_idx in test_dataloader:
|
||||
out = model(batch.cuda())
|
||||
logits = out[0]
|
||||
loss = criterion(logits, target.cuda()).item()
|
||||
prediction = predict(logits)
|
||||
loss_history.append(loss)
|
||||
|
||||
# Assign each prediction and target to the per-language dicts (predictions, yte_stacked) according to lang_idx
|
||||
for i, pred in enumerate(prediction):
|
||||
lang_pred = id_2_lang[lang_idx.numpy()[i]]
|
||||
predictions[lang_pred].append(pred)
|
||||
yte_stacked[lang_pred].append(target[i].detach().cpu().numpy())
|
||||
|
||||
ly = {l: np.vstack(yte_stacked[l]) for l in langs}
|
||||
ly_ = {l: np.vstack(predictions[l]) for l in langs}
|
||||
l_eval = evaluate(ly, ly_)
|
||||
metrics = []
|
||||
for lang in langs:
|
||||
macrof1, microf1, macrok, microk = l_eval[lang]
|
||||
metrics.append([macrof1, microf1, macrok, microk])
|
||||
if measure_prefix == 'te':
|
||||
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
|
||||
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
|
||||
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
|
||||
|
||||
mean_loss = np.mean(loss_history)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit)
|
||||
|
||||
return Mf1
|
||||
|
||||
|
||||
def feature_extractor(data, lang_ids, model):
|
||||
"""
|
||||
Hidden states: tuple of torch.FloatTensor (one for the output of the embeddings + one for
|
||||
the output of each layer) of shape (batch_size, sequence_length, hidden_size).
|
||||
"""
|
||||
print('# Feature Extractor Mode...')
|
||||
all_batch_embeddings = {}
|
||||
id2lang = {v: k for k, v in lang_ids.items()}
|
||||
with torch.no_grad():
|
||||
for batch, lang_idx in data:
|
||||
# for batch, target, lang_idx in data:
|
||||
out = model(batch.cuda())
|
||||
last_hidden_state = out[1][-1]
|
||||
batch_embeddings = last_hidden_state[:, 0, :]
|
||||
for i, l_idx in enumerate(lang_idx.numpy()):
|
||||
if id2lang[l_idx] not in all_batch_embeddings.keys():
|
||||
all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy()
|
||||
else:
|
||||
all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]],
|
||||
batch_embeddings[i].detach().cpu().numpy()))
|
||||
|
||||
return all_batch_embeddings, id2lang
|
|
@ -0,0 +1,188 @@
|
|||
import pytorch_lightning as pl
|
||||
import torch
|
||||
from torch.optim.lr_scheduler import StepLR
|
||||
from transformers import BertForSequenceClassification, AdamW
|
||||
|
||||
from src.util.common import define_pad_length, pad
|
||||
from src.util.pl_metrics import CustomF1, CustomK
|
||||
|
||||
|
||||
class BertModel(pl.LightningModule):
|
||||
|
||||
def __init__(self, output_size, stored_path, gpus=None):
|
||||
"""
|
||||
Init Bert model.
|
||||
:param output_size:
|
||||
:param stored_path:
|
||||
:param gpus:
|
||||
"""
|
||||
super().__init__()
|
||||
self.loss = torch.nn.BCEWithLogitsLoss()
|
||||
self.gpus = gpus
|
||||
self.microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
|
||||
self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
|
||||
self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus)
|
||||
self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)
|
||||
# Language specific metrics to compute metrics at epoch level
|
||||
self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
|
||||
self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
|
||||
self.lang_macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)
|
||||
self.lang_microK = CustomK(num_classes=output_size, average='micro', device=self.gpus)
|
||||
|
||||
if stored_path:
|
||||
self.bert = BertForSequenceClassification.from_pretrained(stored_path,
|
||||
num_labels=output_size,
|
||||
output_hidden_states=True)
|
||||
else:
|
||||
self.bert = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased',
|
||||
num_labels=output_size,
|
||||
output_hidden_states=True)
|
||||
self.save_hyperparameters()
|
||||
|
||||
def forward(self, X):
|
||||
logits = self.bert(X)
|
||||
return logits
|
||||
|
||||
def training_step(self, train_batch, batch_idx):
|
||||
X, y, _, batch_langs = train_batch
|
||||
X = torch.cat(X).view([X[0].shape[0], len(X)])
|
||||
y = y.type(torch.FloatTensor)
|
||||
y = y.to('cuda' if self.gpus else 'cpu')
|
||||
logits, _ = self.forward(X)
|
||||
loss = self.loss(logits, y)
|
||||
# Squashing logits through Sigmoid in order to get confidence score
|
||||
predictions = torch.sigmoid(logits) > 0.5
|
||||
microF1 = self.microF1(predictions, y)
|
||||
macroF1 = self.macroF1(predictions, y)
|
||||
microK = self.microK(predictions, y)
|
||||
macroK = self.macroK(predictions, y)
|
||||
self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('train-macroF1', macroF1, on_step=True, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('train-microF1', microF1, on_step=True, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('train-macroK', macroK, on_step=True, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('train-microK', microK, on_step=True, on_epoch=True, prog_bar=False, logger=True)
|
||||
lX, ly = self._reconstruct_dict(predictions, y, batch_langs)
|
||||
return {'loss': loss, 'pred': lX, 'target': ly}
|
||||
|
||||
def training_epoch_end(self, outputs):
|
||||
langs = []
|
||||
for output in outputs:
|
||||
langs.extend(list(output['pred'].keys()))
|
||||
langs = set(langs)
|
||||
# outputs is a list of n dicts of m elements, where n is the number of steps in the epoch and m is the batch size.
|
||||
# here we save epoch level metric values and compute them specifically for each language
|
||||
res_macroF1 = {lang: [] for lang in langs}
|
||||
res_microF1 = {lang: [] for lang in langs}
|
||||
res_macroK = {lang: [] for lang in langs}
|
||||
res_microK = {lang: [] for lang in langs}
|
||||
for output in outputs:
|
||||
lX, ly = output['pred'], output['target']
|
||||
for lang in lX.keys():
|
||||
X, y = lX[lang], ly[lang]
|
||||
lang_macroF1 = self.lang_macroF1(X, y)
|
||||
lang_microF1 = self.lang_microF1(X, y)
|
||||
lang_macroK = self.lang_macroK(X, y)
|
||||
lang_microK = self.lang_microK(X, y)
|
||||
|
||||
res_macroF1[lang].append(lang_macroF1)
|
||||
res_microF1[lang].append(lang_microF1)
|
||||
res_macroK[lang].append(lang_macroK)
|
||||
res_microK[lang].append(lang_microK)
|
||||
for lang in langs:
|
||||
avg_macroF1 = torch.mean(torch.Tensor(res_macroF1[lang]))
|
||||
avg_microF1 = torch.mean(torch.Tensor(res_microF1[lang]))
|
||||
avg_macroK = torch.mean(torch.Tensor(res_macroK[lang]))
|
||||
avg_microK = torch.mean(torch.Tensor(res_microK[lang]))
|
||||
self.logger.experiment.add_scalars('train-langs-macroF1', {f'{lang}': avg_macroF1}, self.current_epoch)
|
||||
self.logger.experiment.add_scalars('train-langs-microF1', {f'{lang}': avg_microF1}, self.current_epoch)
|
||||
self.logger.experiment.add_scalars('train-langs-macroK', {f'{lang}': avg_macroK}, self.current_epoch)
|
||||
self.logger.experiment.add_scalars('train-langs-microK', {f'{lang}': avg_microK}, self.current_epoch)
|
||||
|
||||
def validation_step(self, val_batch, batch_idx):
|
||||
X, y, _, batch_langs = val_batch
|
||||
X = torch.cat(X).view([X[0].shape[0], len(X)])
|
||||
y = y.type(torch.FloatTensor)
|
||||
y = y.to('cuda' if self.gpus else 'cpu')
|
||||
logits, _ = self.forward(X)
|
||||
loss = self.loss(logits, y)
|
||||
predictions = torch.sigmoid(logits) > 0.5
|
||||
microF1 = self.microF1(predictions, y)
|
||||
macroF1 = self.macroF1(predictions, y)
|
||||
microK = self.microK(predictions, y)
|
||||
macroK = self.macroK(predictions, y)
|
||||
self.log('val-loss', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('val-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True)
|
||||
self.log('val-microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True)
|
||||
self.log('val-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
|
||||
self.log('val-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
|
||||
return {'loss': loss}
|
||||
|
||||
def test_step(self, test_batch, batch_idx):
|
||||
X, y, _, batch_langs = test_batch
|
||||
X = torch.cat(X).view([X[0].shape[0], len(X)])
|
||||
y = y.type(torch.FloatTensor)
|
||||
y = y.to('cuda' if self.gpus else 'cpu')
|
||||
logits, _ = self.forward(X)
|
||||
loss = self.loss(logits, y)
|
||||
# Squashing logits through Sigmoid in order to get confidence score
|
||||
predictions = torch.sigmoid(logits) > 0.5
|
||||
microF1 = self.microF1(predictions, y)
|
||||
macroF1 = self.macroF1(predictions, y)
|
||||
microK = self.microK(predictions, y)
|
||||
macroK = self.macroK(predictions, y)
|
||||
self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('test-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
|
||||
self.log('test-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
|
||||
return
|
||||
|
||||
def configure_optimizers(self, lr=3e-5, weight_decay=0.01):
|
||||
no_decay = ['bias', 'LayerNorm.weight']
|
||||
optimizer_grouped_parameters = [
|
||||
{'params': [p for n, p in self.bert.named_parameters()
|
||||
if not any(nd in n for nd in no_decay)],
|
||||
'weight_decay': weight_decay},
|
||||
{'params': [p for n, p in self.bert.named_parameters()
|
||||
if any(nd in n for nd in no_decay)],
|
||||
'weight_decay': 0.0}  # no weight decay for bias and LayerNorm parameters
|
||||
]
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
|
||||
scheduler = StepLR(optimizer, step_size=25, gamma=0.1)
|
||||
return [optimizer], [scheduler]
|
||||
|
||||
def encode(self, lX, batch_size=64):
|
||||
with torch.no_grad():
|
||||
l_embed = {lang: [] for lang in lX.keys()}
|
||||
for lang in sorted(lX.keys()):
|
||||
for i in range(0, len(lX[lang]), batch_size):
|
||||
if i + batch_size > len(lX[lang]):
|
||||
batch = lX[lang][i:len(lX[lang])]
|
||||
else:
|
||||
batch = lX[lang][i:i + batch_size]
|
||||
max_pad_len = define_pad_length(batch)
|
||||
batch = pad(batch, pad_index=self.bert.config.pad_token_id, max_pad_length=max_pad_len)
|
||||
batch = torch.LongTensor(batch).to('cuda' if self.gpus else 'cpu')
|
||||
_, output = self.forward(batch)
|
||||
|
||||
# deleting batch from gpu to avoid cuda OOM
|
||||
del batch
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
doc_embeds = output[-1][:, 0, :]
|
||||
l_embed[lang].append(doc_embeds.cpu())
|
||||
for k, v in l_embed.items():
|
||||
l_embed[k] = torch.cat(v, dim=0).numpy()
|
||||
return l_embed
|
||||
|
||||
@staticmethod
|
||||
def _reconstruct_dict(predictions, y, batch_langs):
|
||||
reconstructed_x = {lang: [] for lang in set(batch_langs)}
|
||||
reconstructed_y = {lang: [] for lang in set(batch_langs)}
|
||||
for i, pred in enumerate(predictions):
|
||||
reconstructed_x[batch_langs[i]].append(pred)
|
||||
reconstructed_y[batch_langs[i]].append(y[i])
|
||||
for k, v in reconstructed_x.items():
|
||||
reconstructed_x[k] = torch.cat(v).view(-1, predictions.shape[1])
|
||||
for k, v in reconstructed_y.items():
|
||||
reconstructed_y[k] = torch.cat(v).view(-1, predictions.shape[1])
|
||||
return reconstructed_x, reconstructed_y
|
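# --- Illustrative sketch (not part of the diff): what _reconstruct_dict does, with made-up values. Given a
# flat batch of predictions and the language each row came from, it regroups the rows per language so that
# the epoch-end hook can compute language-specific metrics.
import torch

preds_demo = torch.tensor([[1, 0], [0, 1], [1, 1]])        # 3 documents, 2 classes
y_demo = torch.tensor([[1, 0], [0, 0], [1, 1]])
batch_langs_demo = ['en', 'it', 'en']                       # language of each row, in batch order
lX_demo, ly_demo = BertModel._reconstruct_dict(preds_demo, y_demo, batch_langs_demo)
# lX_demo == {'en': tensor([[1, 0], [1, 1]]), 'it': tensor([[0, 1]])}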
|
@ -0,0 +1,266 @@
|
|||
# Lightning modules, see https://pytorch-lightning.readthedocs.io/en/latest/lightning_module.html
|
||||
import pytorch_lightning as pl
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import nn
|
||||
from torch.autograd import Variable
|
||||
from torch.optim.lr_scheduler import StepLR
|
||||
from transformers import AdamW
|
||||
|
||||
from src.models.helpers import init_embeddings
|
||||
from src.util.common import define_pad_length, pad
|
||||
from src.util.pl_metrics import CustomF1, CustomK
|
||||
|
||||
|
||||
class RecurrentModel(pl.LightningModule):
|
||||
def __init__(self, lPretrained, langs, output_size, hidden_size, lVocab_size, learnable_length,
|
||||
drop_embedding_range, drop_embedding_prop, gpus=None):
|
||||
"""
|
||||
Init RNN model.
|
||||
:param lPretrained:
|
||||
:param langs:
|
||||
:param output_size:
|
||||
:param hidden_size:
|
||||
:param lVocab_size:
|
||||
:param learnable_length:
|
||||
:param drop_embedding_range:
|
||||
:param drop_embedding_prop:
|
||||
:param gpus:
|
||||
"""
|
||||
super().__init__()
|
||||
self.gpus = gpus
|
||||
self.langs = langs
|
||||
self.lVocab_size = lVocab_size
|
||||
self.learnable_length = learnable_length
|
||||
self.output_size = output_size
|
||||
self.hidden_size = hidden_size
|
||||
self.drop_embedding_range = drop_embedding_range
|
||||
self.drop_embedding_prop = drop_embedding_prop
|
||||
self.loss = torch.nn.BCEWithLogitsLoss()
|
||||
|
||||
self.microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
|
||||
self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
|
||||
self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus)
|
||||
self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)
|
||||
# Language specific metrics to compute metrics at epoch level
|
||||
self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
|
||||
self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
|
||||
self.lang_macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)
|
||||
self.lang_microK = CustomK(num_classes=output_size, average='micro', device=self.gpus)
|
||||
|
||||
self.lPretrained_embeddings = nn.ModuleDict()
|
||||
self.lLearnable_embeddings = nn.ModuleDict()
|
||||
|
||||
self.n_layers = 1
|
||||
self.n_directions = 1
|
||||
self.dropout = nn.Dropout(0.6)
|
||||
|
||||
lstm_out = 256
|
||||
ff1 = 512
|
||||
ff2 = 256
|
||||
|
||||
lpretrained_embeddings = {}
|
||||
llearnable_embeddings = {}
|
||||
|
||||
for lang in self.langs:
|
||||
pretrained = lPretrained[lang] if lPretrained else None
|
||||
pretrained_embeddings, learnable_embeddings, embedding_length = init_embeddings(
|
||||
pretrained, self.lVocab_size[lang], self.learnable_length)
|
||||
lpretrained_embeddings[lang] = pretrained_embeddings
|
||||
llearnable_embeddings[lang] = learnable_embeddings
|
||||
self.embedding_length = embedding_length
|
||||
|
||||
self.lPretrained_embeddings.update(lpretrained_embeddings)
|
||||
self.lLearnable_embeddings.update(llearnable_embeddings)
|
||||
|
||||
self.rnn = nn.GRU(self.embedding_length, hidden_size)
|
||||
self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out)
|
||||
self.linear1 = nn.Linear(lstm_out, ff1)
|
||||
self.linear2 = nn.Linear(ff1, ff2)
|
||||
self.label = nn.Linear(ff2, self.output_size)
|
||||
|
||||
# TODO: we set lPretrained to None here; leaving it at its original value would "bug" the first validation
|
||||
# step (the checkpoint would presumably also store the whole pretrained matrix, making the saving process too slow)
|
||||
lPretrained = None
|
||||
self.save_hyperparameters()
|
||||
|
||||
def forward(self, lX):
|
||||
l_embed = []
|
||||
for lang in sorted(lX.keys()):
|
||||
doc_embedding = self.transform(lX[lang], lang)
|
||||
l_embed.append(doc_embedding)
|
||||
embed = torch.cat(l_embed, dim=0)
|
||||
logits = self.label(embed)
|
||||
return logits
|
||||
|
||||
def transform(self, X, lang):
|
||||
batch_size = X.shape[0]
|
||||
X = self.embed(X, lang)
|
||||
X = self.embedding_dropout(X, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop,
|
||||
training=self.training)
|
||||
X = X.permute(1, 0, 2)
|
||||
h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size).to(self.device))
|
||||
output, _ = self.rnn(X, h_0)
|
||||
output = output[-1, :, :]
|
||||
output = F.relu(self.linear0(output))
|
||||
output = self.dropout(F.relu(self.linear1(output)))
|
||||
output = self.dropout(F.relu(self.linear2(output)))
|
||||
return output
|
||||
|
||||
def encode(self, lX, l_pad, batch_size=128):
|
||||
"""
|
||||
Returns the encoded data (i.e., the RNN hidden state at the second feed-forward layer, linear1). Dimensionality is 512.
|
||||
:param lX:
|
||||
:param l_pad:
|
||||
:param batch_size:
|
||||
:return:
|
||||
"""
|
||||
with torch.no_grad():
|
||||
l_embed = {lang: [] for lang in lX.keys()}
|
||||
for lang in sorted(lX.keys()):
|
||||
for i in range(0, len(lX[lang]), batch_size):
|
||||
if i+batch_size > len(lX[lang]):
|
||||
batch = lX[lang][i:len(lX[lang])]
|
||||
else:
|
||||
batch = lX[lang][i:i+batch_size]
|
||||
max_pad_len = define_pad_length(batch)
|
||||
batch = pad(batch, pad_index=l_pad[lang], max_pad_length=max_pad_len)
|
||||
X = torch.LongTensor(batch).to('cuda' if self.gpus else 'cpu')
|
||||
_batch_size = X.shape[0]
|
||||
X = self.embed(X, lang)
|
||||
X = self.embedding_dropout(X, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop,
|
||||
training=self.training)
|
||||
X = X.permute(1, 0, 2)
|
||||
h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, _batch_size, self.hidden_size).to(self.device))
|
||||
output, _ = self.rnn(X, h_0)
|
||||
output = output[-1, :, :]
|
||||
output = F.relu(self.linear0(output))
|
||||
output = self.dropout(F.relu(self.linear1(output)))
|
||||
l_embed[lang].append(output.cpu())
|
||||
for k, v in l_embed.items():
|
||||
l_embed[k] = torch.cat(v, dim=0).numpy()
|
||||
return l_embed
|
||||
|
||||
def training_step(self, train_batch, batch_idx):
|
||||
lX, ly = train_batch
|
||||
logits = self.forward(lX)
|
||||
_ly = []
|
||||
for lang in sorted(lX.keys()):
|
||||
_ly.append(ly[lang])
|
||||
y = torch.cat(_ly, dim=0)
|
||||
loss = self.loss(logits, y)
|
||||
# Squashing logits through Sigmoid in order to get confidence score
|
||||
predictions = torch.sigmoid(logits) > 0.5
|
||||
microF1 = self.microF1(predictions, y)
|
||||
macroF1 = self.macroF1(predictions, y)
|
||||
microK = self.microK(predictions, y)
|
||||
macroK = self.macroK(predictions, y)
|
||||
self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('train-macroF1', macroF1, on_step=True, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('train-microF1', microF1, on_step=True, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('train-macroK', macroK, on_step=True, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('train-microK', microK, on_step=True, on_epoch=True, prog_bar=False, logger=True)
|
||||
re_lX = self._reconstruct_dict(predictions, ly)
|
||||
return {'loss': loss, 'pred': re_lX, 'target': ly}
|
||||
|
||||
def training_epoch_end(self, outputs):
|
||||
# outputs is a list of n dicts of m elements, where n is the number of steps in the epoch and m is the batch size.
|
||||
# here we save epoch level metric values and compute them specifically for each language
|
||||
res_macroF1 = {lang: [] for lang in self.langs}
|
||||
res_microF1 = {lang: [] for lang in self.langs}
|
||||
res_macroK = {lang: [] for lang in self.langs}
|
||||
res_microK = {lang: [] for lang in self.langs}
|
||||
for output in outputs:
|
||||
lX, ly = output['pred'], output['target']
|
||||
for lang in lX.keys():
|
||||
X, y = lX[lang], ly[lang]
|
||||
lang_macroF1 = self.lang_macroF1(X, y)
|
||||
lang_microF1 = self.lang_microF1(X, y)
|
||||
lang_macroK = self.lang_macroK(X, y)
|
||||
lang_microK = self.lang_microK(X, y)
|
||||
|
||||
res_macroF1[lang].append(lang_macroF1)
|
||||
res_microF1[lang].append(lang_microF1)
|
||||
res_macroK[lang].append(lang_macroK)
|
||||
res_microK[lang].append(lang_microK)
|
||||
for lang in self.langs:
|
||||
avg_macroF1 = torch.mean(torch.Tensor(res_macroF1[lang]))
|
||||
avg_microF1 = torch.mean(torch.Tensor(res_microF1[lang]))
|
||||
avg_macroK = torch.mean(torch.Tensor(res_macroK[lang]))
|
||||
avg_microK = torch.mean(torch.Tensor(res_microK[lang]))
|
||||
self.logger.experiment.add_scalars('train-langs-macroF1', {f'{lang}': avg_macroF1}, self.current_epoch)
|
||||
self.logger.experiment.add_scalars('train-langs-microF1', {f'{lang}': avg_microF1}, self.current_epoch)
|
||||
self.logger.experiment.add_scalars('train-langs-macroK', {f'{lang}': avg_macroK}, self.current_epoch)
|
||||
self.logger.experiment.add_scalars('train-langs-microK', {f'{lang}': avg_microK}, self.current_epoch)
|
||||
|
||||
def validation_step(self, val_batch, batch_idx):
|
||||
lX, ly = val_batch
|
||||
logits = self.forward(lX)
|
||||
_ly = []
|
||||
for lang in sorted(lX.keys()):
|
||||
_ly.append(ly[lang])
|
||||
ly = torch.cat(_ly, dim=0)
|
||||
loss = self.loss(logits, ly)
|
||||
predictions = torch.sigmoid(logits) > 0.5
|
||||
microF1 = self.microF1(predictions, ly)
|
||||
macroF1 = self.macroF1(predictions, ly)
|
||||
microK = self.microK(predictions, ly)
|
||||
macroK = self.macroK(predictions, ly)
|
||||
self.log('val-loss', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('val-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True)
|
||||
self.log('val-microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True)
|
||||
self.log('val-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
|
||||
self.log('val-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
|
||||
return {'loss': loss}
|
||||
|
||||
def test_step(self, test_batch, batch_idx):
|
||||
lX, ly = test_batch
|
||||
logits = self.forward(lX)
|
||||
_ly = []
|
||||
for lang in sorted(lX.keys()):
|
||||
_ly.append(ly[lang])
|
||||
ly = torch.cat(_ly, dim=0)
|
||||
predictions = torch.sigmoid(logits) > 0.5
|
||||
microF1 = self.microF1(predictions, ly)
|
||||
macroF1 = self.macroF1(predictions, ly)
|
||||
microK = self.microK(predictions, ly)
|
||||
macroK = self.macroK(predictions, ly)
|
||||
self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('test-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
|
||||
self.log('test-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
|
||||
return
|
||||
|
||||
def embed(self, X, lang):
|
||||
input_list = []
|
||||
if self.lPretrained_embeddings[lang]:
|
||||
input_list.append(self.lPretrained_embeddings[lang](X))
|
||||
if self.lLearnable_embeddings[lang]:
|
||||
input_list.append(self.lLearnable_embeddings[lang](X))
|
||||
return torch.cat(tensors=input_list, dim=2)
|
||||
|
||||
def embedding_dropout(self, X, drop_range, p_drop=0.5, training=True):
|
||||
if p_drop > 0 and training and drop_range is not None:
|
||||
p = p_drop
|
||||
drop_from, drop_to = drop_range
|
||||
m = drop_to - drop_from # length of the supervised embedding
|
||||
l = X.shape[2] # total embedding length
|
||||
corr = (1 - p)
|
||||
X[:, :, drop_from:drop_to] = corr * F.dropout(X[:, :, drop_from:drop_to], p=p)
|
||||
X /= (1 - (p * m / l))
|
||||
return X
|
||||
|
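# --- Note on embedding_dropout (illustrative, not part of the diff): dropout is applied only to the
# supervised (WCE) slice [drop_from:drop_to]; multiplying by (1 - p) undoes PyTorch's inverted scaling so the
# slice is purely masked, and the final division rescales the whole vector for the fraction of coordinates
# zeroed in expectation. With a hypothetical 300-dim pretrained part plus a 73-dim WCE part and p = 0.5:
p_demo, m_demo, l_demo = 0.5, 73, 373          # drop prob, WCE width, total embedding width
scale_demo = 1 - (p_demo * m_demo / l_demo)    # ~0.902: every surviving coordinate is divided by this factor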
||||
def configure_optimizers(self):
|
||||
optimizer = AdamW(self.parameters(), lr=1e-3)
|
||||
scheduler = StepLR(optimizer, step_size=25, gamma=0.5)
|
||||
return [optimizer], [scheduler]
|
||||
|
||||
@staticmethod
|
||||
def _reconstruct_dict(X, ly):
|
||||
reconstructed = {}
|
||||
_start = 0
|
||||
for lang in sorted(ly.keys()):
|
||||
lang_batchsize = len(ly[lang])
|
||||
reconstructed[lang] = X[_start:_start+lang_batchsize]
|
||||
_start += lang_batchsize
|
||||
return reconstructed
|
|
@ -1,11 +0,0 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
# df = pd.read_csv("/home/andreapdr/funneling_pdr/src/results/final_results.csv", delimiter='\t')
|
||||
df = pd.read_csv("10run_rcv_final_results.csv", delimiter='\t')
|
||||
pivot = pd.pivot_table(df, values=['macrof1', 'microf1', 'macrok', 'microk'], index=['method', 'id', 'optimp', 'zscore', 'l2', 'wescaler', 'pca', 'sif'], aggfunc=[np.mean, np.std])
|
||||
with pd.option_context('display.max_rows', None):
|
||||
print(pivot.round(3))
|
||||
print('Finished ...')
|
||||
|
||||
|
|
@ -1,6 +1,7 @@
|
|||
import numpy as np
|
||||
from sklearn.decomposition import TruncatedSVD
|
||||
|
||||
|
||||
def get_weighted_average(We, x, w):
|
||||
"""
|
||||
Compute the weighted average vectors
|
||||
|
@ -15,6 +16,7 @@ def get_weighted_average(We, x, w):
|
|||
emb[i,:] = w[i,:].dot(We[x[i,:],:]) / np.count_nonzero(w[i,:])
|
||||
return emb
|
||||
|
||||
|
||||
def compute_pc(X,npc=1):
|
||||
"""
|
||||
Compute the principal components.
|
||||
|
@ -26,6 +28,7 @@ def compute_pc(X,npc=1):
|
|||
svd.fit(X)
|
||||
return svd.components_
|
||||
|
||||
|
||||
def remove_pc(X, npc=1):
|
||||
"""
|
||||
Remove the projection on the principal components
|
||||
|
@ -34,7 +37,7 @@ def remove_pc(X, npc=1):
|
|||
:return: XX[i, :] is the data point after removing its projection
|
||||
"""
|
||||
pc = compute_pc(X, npc)
|
||||
if npc==1:
|
||||
if npc == 1:
|
||||
XX = X - X.dot(pc.transpose()) * pc
|
||||
else:
|
||||
XX = X - X.dot(pc.transpose()).dot(pc)
|
||||
|
|
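# --- Illustrative sketch (not part of the diff): remove_pc subtracts from each row its projection onto the
# top principal component(s), i.e. the common-component removal step of SIF. The SVD parameters below are
# assumptions chosen only for the example.
import numpy as np
from sklearn.decomposition import TruncatedSVD

X_demo = np.random.rand(10, 5)                          # 10 sentence embeddings of dimension 5 (toy data)
pc_demo = TruncatedSVD(n_components=1, random_state=0).fit(X_demo).components_   # shape (1, 5)
XX_demo = X_demo - X_demo.dot(pc_demo.T).dot(pc_demo)   # each row minus its projection on the first PC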
|
@ -1,12 +1,4 @@
|
|||
import subprocess
|
||||
import warnings
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.model_selection import train_test_split
|
||||
from embeddings.supervised import get_supervised_embeddings
|
||||
# from learning.transformers import PosteriorProbabilitiesEmbedder, TfidfVectorizerMultilingual
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
import torch
|
||||
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
||||
|
||||
|
@ -107,201 +99,101 @@ class Index:
|
|||
devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True
|
||||
)
|
||||
|
||||
print(f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}')
|
||||
from src.util.embeddings_manager import supervised_embeddings_tfidf
|
||||
|
||||
def get_word_list(self):
|
||||
def extract_word_list(word2index):
|
||||
return [w for w,i in sorted(word2index.items(), key=lambda x: x[1])]
|
||||
|
||||
word_list = extract_word_list(self.word2index)
|
||||
word_list += extract_word_list(self.out_of_vocabulary)
|
||||
return word_list
|
||||
class TfidfVectorizerMultilingual:
|
||||
|
||||
def compose_embedding_matrix(self, pretrained, supervised, Xtr=None, Ytr=None):
|
||||
print(f'[generating embedding matrix for lang {self.lang}]')
|
||||
def __init__(self, **kwargs):
|
||||
self.kwargs = kwargs
|
||||
|
||||
self.wce_range = None
|
||||
embedding_parts = []
|
||||
def fit(self, lX, ly=None):
|
||||
self.langs = sorted(lX.keys())
|
||||
self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs}
|
||||
return self
|
||||
|
||||
if pretrained is not None:
|
||||
print('\t[pretrained-matrix]')
|
||||
word_list = self.get_word_list()
|
||||
muse_embeddings = pretrained.extract(word_list)
|
||||
embedding_parts.append(muse_embeddings)
|
||||
del pretrained
|
||||
def transform(self, lX):
|
||||
return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs}
|
||||
|
||||
if supervised:
|
||||
print('\t[supervised-matrix]')
|
||||
F = get_supervised_embeddings(Xtr, Ytr, reduction=None, method='dotn')
|
||||
num_missing_rows = self.vocabsize - F.shape[0]
|
||||
F = np.vstack((F, np.zeros(shape=(num_missing_rows, F.shape[1]))))
|
||||
F = torch.from_numpy(F).float()
|
||||
def fit_transform(self, lX, ly=None):
|
||||
return self.fit(lX, ly).transform(lX)
|
||||
|
||||
offset = 0
|
||||
if embedding_parts:
|
||||
offset = embedding_parts[0].shape[1]
|
||||
self.wce_range = [offset, offset + F.shape[1]]
|
||||
def vocabulary(self, l=None):
|
||||
if l is None:
|
||||
return {l: self.vectorizer[l].vocabulary_ for l in self.langs}
|
||||
else:
|
||||
return self.vectorizer[l].vocabulary_
|
||||
|
||||
embedding_parts.append(F)
|
||||
def get_analyzer(self, l=None):
|
||||
if l is None:
|
||||
return {l: self.vectorizer[l].build_analyzer() for l in self.langs}
|
||||
else:
|
||||
return self.vectorizer[l].build_analyzer()
|
||||
|
||||
make_dumps = False
|
||||
if make_dumps:
|
||||
print(f'Dumping Embedding Matrices ...')
|
||||
import pickle
|
||||
with open(f'../dumps/dump_{self.lang}_rcv.pkl', 'wb') as outfile:
|
||||
pickle.dump((self.lang, embedding_parts, self.word2index), outfile)
|
||||
with open(f'../dumps/corpus_{self.lang}_rcv.pkl', 'wb') as outfile2:
|
||||
pickle.dump((self.lang, self.devel_raw, self.devel_target), outfile2)
|
||||
|
||||
self.embedding_matrix = torch.cat(embedding_parts, dim=1)
|
||||
|
||||
print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]')
|
||||
def _normalize(lX, l2=True):
|
||||
return {lang: normalize(X) for lang, X in lX.items()} if l2 else lX
|
||||
|
||||
|
||||
def none_dict(langs):
|
||||
return {l:None for l in langs}
|
||||
return {l: None for l in langs}
|
||||
|
||||
|
||||
class MultilingualIndex:
|
||||
def __init__(self): #, add_language_trace=False):
|
||||
def __init__(self):
|
||||
"""
|
||||
Class that contains monolingual Indexes
|
||||
"""
|
||||
self.l_index = {}
|
||||
self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
|
||||
# self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True, max_features=25000)
|
||||
# self.add_language_trace=add_language_trace}
|
||||
|
||||
def index(self, l_devel_raw, l_devel_target, l_test_raw, l_pretrained_vocabulary):
|
||||
def index(self, l_devel_raw, l_devel_target, l_test_raw, l_test_target, l_pretrained_vocabulary=None):
|
||||
self.langs = sorted(l_devel_raw.keys())
|
||||
|
||||
#build the vocabularies
|
||||
self.l_vectorizer.fit(l_devel_raw)
|
||||
l_vocabulary = self.l_vectorizer.vocabulary()
|
||||
l_analyzer = self.l_vectorizer.get_analyzer()
|
||||
if l_pretrained_vocabulary is None:
|
||||
l_pretrained_vocabulary = none_dict(self.langs)
|
||||
|
||||
for l in self.langs:
|
||||
self.l_index[l] = Index(l_devel_raw[l], l_devel_target[l], l_test_raw[l], l)
|
||||
self.l_index[l].index(l_pretrained_vocabulary[l], l_analyzer[l], l_vocabulary[l])
|
||||
for lang in self.langs:
|
||||
# Init monolingual Index
|
||||
self.l_index[lang] = Index(l_devel_raw[lang], l_devel_target[lang], l_test_raw[lang], l_test_target[lang],
|
||||
lang)
|
||||
# call to index() function of monolingual Index
|
||||
self.l_index[lang].index(l_pretrained_vocabulary[lang], l_analyzer[lang], l_vocabulary[lang])
|
||||
|
||||
def train_val_split(self, val_prop=0.2, max_val=2000, seed=42):
|
||||
for l,index in self.l_index.items():
|
||||
for l, index in self.l_index.items():
|
||||
index.train_val_split(val_prop, max_val, seed=seed)
|
||||
|
||||
|
||||
|
||||
def embedding_matrices(self, lpretrained, supervised):
|
||||
"""
|
||||
Extracts from the pretrained embeddings the words found in the training dataset, then for each language
|
||||
calls the respective monolingual index to build the embedding matrix (if supervised, WCEs are concatenated
|
||||
to the unsupervised vectors).
|
||||
:param lpretrained: dict {lang : matrix of word-embeddings }
|
||||
:param supervised: bool, whether to deploy Word-Class Embeddings or not
|
||||
:return: self
|
||||
"""
|
||||
lXtr = self.get_lXtr() if supervised else none_dict(self.langs)
|
||||
lYtr = self.l_train_target() if supervised else none_dict(self.langs)
|
||||
for l,index in self.l_index.items():
|
||||
index.compose_embedding_matrix(lpretrained[l], supervised, lXtr[l], lYtr[l])
|
||||
lWordList = self.get_wordlist()
|
||||
lExtracted = lpretrained.extract(lWordList)
|
||||
for lang, index in self.l_index.items():
|
||||
# if supervised, concatenate the pretrained (unsupervised) embedding matrix
|
||||
# with the supervised word-class embeddings (WCE)
|
||||
index.compose_embedding_matrix(lExtracted[lang], supervised, lXtr[lang], lYtr[lang])
|
||||
self.sup_range = index.wce_range
|
||||
return self
|
||||
|
||||
# TODO circular import with transformers --> when generating posterior prob, we import PosteriorProbabilitiesEmbedder which is defined in transformers
|
||||
# def posterior_probabilities(self, max_training_docs_by_lang=5000, store_posteriors=False, stored_post=False):
|
||||
# # choose a maximum of "max_training_docs_by_lang" for training the calibrated SVMs
|
||||
# timeit = time.time()
|
||||
# lXtr = {l:Xtr for l,Xtr in self.get_lXtr().items()}
|
||||
# lYtr = {l:Ytr for l,Ytr in self.l_train_target().items()}
|
||||
# if not stored_post:
|
||||
# for l in self.langs:
|
||||
# n_elements = lXtr[l].shape[0]
|
||||
# if n_elements > max_training_docs_by_lang:
|
||||
# choice = np.random.permutation(n_elements)[:max_training_docs_by_lang]
|
||||
# lXtr[l] = lXtr[l][choice]
|
||||
# lYtr[l] = lYtr[l][choice]
|
||||
#
|
||||
# # train the posterior probabilities embedder
|
||||
# print('[posteriors] training a calibrated SVM')
|
||||
# learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto')
|
||||
# prob_embedder = PosteriorProbabilitiesEmbedder(learner, l2=False)
|
||||
# prob_embedder.fit(lXtr, lYtr)
|
||||
#
|
||||
# # transforms the training, validation, and test sets into posterior probabilities
|
||||
# print('[posteriors] generating posterior probabilities')
|
||||
# lPtr = prob_embedder.transform(self.get_lXtr())
|
||||
# lPva = prob_embedder.transform(self.get_lXva())
|
||||
# lPte = prob_embedder.transform(self.get_lXte())
|
||||
# # NB: Check splits indices !
|
||||
# if store_posteriors:
|
||||
# import pickle
|
||||
# with open('../dumps/posteriors_fulljrc.pkl', 'wb') as outfile:
|
||||
# pickle.dump([lPtr, lPva, lPte], outfile)
|
||||
# print(f'Successfully dumped posteriors!')
|
||||
# else:
|
||||
# import pickle
|
||||
# with open('../dumps/posteriors_fulljrc.pkl', 'rb') as infile:
|
||||
# lPtr, lPva, lPte = pickle.load(infile)
|
||||
# print(f'Successfully loaded stored posteriors!')
|
||||
# print(f'[posteriors] done in {time.time() - timeit}')
|
||||
# return lPtr, lPva, lPte
|
||||
|
||||
def bert_embeddings(self, bert_path, max_len=512, batch_size=64, stored_embeddings=False):
|
||||
show_gpu('GPU memory before initializing mBert model:')
|
||||
# TODO: load dumped embeddings?
|
||||
from experiment_scripts.main_mbert_extractor import do_tokenization, ExtractorDataset, DataLoader
|
||||
from transformers import BertConfig, BertForSequenceClassification
|
||||
|
||||
print('[mBERT] generating mBERT doc embeddings')
|
||||
lXtr_raw = self.get_raw_lXtr()
|
||||
lXva_raw = self.get_raw_lXva()
|
||||
lXte_raw = self.get_raw_lXte()
|
||||
|
||||
print('# Tokenizing datasets')
|
||||
l_tokenized_tr = do_tokenization(lXtr_raw, max_len=max_len, verbose=False)
|
||||
tr_dataset = ExtractorDataset(l_tokenized_tr)
|
||||
tr_lang_ids = tr_dataset.lang_ids
|
||||
tr_dataloader = DataLoader(tr_dataset, batch_size=batch_size, shuffle=False)
|
||||
|
||||
l_tokenized_va = do_tokenization(lXva_raw, max_len=max_len, verbose=False)
|
||||
va_dataset = ExtractorDataset(l_tokenized_va)
|
||||
va_lang_ids = va_dataset.lang_ids
|
||||
va_dataloader = DataLoader(va_dataset, batch_size=batch_size, shuffle=False)
|
||||
|
||||
l_tokenized_te = do_tokenization(lXte_raw, max_len=max_len, verbose=False)
|
||||
te_dataset = ExtractorDataset(l_tokenized_te)
|
||||
te_lang_ids = te_dataset.lang_ids
|
||||
te_dataloader = DataLoader(te_dataset, batch_size=batch_size, shuffle=False)
|
||||
|
||||
num_labels = self.l_index[self.langs[0]].val_target.shape[1]
|
||||
config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True,
|
||||
num_labels=num_labels)
|
||||
model = BertForSequenceClassification.from_pretrained(bert_path,
|
||||
config=config).cuda()
|
||||
print('# Extracting document embeddings')
|
||||
tr_bert_embeddings, id2lang_tr = self.do_bert_embeddings(model, tr_dataloader, tr_lang_ids, verbose=False)
|
||||
va_bert_embeddings, id2lang_va = self.do_bert_embeddings(model, va_dataloader, va_lang_ids, verbose=False)
|
||||
te_bert_embeddings, id2lang_te = self.do_bert_embeddings(model, te_dataloader, te_lang_ids, verbose=False)
|
||||
|
||||
show_gpu('GPU memory after running the mBert model:')
|
||||
# Freeing GPU's memory
|
||||
import gc
|
||||
del model, tr_dataloader, va_dataloader, te_dataloader
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
show_gpu('GPU memory after clearing cache:')
|
||||
return tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings
|
||||
|
||||
|
||||
@staticmethod
|
||||
def do_bert_embeddings(model, data, lang_ids, verbose=True):
|
||||
if verbose:
|
||||
print('# Feature Extractor Mode...')
|
||||
all_batch_embeddings = {}
|
||||
id2lang = {v: k for k, v in lang_ids.items()}
|
||||
with torch.no_grad():
|
||||
for batch, lang_idx in data:
|
||||
out = model(batch.cuda())
|
||||
last_hidden_state = out[1][-1]
|
||||
batch_embeddings = last_hidden_state[:, 0, :]
|
||||
for i, l_idx in enumerate(lang_idx.numpy()):
|
||||
if id2lang[l_idx] not in all_batch_embeddings.keys():
|
||||
all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy()
|
||||
else:
|
||||
all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]],
|
||||
batch_embeddings[i].detach().cpu().numpy()))
|
||||
|
||||
return all_batch_embeddings, id2lang
|
||||
def get_wordlist(self):
|
||||
wordlist = {}
|
||||
for lang, index in self.l_index.items():
|
||||
wordlist[lang] = index.get_word_list()
|
||||
return wordlist
|
||||
|
||||
def get_raw_lXtr(self):
|
||||
lXtr_raw = {k:[] for k in self.langs}
|
||||
lXtr_raw = {k: [] for k in self.langs}
|
||||
lYtr_raw = {k: [] for k in self.langs}
|
||||
for lang in self.langs:
|
||||
lXtr_raw[lang] = self.l_index[lang].train_raw
|
||||
|
@ -337,11 +229,14 @@ class MultilingualIndex:
|
|||
self.lXte = self.l_vectorizer.transform({l: index.test_raw for l, index in self.l_index.items()})
|
||||
return self.lXte
|
||||
|
||||
def get_target_dim(self):
|
||||
return self.l_index[self.langs[0]].devel_target.shape[1]
|
||||
|
||||
def l_vocabsize(self):
|
||||
return {l:index.vocabsize for l,index in self.l_index.items()}
|
||||
return {l: index.vocabsize for l, index in self.l_index.items()}
|
||||
|
||||
def l_embeddings(self):
|
||||
return {l:index.embedding_matrix for l,index in self.l_index.items()}
|
||||
return {l: index.embedding_matrix for l, index in self.l_index.items()}
|
||||
|
||||
def l_pad(self):
|
||||
return {l: index.pad_index for l, index in self.l_index.items()}
|
||||
|
@ -349,15 +244,30 @@ class MultilingualIndex:
|
|||
def l_train_index(self):
|
||||
return {l: index.train_index for l, index in self.l_index.items()}
|
||||
|
||||
def l_train_raw_index(self):
|
||||
return {l: index.train_raw for l, index in self.l_index.items()}
|
||||
|
||||
def l_train_target(self):
|
||||
return {l: index.train_target for l, index in self.l_index.items()}
|
||||
|
||||
def l_val_index(self):
|
||||
return {l: index.val_index for l, index in self.l_index.items()}
|
||||
|
||||
def l_val_raw_index(self):
|
||||
return {l: index.val_raw for l, index in self.l_index.items()}
|
||||
|
||||
def l_test_raw_index(self):
|
||||
return {l: index.test_raw for l, index in self.l_index.items()}
|
||||
|
||||
def l_devel_raw_index(self):
|
||||
return {l: index.devel_raw for l, index in self.l_index.items()}
|
||||
|
||||
def l_val_target(self):
|
||||
return {l: index.val_target for l, index in self.l_index.items()}
|
||||
|
||||
def l_test_target(self):
|
||||
return {l: index.test_target for l, index in self.l_index.items()}
|
||||
|
||||
def l_test_index(self):
|
||||
return {l: index.test_index for l, index in self.l_index.items()}
|
||||
|
||||
|
@ -373,161 +283,179 @@ class MultilingualIndex:
|
|||
def l_val(self):
|
||||
return self.l_val_index(), self.l_val_target()
|
||||
|
||||
def l_test(self):
|
||||
return self.l_test_index(), self.l_test_target()
|
||||
|
||||
class Batch:
|
||||
def __init__(self, batchsize, batches_per_epoch, languages, lpad, max_pad_length=500):
|
||||
self.batchsize = batchsize
|
||||
self.batches_per_epoch = batches_per_epoch
|
||||
self.languages = languages
|
||||
self.lpad=lpad
|
||||
self.max_pad_length=max_pad_length
|
||||
self.init_offset()
|
||||
def l_train_raw(self):
|
||||
return self.l_train_raw_index(), self.l_train_target()
|
||||
|
||||
def init_offset(self):
|
||||
self.offset = {lang: 0 for lang in self.languages}
|
||||
def l_val_raw(self):
|
||||
return self.l_val_raw_index(), self.l_val_target()
|
||||
|
||||
def batchify(self, l_index, l_post, l_bert, llabels): # TODO: add bert embedding here...
|
||||
langs = self.languages
|
||||
l_num_samples = {l:len(l_index[l]) for l in langs}
|
||||
def l_test_raw(self):
|
||||
return self.l_test_raw_index(), self.l_test_target()
|
||||
|
||||
max_samples = max(l_num_samples.values())
|
||||
n_batches = max_samples // self.batchsize + 1 * (max_samples % self.batchsize > 0)
|
||||
if self.batches_per_epoch != -1 and self.batches_per_epoch < n_batches:
|
||||
n_batches = self.batches_per_epoch
|
||||
def l_devel_raw(self):
|
||||
return self.l_devel_raw_index(), self.l_devel_target()
|
||||
|
||||
for b in range(n_batches):
|
||||
for lang in langs:
|
||||
index, labels = l_index[lang], llabels[lang]
|
||||
offset = self.offset[lang]
|
||||
if offset >= l_num_samples[lang]:
|
||||
offset = 0
|
||||
limit = offset+self.batchsize
|
||||
|
||||
batch_slice = slice(offset, limit)
|
||||
batch = index[batch_slice]
|
||||
batch_labels = labels[batch_slice].toarray()
|
||||
|
||||
post = None
|
||||
if l_post is not None:
|
||||
post = torch.FloatTensor(l_post[lang][batch_slice]).cuda()
|
||||
|
||||
bert_emb = None
|
||||
if l_bert is not None:
|
||||
bert_emb = torch.FloatTensor(l_bert[lang][batch_slice]).cuda()
|
||||
|
||||
batch = pad(batch, pad_index=self.lpad[lang], max_pad_length=self.max_pad_length)
|
||||
|
||||
batch = torch.LongTensor(batch).cuda()
|
||||
target = torch.FloatTensor(batch_labels).cuda()
|
||||
|
||||
self.offset[lang] = limit
|
||||
|
||||
yield batch, post, bert_emb, target, lang
|
||||
def get_l_pad_index(self):
|
||||
return {l: index.get_pad_index() for l, index in self.l_index.items()}
|
||||
|
||||
|
||||
def batchify(l_index, l_post, llabels, batchsize, lpad, max_pad_length=500):
|
||||
langs = sorted(l_index.keys())
|
||||
nsamples = max([len(l_index[l]) for l in langs])
|
||||
nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0)
|
||||
for b in range(nbatches):
|
||||
for lang in langs:
|
||||
index, labels = l_index[lang], llabels[lang]
|
||||
class Index:
|
||||
def __init__(self, devel_raw, devel_target, test_raw, test_target, lang):
|
||||
"""
|
||||
Monolingual Index, takes care of tokenizing raw data, converting strings to ids, splitting the data into
|
||||
training and validation.
|
||||
:param devel_raw: list of strings, list of raw training texts
|
||||
:param devel_target: target (label) matrix aligned with devel_raw
|
||||
:param test_raw: list of strings, list of raw test texts
|
||||
:param test_target: target (label) matrix aligned with test_raw
|
||||
:param lang: str, the language handled by this monolingual index
|
||||
"""
|
||||
self.lang = lang
|
||||
self.devel_raw = devel_raw
|
||||
self.devel_target = devel_target
|
||||
self.test_raw = test_raw
|
||||
self.test_target = test_target
|
||||
|
||||
if b * batchsize >= len(index):
|
||||
continue
|
||||
batch = index[b*batchsize:(b+1)*batchsize]
|
||||
batch_labels = labels[b*batchsize:(b+1)*batchsize].toarray()
|
||||
post = None
|
||||
if l_post is not None:
|
||||
post = torch.FloatTensor(l_post[lang][b*batchsize:(b+1)*batchsize]).cuda()
|
||||
batch = pad(batch, pad_index=lpad[lang], max_pad_length=max_pad_length)
|
||||
batch = torch.LongTensor(batch)
|
||||
target = torch.FloatTensor(batch_labels)
|
||||
yield batch.cuda(), post, target.cuda(), lang
|
||||
def index(self, pretrained_vocabulary, analyzer, vocabulary):
|
||||
self.word2index = dict(vocabulary)
|
||||
known_words = set(self.word2index.keys())
|
||||
if pretrained_vocabulary is not None:
|
||||
known_words.update(pretrained_vocabulary)
|
||||
|
||||
self.word2index['UNKTOKEN'] = len(self.word2index)
|
||||
self.word2index['PADTOKEN'] = len(self.word2index)
|
||||
self.unk_index = self.word2index['UNKTOKEN']
|
||||
self.pad_index = self.word2index['PADTOKEN']
|
||||
|
||||
# index documents and keep track of test terms outside the development vocabulary that are in Muse (if available)
|
||||
self.out_of_vocabulary = dict()
|
||||
self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index,
|
||||
self.out_of_vocabulary)
|
||||
self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index,
|
||||
self.out_of_vocabulary)
|
||||
|
||||
self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary)
|
||||
|
||||
print(f'[indexing complete for lang {self.lang}] vocabulary-size={self.vocabsize}')
|
||||
|
||||
def get_pad_index(self):
|
||||
return self.pad_index
|
||||
|
||||
def train_val_split(self, val_prop, max_val, seed):
|
||||
devel = self.devel_index
|
||||
target = self.devel_target
|
||||
devel_raw = self.devel_raw
|
||||
|
||||
val_size = int(min(len(devel) * val_prop, max_val))
|
||||
|
||||
self.train_index, self.val_index, self.train_target, self.val_target, self.train_raw, self.val_raw = \
|
||||
train_test_split(
|
||||
devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True)
|
||||
|
||||
print(
|
||||
f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}')
|
||||
|
||||
def get_word_list(self):
|
||||
def extract_word_list(word2index):
|
||||
return [w for w, i in sorted(word2index.items(), key=lambda x: x[1])]
|
||||
|
||||
word_list = extract_word_list(self.word2index)
|
||||
word_list += extract_word_list(self.out_of_vocabulary)
|
||||
return word_list
|
||||
|
||||
def compose_embedding_matrix(self, pretrained, supervised, Xtr=None, Ytr=None):
|
||||
print(f'[generating embedding matrix for lang {self.lang}]')
|
||||
|
||||
self.wce_range = None
|
||||
embedding_parts = []
|
||||
|
||||
if pretrained is not None:
|
||||
print('\t[pretrained-matrix]')
|
||||
embedding_parts.append(pretrained)
|
||||
del pretrained
|
||||
|
||||
if supervised:
|
||||
print('\t[supervised-matrix]')
|
||||
F = supervised_embeddings_tfidf(Xtr, Ytr)
|
||||
num_missing_rows = self.vocabsize - F.shape[0]
|
||||
F = np.vstack((F, np.zeros(shape=(num_missing_rows, F.shape[1]))))
|
||||
F = torch.from_numpy(F).float()
|
||||
|
||||
offset = 0
|
||||
if embedding_parts:
|
||||
offset = embedding_parts[0].shape[1]
|
||||
self.wce_range = [offset, offset + F.shape[1]]
|
||||
embedding_parts.append(F)
|
||||
|
||||
self.embedding_matrix = torch.cat(embedding_parts, dim=1)
|
||||
|
||||
print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]')
|
||||
|
||||
|
||||
def batchify_unlabelled(index_list, batchsize, pad_index, max_pad_length=500):
|
||||
nsamples = len(index_list)
|
||||
nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0)
|
||||
for b in range(nbatches):
|
||||
batch = index_list[b*batchsize:(b+1)*batchsize]
|
||||
batch = pad(batch, pad_index=pad_index, max_pad_length=max_pad_length)
|
||||
batch = torch.LongTensor(batch)
|
||||
yield batch.cuda()
|
||||
|
||||
|
||||
def clip_gradient(model, clip_value=1e-1):
|
||||
params = list(filter(lambda p: p.grad is not None, model.parameters()))
|
||||
for p in params:
|
||||
p.grad.data.clamp_(-clip_value, clip_value)
|
||||
|
||||
|
||||
def predict(logits, classification_type='multilabel'):
|
||||
if classification_type == 'multilabel':
|
||||
prediction = torch.sigmoid(logits) > 0.5
|
||||
elif classification_type == 'singlelabel':
|
||||
prediction = torch.argmax(logits, dim=1).view(-1, 1)
|
||||
else:
|
||||
raise ValueError(f'unknown classification type: {classification_type}')
|
||||
|
||||
return prediction.detach().cpu().numpy()
|
||||
|
||||
|
||||
def count_parameters(model):
|
||||
return sum(p.numel() for p in model.parameters() if p.requires_grad)
|
||||
|
||||
|
||||
def show_gpu(msg):
|
||||
def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
|
||||
"""
|
||||
ref: https://discuss.pytorch.org/t/access-gpu-memory-usage-in-pytorch/3192/4
|
||||
Index (i.e., replaces word strings with numerical indexes) a list of string documents
|
||||
:param data: list of string documents
|
||||
:param vocab: a fixed mapping [str]->[int] of words to indexes
|
||||
:param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be retained
|
||||
because they are contained in a pre-trained embedding set that is known in advance)
|
||||
:param analyzer: the preprocessor in charge of transforming the document string into a chain of string words
|
||||
:param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep
|
||||
:param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those words that
|
||||
are not in the original vocab but that are in the known_words
|
||||
:return:
|
||||
"""
|
||||
|
||||
def query(field):
|
||||
return (subprocess.check_output(
|
||||
['nvidia-smi', f'--query-gpu={field}',
|
||||
'--format=csv,nounits,noheader'],
|
||||
encoding='utf-8'))
|
||||
|
||||
def to_int(result):
|
||||
return int(result.strip().split('\n')[0])
|
||||
|
||||
used = to_int(query('memory.used'))
|
||||
total = to_int(query('memory.total'))
|
||||
pct = used / total
|
||||
print('\n' + msg, f'{100 * pct:2.1f}% ({used} out of {total})')
|
||||
indexes = []
|
||||
vocabsize = len(vocab)
|
||||
unk_count = 0
|
||||
knw_count = 0
|
||||
out_count = 0
|
||||
# pbar = tqdm(data, desc=f'indexing')
|
||||
for text in data:
|
||||
words = analyzer(text)
|
||||
index = []
|
||||
for word in words:
|
||||
if word in vocab:
|
||||
idx = vocab[word]
|
||||
else:
|
||||
if word in known_words:
|
||||
if word not in out_of_vocabulary:
|
||||
out_of_vocabulary[word] = vocabsize + len(out_of_vocabulary)
|
||||
idx = out_of_vocabulary[word]
|
||||
out_count += 1
|
||||
else:
|
||||
idx = unk_index
|
||||
unk_count += 1
|
||||
index.append(idx)
|
||||
indexes.append(index)
|
||||
knw_count += len(index)
|
||||
# pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]'
|
||||
# f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]')
|
||||
return indexes
|
||||
|
||||
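# --- Illustrative sketch (not part of the diff) of the indexing behaviour above, with a made-up vocabulary:
# words in the TF-IDF vocabulary keep their id, words covered only by the pretrained embeddings (known_words)
# get a fresh id appended after the vocabulary, and everything else collapses to the UNK id.
vocab_demo = {'the': 0, 'cat': 1, 'UNKTOKEN': 2, 'PADTOKEN': 3}
known_demo = {'the', 'cat', 'dog'}            # 'dog' appears only in the pretrained embeddings
oov_demo = {}
idx_demo = index(['the cat saw the dog'], vocab_demo, known_demo, str.split,
                 unk_index=vocab_demo['UNKTOKEN'], out_of_vocabulary=oov_demo)
# idx_demo == [[0, 1, 2, 0, 4]]   'saw' -> UNK (2); 'dog' -> new out-of-vocabulary id 4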
|
||||
class TfidfVectorizerMultilingual:
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
self.kwargs = kwargs
|
||||
|
||||
def fit(self, lX, ly=None):
|
||||
self.langs = sorted(lX.keys())
|
||||
self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs}
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs}
|
||||
|
||||
def fit_transform(self, lX, ly=None):
|
||||
return self.fit(lX, ly).transform(lX)
|
||||
|
||||
def vocabulary(self, l=None):
|
||||
if l is None:
|
||||
return {l: self.vectorizer[l].vocabulary_ for l in self.langs}
|
||||
else:
|
||||
return self.vectorizer[l].vocabulary_
|
||||
|
||||
def get_analyzer(self, l=None):
|
||||
if l is None:
|
||||
return {l: self.vectorizer[l].build_analyzer() for l in self.langs}
|
||||
else:
|
||||
return self.vectorizer[l].build_analyzer()
|
||||
def is_true(tensor, device):
    return torch.where(tensor == 1, torch.Tensor([1]).to(device), torch.Tensor([0]).to(device))


def get_learner(calibrate=False, kernel='linear', C=1):
    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=C, random_state=1, gamma='auto', verbose=False)


def is_false(tensor, device):
    return torch.where(tensor == 0, torch.Tensor([1]).to(device), torch.Tensor([0]).to(device))


def define_pad_length(index_list):
    lengths = [len(index) for index in index_list]
    return int(np.mean(lengths) + np.std(lengths))


def pad(index_list, pad_index, max_pad_length=None):
    pad_length = np.max([len(index) for index in index_list])
    if max_pad_length is not None:
        pad_length = min(pad_length, max_pad_length)
    for i, indexes in enumerate(index_list):
        index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length]
    return index_list

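
# Usage sketch (hypothetical values) for the two padding helpers above: documents
# are left-padded with a pad id and truncated to a common length.
def _example_padding():
    index_list = [[5, 7, 2], [4], [9, 9, 9, 9, 9, 9]]
    max_len = define_pad_length(index_list)             # int(mean + std) of the lengths, here 5
    padded = pad(index_list, pad_index=0, max_pad_length=max_len)
    # -> [[0, 0, 5, 7, 2], [0, 0, 0, 0, 4], [9, 9, 9, 9, 9]]
    return padded

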
def get_params(optimc=False):
@ -538,20 +466,14 @@ def get_params(optimc=False):
    return [{'kernel': [kernel], 'C': c_range, 'gamma': ['auto']}]


def get_method_name(args):
    _id = ''
    _id_conf = [args.post_embedder, args.wce_embedder, args.muse_embedder, args.bert_embedder, args.gru_embedder]
    _id_name = ['X', 'W', 'M', 'B', 'G']
    for i, conf in enumerate(_id_conf):
        if conf:
            _id += _id_name[i]
    _id = _id if not args.rnn_wce else _id + '_wce'
    _dataset_path = args.dataset.split('/')[-1].split('_')
    dataset_id = _dataset_path[0] + _dataset_path[-1]
    return _id, dataset_id
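

# Usage sketch (hypothetical argparse namespace, dataset file name made up): the
# method id concatenates one letter per active view generator, and the dataset id
# is built from the first and last '_'-separated chunks of the dataset file name.
def _example_get_method_name():
    from argparse import Namespace
    args = Namespace(post_embedder=True, wce_embedder=False, muse_embedder=True,
                     bert_embedder=False, gru_embedder=True, rnn_wce=True,
                     dataset='../datasets/rcv1-2_processed_run0.pickle')
    _id, dataset_id = get_method_name(args)
    # _id == 'XMG_wce', dataset_id == 'rcv1-2run0.pickle'
    return _id, dataset_id
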
@ -1,60 +0,0 @@
|
|||
import os
|
||||
import pandas as pd
|
||||
pd.set_option('display.max_rows', 500)
|
||||
pd.set_option('display.max_columns', 500)
|
||||
pd.set_option('display.width', 1000)
|
||||
|
||||
|
||||
class CSVLog:
|
||||
|
||||
def __init__(self, file, columns=None, autoflush=True, verbose=False, overwrite=False):
|
||||
self.file = file
|
||||
self.autoflush = autoflush
|
||||
self.verbose = verbose
|
||||
if os.path.exists(file) and not overwrite:
|
||||
self.tell('Loading existing file from {}'.format(file))
|
||||
self.df = pd.read_csv(file, sep='\t')
|
||||
self.columns = sorted(self.df.columns.values.tolist())
|
||||
else:
|
||||
self.tell('File {} does not exist or overwrite=True. Creating new frame.'.format(file))
|
||||
assert columns is not None, 'columns cannot be None'
|
||||
self.columns = sorted(columns)
|
||||
dir = os.path.dirname(self.file)
|
||||
if dir and not os.path.exists(dir): os.makedirs(dir)
|
||||
self.df = pd.DataFrame(columns=self.columns)
|
||||
self.defaults={}
|
||||
|
||||
def already_calculated(self, **kwargs):
|
||||
df = self.df
|
||||
if df.shape[0]==0:
|
||||
return False
|
||||
if len(kwargs)==0:
|
||||
kwargs = self.defaults
|
||||
for key,val in kwargs.items():
|
||||
df = df.loc[df[key]==val]
|
||||
if df.shape[0]==0: return False
|
||||
return True
|
||||
|
||||
def set_default(self, param, value):
|
||||
self.defaults[param]=value
|
||||
|
||||
def add_row(self, **kwargs):
|
||||
for key in self.defaults.keys():
|
||||
if key not in kwargs:
|
||||
kwargs[key]=self.defaults[key]
|
||||
colums = sorted(list(kwargs.keys()))
|
||||
values = [kwargs[col_i] for col_i in colums]
|
||||
s = pd.Series(values, index=self.columns)
|
||||
self.df = self.df.append(s, ignore_index=True)
|
||||
if self.autoflush: self.flush()
|
||||
# self.tell(s.to_string())
|
||||
self.tell(kwargs)
|
||||
|
||||
def flush(self):
|
||||
self.df.to_csv(self.file, index=False, sep='\t')
|
||||
|
||||
def tell(self, msg):
|
||||
if self.verbose: print(msg)
|
||||
|
||||
|
||||
|
|
@ -1,50 +0,0 @@
|
|||
from sklearn.decomposition import PCA
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
def run_pca(dim, X):
|
||||
"""
|
||||
:param dim: number of pca components to keep
|
||||
:param X: dictionary str(lang): matrix
|
||||
:return: dict lang: reduced matrix
|
||||
"""
|
||||
r = dict()
|
||||
pca = PCA(n_components=dim)
|
||||
for lang in X.keys():
|
||||
r[lang] = pca.fit_transform(X[lang])
|
||||
return r
|
||||
|
||||
|
||||
def get_optimal_dim(X, embed_type):
|
||||
"""
|
||||
:param X: dict str(lang) : csr_matrix of embeddings unsupervised or supervised
|
||||
:param embed_type: (str) embedding matrix type: S or U (WCE supervised or U unsupervised MUSE/FASTTEXT)
|
||||
:return:
|
||||
"""
|
||||
_idx = []
|
||||
|
||||
plt.figure(figsize=(15, 10))
|
||||
if embed_type == 'U':
|
||||
plt.title(f'Unsupervised Embeddings {"TODO"} Explained Variance')
|
||||
else:
|
||||
plt.title(f'WCE Explained Variance')
|
||||
plt.xlabel('Number of Components')
|
||||
plt.ylabel('Variance (%)')
|
||||
|
||||
for lang in X.keys():
|
||||
pca = PCA(n_components=X[lang].shape[1])
|
||||
pca.fit(X[lang])
|
||||
_r = pca.explained_variance_ratio_
|
||||
_r = np.cumsum(_r)
|
||||
plt.plot(_r, label=lang)
|
||||
for i in range(len(_r) - 1, 1, -1):
|
||||
delta = _r[i] - _r[i - 1]
|
||||
if delta > 0:
|
||||
_idx.append(i)
|
||||
break
|
||||
best_n = max(_idx)
|
||||
plt.axvline(best_n, color='r', label='optimal N')
|
||||
plt.legend()
|
||||
plt.show()
|
||||
return best_n
|
|
@ -1,71 +0,0 @@
|
|||
#adapted from https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py
|
||||
import torch
|
||||
from transformers import BertForSequenceClassification
|
||||
from time import time
|
||||
from util.file import create_if_not_exist
|
||||
import warnings
|
||||
|
||||
class EarlyStopping:
|
||||
|
||||
def __init__(self, model, optimizer, patience=20, verbose=True, checkpoint='./checkpoint.pt', is_bert=False):
|
||||
# set patience to 0 or -1 to avoid stopping, but still keeping track of the best value and model parameters
|
||||
self.patience_limit = patience
|
||||
self.patience = patience
|
||||
self.verbose = verbose
|
||||
self.best_score = None
|
||||
self.best_epoch = None
|
||||
self.stop_time = None
|
||||
self.checkpoint = checkpoint
|
||||
self.model = model
|
||||
self.optimizer = optimizer
|
||||
self.STOP = False
|
||||
self.is_bert = is_bert
|
||||
|
||||
def __call__(self, watch_score, epoch):
|
||||
|
||||
if self.STOP:
|
||||
return
|
||||
|
||||
if self.best_score is None or watch_score >= self.best_score:
|
||||
self.best_score = watch_score
|
||||
self.best_epoch = epoch
|
||||
self.stop_time = time()
|
||||
if self.checkpoint:
|
||||
self.print(f'[early-stop] improved, saving model in {self.checkpoint}')
|
||||
if self.is_bert:
|
||||
print(f'Serializing Huggingface model...')
|
||||
create_if_not_exist(self.checkpoint)
|
||||
self.model.save_pretrained(self.checkpoint)
|
||||
else:
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore")
|
||||
torch.save(self.model, self.checkpoint)
|
||||
# with open(self.checkpoint)
|
||||
# torch.save({'state_dict': self.model.state_dict(),
|
||||
# 'optimizer_state_dict': self.optimizer.state_dict()}, self.checkpoint)
|
||||
else:
|
||||
self.print(f'[early-stop] improved')
|
||||
self.patience = self.patience_limit
|
||||
else:
|
||||
self.patience -= 1
|
||||
if self.patience == 0:
|
||||
self.STOP = True
|
||||
self.print(f'[early-stop] patience exhausted')
|
||||
else:
|
||||
if self.patience>0: # if negative, then early-stop is ignored
|
||||
self.print(f'[early-stop] patience={self.patience}')
|
||||
|
||||
def reinit_counter(self):
|
||||
self.STOP = False
|
||||
self.patience=self.patience_limit
|
||||
|
||||
def restore_checkpoint(self):
|
||||
print(f'restoring best model from epoch {self.best_epoch}...')
|
||||
if self.is_bert:
|
||||
return BertForSequenceClassification.from_pretrained(self.checkpoint)
|
||||
else:
|
||||
return torch.load(self.checkpoint)
|
||||
|
||||
def print(self, msg):
|
||||
if self.verbose:
|
||||
print(msg)
|
|
@ -0,0 +1,104 @@
|
|||
from abc import ABC, abstractmethod
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torchtext.vocab import Vectors
|
||||
|
||||
from src.util.SIF_embed import remove_pc
|
||||
|
||||
|
||||
class PretrainedEmbeddings(ABC):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
@abstractmethod
|
||||
def vocabulary(self): pass
|
||||
|
||||
@abstractmethod
|
||||
def dim(self): pass
|
||||
|
||||
@classmethod
|
||||
def reindex(cls, words, word2index):
|
||||
if isinstance(words, dict):
|
||||
words = list(zip(*sorted(words.items(), key=lambda x: x[1])))[0]
|
||||
|
||||
source_idx, target_idx = [], []
|
||||
for i, word in enumerate(words):
|
||||
if word not in word2index:
|
||||
continue
|
||||
j = word2index[word]
|
||||
source_idx.append(i)
|
||||
target_idx.append(j)
|
||||
source_idx = np.asarray(source_idx)
|
||||
target_idx = np.asarray(target_idx)
|
||||
return source_idx, target_idx
|
||||
|
||||
|
||||
class MuseLoader:
|
||||
def __init__(self, langs, cache):
|
||||
self.langs = langs
|
||||
self.lEmbed = {}
|
||||
self.lExtracted = {}
|
||||
for lang in self.langs:
|
||||
print(f'Loading vectors for {lang}...')
|
||||
self.lEmbed[lang] = Vectors(f'wiki.multi.{lang}.vec', cache)
|
||||
|
||||
def dim(self):
|
||||
return self.lEmbed[list(self.lEmbed.keys())[0]].dim
|
||||
|
||||
def vocabulary(self):
|
||||
return {lang: set(self.lEmbed[lang].stoi.keys()) for lang in self.langs}
|
||||
|
||||
def extract(self, lVoc):
|
||||
"""
|
||||
Reindex pretrained loaded embedding in order to match indexes assigned by scikit vectorizer. Such indexes
|
||||
are consistent with those used by Word Class Embeddings (since we deploy the same vectorizer)
|
||||
:param lVoc: dict {lang : {word : id}}
|
||||
:return: torch embedding matrix of extracted embeddings i.e., words in lVoc
|
||||
"""
|
||||
for lang, words in lVoc.items():
|
||||
print(f'Extracting words for lang {lang}...')
|
||||
# words = list(zip(*sorted(lVoc[lang].items(), key=lambda x: x[1])))[0]
|
||||
source_id, target_id = PretrainedEmbeddings.reindex(words, self.lEmbed[lang].stoi)
|
||||
extraction = torch.zeros((len(words), self.dim()))
|
||||
extraction[source_id] = self.lEmbed[lang].vectors[target_id]
|
||||
self.lExtracted[lang] = extraction
|
||||
return self.lExtracted
|
||||
|
||||
def get_lEmbeddings(self):
|
||||
return {lang: self.lEmbed[lang].vectors for lang in self.langs}
|
||||
|
||||
|
||||
def XdotM(X, M, sif):
|
||||
E = X.dot(M)
|
||||
if sif:
|
||||
E = remove_pc(E, npc=1)
|
||||
return E
|
||||
|
||||
|
||||
def wce_matrix(X, Y):
|
||||
wce = supervised_embeddings_tfidf(X, Y)
|
||||
wce = zscores(wce, axis=0)
|
||||
return wce
|
||||
|
||||
|
||||
def supervised_embeddings_tfidf(X, Y):
|
||||
tfidf_norm = X.sum(axis=0)
|
||||
tfidf_norm[tfidf_norm == 0] = 1
|
||||
F = (X.T).dot(Y) / tfidf_norm.T
|
||||
return F
|
||||
|
||||
|
||||
def zscores(X, axis=0):
|
||||
"""
|
||||
scipy.stats.zscores does not avoid division by 0, which can indeed occur
|
||||
:param X:
|
||||
:param axis:
|
||||
:return:
|
||||
"""
|
||||
std = np.clip(np.std(X, ddof=1, axis=axis), 1e-5, None)
|
||||
mean = np.mean(X, axis=axis)
|
||||
return (X - mean) / std
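

# Usage sketch (hypothetical shapes and data, not part of the original module):
# X is a (docs x terms) tf-idf matrix, Y a (docs x classes) binary label matrix,
# and the WCE matrix obtained from them is used to project documents into the
# word-class embedding space via XdotM.
def _example_wce_and_projection():
    import numpy as np
    from scipy.sparse import csr_matrix

    X = csr_matrix((np.random.rand(10, 50) > 0.8), dtype=float)  # toy tf-idf-like matrix
    Y = (np.random.rand(10, 4) > 0.5).astype(float)              # toy label matrix

    wce = wce_matrix(X, Y)           # (50 x 4) word-class embedding matrix, z-scored per column
    docs = XdotM(X, wce, sif=False)  # (10 x 4) document embeddings as weighted sums of WCEs
    return docs
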
@ -1,102 +1,19 @@
|
|||
# from sklearn.externals.joblib import Parallel, delayed
|
||||
from joblib import Parallel, delayed
|
||||
from util.metrics import *
|
||||
from sklearn.metrics import f1_score
|
||||
import numpy as np
|
||||
import time
|
||||
|
||||
from src.util.metrics import *
|
||||
|
||||
|
||||
def evaluation_metrics(y, y_):
    if len(y.shape) == len(y_.shape) == 1 and len(np.unique(y)) > 2:  # single-label
        raise NotImplementedError()  # return f1_score(y, y_, average='macro'), f1_score(y, y_, average='micro')
    else:  # the metrics I implemented assume multiclass multilabel classification as binary classifiers
        return macroF1(y, y_), microF1(y, y_), macroK(y, y_), microK(y, y_)
|
||||
|
||||
|
||||
def soft_evaluation_metrics(y, y_):
|
||||
if len(y.shape)==len(y_.shape)==1 and len(np.unique(y))>2: #single-label
|
||||
raise NotImplementedError()#return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro')
|
||||
else: #the metrics I implemented assume multiclass multilabel classification as binary classifiers
|
||||
return smoothmacroF1(y, y_), smoothmicroF1(y, y_), smoothmacroK(y, y_), smoothmicroK(y, y_)
|
||||
|
||||
|
||||
def evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1):
|
||||
print('evaluation (n_jobs={})'.format(n_jobs))
|
||||
if n_jobs == 1:
|
||||
return {lang: metrics(ly_true[lang], ly_pred[lang]) for lang in ly_true.keys()}
|
||||
else:
|
||||
langs = list(ly_true.keys())
|
||||
evals = Parallel(n_jobs=n_jobs)(delayed(metrics)(ly_true[lang], ly_pred[lang]) for lang in langs)
|
||||
return {lang: evals[i] for i, lang in enumerate(langs)}
|
||||
|
||||
|
||||
def average_results(l_eval, show=True):
|
||||
metrics = []
|
||||
for lang in l_eval.keys():
|
||||
macrof1, microf1, macrok, microk = l_eval[lang]
|
||||
metrics.append([macrof1, microf1, macrok, microk])
|
||||
if show:
|
||||
print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
|
||||
|
||||
ave = np.mean(np.array(metrics), axis=0)
|
||||
if show:
|
||||
print('Averages: MF1, mF1, MK, mK', ave)
|
||||
return ave
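

# Usage sketch (hypothetical toy labels): evaluate() returns a per-language tuple
# of (macro-F1, micro-F1, macro-K, micro-K), which average_results() averages
# across languages.
def _example_evaluate_and_average():
    import numpy as np
    ly_true = {'en': np.array([[1, 0], [0, 1]]), 'it': np.array([[1, 1], [0, 0]])}
    ly_pred = {'en': np.array([[1, 0], [0, 0]]), 'it': np.array([[1, 1], [0, 0]])}

    l_eval = evaluate(ly_true, ly_pred, n_jobs=1)  # {lang: (MF1, mF1, MK, mK)}
    ave = average_results(l_eval, show=False)      # 4-element array averaged across languages
    return ave
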
|
||||
|
||||
|
||||
def evaluate_method(polylingual_method, lX, ly, predictor=None, soft=False, return_time=False):
|
||||
tinit = time.time()
|
||||
print('prediction for test')
|
||||
assert set(lX.keys()) == set(ly.keys()), 'inconsistent dictionaries in evaluate'
|
||||
n_jobs = polylingual_method.n_jobs if hasattr(polylingual_method, 'n_jobs') else -1
|
||||
|
||||
if predictor is None:
|
||||
predictor = polylingual_method.predict
|
||||
|
||||
metrics = evaluation_metrics
|
||||
if soft is True:
|
||||
metrics = soft_evaluation_metrics
|
||||
ly_ = predictor(lX, ly)
|
||||
|
||||
eval_ = evaluate(ly, ly_, metrics=metrics, n_jobs=n_jobs)
|
||||
if return_time:
|
||||
return eval_, time.time()-tinit
|
||||
else:
|
||||
return eval_
|
||||
|
||||
|
||||
def evaluate_single_lang(polylingual_method, X, y, lang, predictor=None, soft=False):
|
||||
print('prediction for test in a single language')
|
||||
if predictor is None:
|
||||
predictor = polylingual_method.predict
|
||||
|
||||
metrics = evaluation_metrics
|
||||
if soft is True:
|
||||
metrics = soft_evaluation_metrics
|
||||
|
||||
ly_ = predictor({lang:X})
|
||||
return metrics(y, ly_[lang])
|
||||
|
||||
|
||||
def get_binary_counters(polylingual_method, lX, ly, predictor=None):
|
||||
print('prediction for test')
|
||||
assert set(lX.keys()) == set(ly.keys()), 'inconsistent dictionaries in evaluate'
|
||||
n_jobs = polylingual_method.n_jobs
|
||||
if predictor is None:
|
||||
predictor = polylingual_method.predict
|
||||
ly_ = predictor(lX)
|
||||
print('evaluation (n_jobs={})'.format(n_jobs))
|
||||
if n_jobs == 1:
|
||||
return {lang: binary_counters(ly[lang], ly_[lang]) for lang in ly.keys()}
|
||||
else:
|
||||
langs = list(ly.keys())
|
||||
evals = Parallel(n_jobs=n_jobs)(delayed(binary_counters)(ly[lang], ly_[lang]) for lang in langs)
|
||||
return {lang: evals[i] for i, lang in enumerate(langs)}
|
||||
|
||||
|
||||
def binary_counters(y, y_):
|
||||
y = np.reshape(y, (-1))
|
||||
assert y.shape==y_.shape and len(y.shape)==1, 'error, binary vector expected'
|
||||
counters = hard_single_metric_statistics(y, y_)
|
||||
return counters.tp, counters.tn, counters.fp, counters.fn
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
import urllib
|
||||
from os import listdir, makedirs
|
||||
from os.path import isdir, isfile, join, exists, dirname
|
||||
#from sklearn.externals.six.moves import urllib
|
||||
import urllib
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
|
@ -14,6 +13,7 @@ def download_file(url, archive_filename):
|
|||
urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress)
|
||||
print("")
|
||||
|
||||
|
||||
def download_file_if_not_exists(url, archive_path):
|
||||
if exists(archive_path): return
|
||||
makedirs_if_not_exist(dirname(archive_path))
|
||||
|
@ -25,20 +25,26 @@ def ls(dir, typecheck):
|
|||
el.sort()
|
||||
return el
|
||||
|
||||
|
||||
def list_dirs(dir):
|
||||
return ls(dir, typecheck=isdir)
|
||||
|
||||
|
||||
def list_files(dir):
|
||||
return ls(dir, typecheck=isfile)
|
||||
|
||||
|
||||
def makedirs_if_not_exist(path):
|
||||
if not exists(path): makedirs(path)
|
||||
|
||||
|
||||
def create_if_not_exist(path):
|
||||
if not exists(path): makedirs(path)
|
||||
|
||||
|
||||
def get_parent_name(path):
|
||||
return Path(path).parent
|
||||
|
||||
|
||||
def get_file_name(path):
|
||||
return Path(path).name
|
||||
|
|
|
@ -1,24 +1,12 @@
|
|||
import numpy as np
|
||||
import numpy as np
|
||||
from scipy.sparse import lil_matrix, issparse
|
||||
from sklearn.metrics import f1_score, accuracy_score
|
||||
|
||||
|
||||
|
||||
"""
|
||||
Scikit learn provides a full set of evaluation metrics, but they treat special cases differently.
|
||||
I.e., when the number of true positives, false positives, and false negatives amount to 0, all
|
||||
affected metrics (precision, recall, and thus f1) output 0 in Scikit learn.
|
||||
We adhere to the common practice of outputting 1 in this case since the classifier has correctly
|
||||
classified all examples as negatives.
|
||||
"""
|
||||
|
||||
class ContTable:
|
||||
def __init__(self, tp=0, tn=0, fp=0, fn=0):
|
||||
self.tp=tp
|
||||
self.tn=tn
|
||||
self.fp=fp
|
||||
self.fn=fn
|
||||
self.tp = tp
|
||||
self.tn = tn
|
||||
self.fp = fp
|
||||
self.fn = fn
|
||||
|
||||
def get_d(self): return self.tp + self.tn + self.fp + self.fn
|
||||
|
||||
|
@ -57,16 +45,20 @@ class ContTable:
|
|||
def __add__(self, other):
|
||||
return ContTable(tp=self.tp + other.tp, tn=self.tn + other.tn, fp=self.fp + other.fp, fn=self.fn + other.fn)
|
||||
|
||||
|
||||
def accuracy(cell):
|
||||
return (cell.tp + cell.tn)*1.0 / (cell.tp + cell.fp + cell.fn + cell.tn)
|
||||
|
||||
|
||||
def f1(cell):
|
||||
num = 2.0 * cell.tp
|
||||
den = 2.0 * cell.tp + cell.fp + cell.fn
|
||||
if den>0: return num / den
|
||||
#we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative
|
||||
if den > 0:
|
||||
return num / den
|
||||
# we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative
|
||||
return 1.0
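

# Usage sketch (made-up counts, not part of the original module): build a
# contingency table and read off cell-based accuracy and F1.
def _example_cell_metrics():
    cell = ContTable(tp=8, tn=85, fp=4, fn=3)
    return accuracy(cell), f1(cell)  # (93/100, 16/23) = (0.93, ~0.696)
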
|
||||
|
||||
|
||||
def K(cell):
|
||||
specificity, recall = 0., 0.
|
||||
|
||||
|
@ -85,45 +77,50 @@ def K(cell):
|
|||
else:
|
||||
return specificity + recall - 1.
|
||||
|
||||
#computes the (hard) counters tp, fp, fn, and tn from a true and a predicted vector of hard decisions
|
||||
#true_labels and predicted_labels are two vectors of shape (number_documents,)
|
||||
def hard_single_metric_statistics(true_labels, predicted_labels):
|
||||
assert len(true_labels)==len(predicted_labels), "Format not consistent between true and predicted labels."
|
||||
nd = len(true_labels)
|
||||
tp = np.sum(predicted_labels[true_labels==1])
|
||||
fp = np.sum(predicted_labels[true_labels == 0])
|
||||
fn = np.sum(true_labels[predicted_labels == 0])
|
||||
tn = nd - (tp+fp+fn)
|
||||
return ContTable(tp=tp, tn=tn, fp=fp, fn=fn)
|
||||
|
||||
#computes the (soft) contingency table where tp, fp, fn, and tn are the cumulative masses for the posterioir
|
||||
# if the classifier is single class, then the prediction is a vector of shape=(nD,) which causes issues when compared
|
||||
# to the true labels (of shape=(nD,1)). This method increases the dimensions of the predictions.
|
||||
def __check_consistency_and_adapt(true_labels, predictions):
|
||||
if predictions.ndim == 1:
|
||||
return __check_consistency_and_adapt(true_labels, np.expand_dims(predictions, axis=1))
|
||||
if true_labels.ndim == 1:
|
||||
return __check_consistency_and_adapt(np.expand_dims(true_labels, axis=1), predictions)
|
||||
if true_labels.shape != predictions.shape:
|
||||
raise ValueError("True and predicted label matrices shapes are inconsistent %s %s."
|
||||
% (true_labels.shape, predictions.shape))
|
||||
_, nC = true_labels.shape
|
||||
return true_labels, predictions, nC
|
||||
|
||||
|
||||
# computes the (soft) contingency table where tp, fp, fn, and tn are the cumulative masses for the posterior
# probabilities with respect to the true binary labels
|
||||
#true_labels and posterior_probabilities are two vectors of shape (number_documents,)
|
||||
# true_labels and posterior_probabilities are two vectors of shape (number_documents,)
|
||||
def soft_single_metric_statistics(true_labels, posterior_probabilities):
|
||||
assert len(true_labels)==len(posterior_probabilities), "Format not consistent between true and predicted labels."
|
||||
assert len(true_labels) == len(posterior_probabilities), "Format not consistent between true and predicted labels."
|
||||
tp = np.sum(posterior_probabilities[true_labels == 1])
|
||||
fn = np.sum(1. - posterior_probabilities[true_labels == 1])
|
||||
fp = np.sum(posterior_probabilities[true_labels == 0])
|
||||
tn = np.sum(1. - posterior_probabilities[true_labels == 0])
|
||||
return ContTable(tp=tp, tn=tn, fp=fp, fn=fn)
|
||||
|
||||
#if the classifier is single class, then the prediction is a vector of shape=(nD,) which causes issues when compared
|
||||
#to the true labels (of shape=(nD,1)). This method increases the dimensions of the predictions.
|
||||
def __check_consistency_and_adapt(true_labels, predictions):
|
||||
if predictions.ndim == 1:
|
||||
return __check_consistency_and_adapt(true_labels, np.expand_dims(predictions, axis=1))
|
||||
if true_labels.ndim == 1:
|
||||
return __check_consistency_and_adapt(np.expand_dims(true_labels, axis=1),predictions)
|
||||
if true_labels.shape != predictions.shape:
|
||||
raise ValueError("True and predicted label matrices shapes are inconsistent %s %s."
|
||||
% (true_labels.shape, predictions.shape))
|
||||
_,nC = true_labels.shape
|
||||
return true_labels, predictions, nC
|
||||
|
||||
# computes the (hard) counters tp, fp, fn, and tn from a true and a predicted vector of hard decisions
|
||||
# true_labels and predicted_labels are two vectors of shape (number_documents,)
|
||||
def hard_single_metric_statistics(true_labels, predicted_labels):
|
||||
assert len(true_labels) == len(predicted_labels), "Format not consistent between true and predicted labels."
|
||||
nd = len(true_labels)
|
||||
tp = np.sum(predicted_labels[true_labels == 1])
|
||||
fp = np.sum(predicted_labels[true_labels == 0])
|
||||
fn = np.sum(true_labels[predicted_labels == 0])
|
||||
tn = nd - (tp+fp+fn)
|
||||
return ContTable(tp=tp, tn=tn, fp=fp, fn=fn)
|
||||
|
||||
|
||||
def macro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics):
|
||||
true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels)
|
||||
return np.mean([metric(metric_statistics(true_labels[:, c], predicted_labels[:, c])) for c in range(nC)])
|
||||
|
||||
|
||||
def micro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics):
|
||||
true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels)
|
||||
|
||||
|
@ -134,123 +131,22 @@ def micro_average(true_labels, predicted_labels, metric, metric_statistics=hard_
|
|||
|
||||
return metric(accum)
|
||||


# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def macroF1(true_labels, predicted_labels):
    return macro_average(true_labels, predicted_labels, f1)


# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def microF1(true_labels, predicted_labels):
    return micro_average(true_labels, predicted_labels, f1)


# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def macroK(true_labels, predicted_labels):
    return macro_average(true_labels, predicted_labels, K)


# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def microK(true_labels, predicted_labels):
    return micro_average(true_labels, predicted_labels, K)


# true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix
# of the same shape containing real values in [0, 1]
def smoothmacroF1(true_labels, posterior_probabilities):
    return macro_average(true_labels, posterior_probabilities, f1, metric_statistics=soft_single_metric_statistics)


# true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix
# of the same shape containing real values in [0, 1]
def smoothmicroF1(true_labels, posterior_probabilities):
    return micro_average(true_labels, posterior_probabilities, f1, metric_statistics=soft_single_metric_statistics)


# true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix
# of the same shape containing real values in [0, 1]
def smoothmacroK(true_labels, posterior_probabilities):
    return macro_average(true_labels, posterior_probabilities, K, metric_statistics=soft_single_metric_statistics)


# true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix
# of the same shape containing real values in [0, 1]
def smoothmicroK(true_labels, posterior_probabilities):
    return micro_average(true_labels, posterior_probabilities, K, metric_statistics=soft_single_metric_statistics)
|
||||
|
||||
|
||||
|
||||
|
||||
"""
|
||||
Scikit learn provides a full set of evaluation metrics, but they treat special cases differently.
|
||||
I.e., when the number of true positives, false positives, and false negatives amount to 0, all
|
||||
affected metrics (precision, recall, and thus f1) output 0 in Scikit learn.
|
||||
We adhere to the common practice of outputting 1 in this case since the classifier has correctly
|
||||
classified all examples as negatives.
|
||||
"""
|
||||
|
||||
def evaluation(y_true, y_pred, classification_type):
|
||||
|
||||
if classification_type == 'multilabel':
|
||||
eval_function = multilabel_eval
|
||||
elif classification_type == 'singlelabel':
|
||||
eval_function = singlelabel_eval
|
||||
|
||||
Mf1, mf1, accuracy = eval_function(y_true, y_pred)
|
||||
|
||||
return Mf1, mf1, accuracy
|
||||
|
||||
|
||||
def multilabel_eval(y, y_):
|
||||
|
||||
tp = y.multiply(y_)
|
||||
|
||||
fn = lil_matrix(y.shape)
|
||||
true_ones = y==1
|
||||
fn[true_ones]=1-tp[true_ones]
|
||||
|
||||
fp = lil_matrix(y.shape)
|
||||
pred_ones = y_==1
|
||||
if pred_ones.nnz>0:
|
||||
fp[pred_ones]=1-tp[pred_ones]
|
||||
|
||||
#macro-f1
|
||||
tp_macro = np.asarray(tp.sum(axis=0), dtype=int).flatten()
|
||||
fn_macro = np.asarray(fn.sum(axis=0), dtype=int).flatten()
|
||||
fp_macro = np.asarray(fp.sum(axis=0), dtype=int).flatten()
|
||||
|
||||
pos_pred = tp_macro+fp_macro
|
||||
pos_true = tp_macro+fn_macro
|
||||
prec=np.zeros(shape=tp_macro.shape,dtype=float)
|
||||
rec=np.zeros(shape=tp_macro.shape,dtype=float)
|
||||
np.divide(tp_macro, pos_pred, out=prec, where=pos_pred>0)
|
||||
np.divide(tp_macro, pos_true, out=rec, where=pos_true>0)
|
||||
den=prec+rec
|
||||
|
||||
macrof1=np.zeros(shape=tp_macro.shape,dtype=float)
|
||||
np.divide(np.multiply(prec,rec),den,out=macrof1,where=den>0)
|
||||
macrof1 *=2
|
||||
|
||||
macrof1[(pos_pred==0)*(pos_true==0)]=1
|
||||
macrof1 = np.mean(macrof1)
|
||||
|
||||
#micro-f1
|
||||
tp_micro = tp_macro.sum()
|
||||
fn_micro = fn_macro.sum()
|
||||
fp_micro = fp_macro.sum()
|
||||
pos_pred = tp_micro + fp_micro
|
||||
pos_true = tp_micro + fn_micro
|
||||
prec = (tp_micro / pos_pred) if pos_pred>0 else 0
|
||||
rec = (tp_micro / pos_true) if pos_true>0 else 0
|
||||
den = prec+rec
|
||||
microf1 = 2*prec*rec/den if den>0 else 0
|
||||
if pos_pred==pos_true==0:
|
||||
microf1=1
|
||||
|
||||
#accuracy
|
||||
ndecisions = np.multiply(*y.shape)
|
||||
tn = ndecisions - (tp_micro+fn_micro+fp_micro)
|
||||
acc = (tp_micro+tn)/ndecisions
|
||||
|
||||
return macrof1,microf1,acc
|
||||
|
||||
|
||||
def singlelabel_eval(y, y_):
|
||||
if issparse(y_): y_ = y_.toarray().flatten()
|
||||
macrof1 = f1_score(y, y_, average='macro')
|
||||
microf1 = f1_score(y, y_, average='micro')
|
||||
acc = accuracy_score(y, y_)
|
||||
return macrof1,microf1,acc
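

# Usage sketch (hypothetical toy labels): multilabel_eval expects sparse binary
# matrices, so the inputs are wrapped in csr_matrix here.
def _example_evaluation():
    import numpy as np
    from scipy.sparse import csr_matrix
    y_true = csr_matrix(np.array([[1, 0, 1], [0, 1, 0]]))
    y_pred = csr_matrix(np.array([[1, 0, 0], [0, 1, 0]]))
    Mf1, mf1, acc = evaluation(y_true, y_pred, classification_type='multilabel')
    return Mf1, mf1, acc
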
|
||||
|
||||
|
|
|
@ -1,91 +0,0 @@
|
|||
from optparse import OptionParser
|
||||
|
||||
parser = OptionParser(usage="usage: %prog datapath [options]")
|
||||
|
||||
parser.add_option("-d", dest='dataset', type=str, metavar='datasetpath', help=f'path to the pickled dataset')
|
||||
|
||||
parser.add_option("-o", "--output", dest="output",
|
||||
help="Result file", type=str, default='../log/multiModal_log.csv')
|
||||
|
||||
parser.add_option("-X", "--posteriors", dest="posteriors", action='store_true',
|
||||
help="Add posterior probabilities to the document embedding representation", default=False)
|
||||
|
||||
parser.add_option("-W", "--supervised", dest="supervised", action='store_true',
|
||||
help="Add supervised (Word-Class Embeddings) to the document embedding representation", default=False)
|
||||
|
||||
parser.add_option("-M", "--pretrained", dest="pretrained", action='store_true',
|
||||
help="Add pretrained MUSE embeddings to the document embedding representation", default=False)
|
||||
|
||||
parser.add_option("-B", "--mbert", dest="mbert", action='store_true',
|
||||
help="Add multilingual Bert (mBert) document embedding representation", default=False)
|
||||
|
||||
parser.add_option('-G', dest='gruViewGenerator', action='store_true',
|
||||
help="Add document embedding generated via recurrent net (GRU)", default=False)
|
||||
|
||||
parser.add_option("--l2", dest="l2", action='store_true',
|
||||
help="Activates l2 normalization as a post-processing for the document embedding views",
|
||||
default=False)
|
||||
|
||||
parser.add_option("--allprob", dest="allprob", action='store_true',
|
||||
help="All views are generated as posterior probabilities. This affects the supervised and pretrained"
|
||||
"embeddings, for which a calibrated classifier is generated, which generates the posteriors",
|
||||
default=False)
|
||||
|
||||
parser.add_option("--feat-weight", dest="feat_weight",
|
||||
help="Term weighting function to weight the averaged embeddings", type=str, default='tfidf')
|
||||
|
||||
parser.add_option("-w", "--we-path", dest="we_path",
|
||||
help="Path to the MUSE polylingual word embeddings", default='../embeddings')
|
||||
|
||||
parser.add_option("-s", "--set_c", dest="set_c", type=float,
|
||||
help="Set the C parameter", default=1)
|
||||
|
||||
parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
|
||||
help="Optimize hyperparameters", default=False)
|
||||
|
||||
parser.add_option("-j", "--n_jobs", dest="n_jobs", type=int,
|
||||
help="Number of parallel jobs (default is -1, all)", default=-1)
|
||||
|
||||
parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
|
||||
help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
|
||||
default=300)
|
||||
|
||||
parser.add_option("-r", "--remove-pc", dest="sif", action='store_true',
|
||||
help="Remove common component when computing dot product of word embedding matrices", default=False)
|
||||
|
||||
parser.add_option("-z", "--zscore", dest="zscore", action='store_true',
|
||||
help="Z-score normalize matrices (WCE and MUSE)", default=False)
|
||||
|
||||
parser.add_option("-a", "--agg", dest="agg", action='store_true',
|
||||
help="Set aggregation function of the common Z-space to average (Default: concatenation)",
|
||||
default=False)
|
||||
|
||||
# ------------------------------------------------------------------------------------
|
||||
|
||||
parser.add_option('--hidden', type=int, default=512, metavar='int',
|
||||
help='hidden lstm size (default: 512)')
|
||||
|
||||
parser.add_option('--sup-drop', type=float, default=0.5, metavar='[0.0, 1.0]',
|
||||
help='dropout probability for the supervised matrix (default: 0.5)')
|
||||
|
||||
parser.add_option('--tunable', action='store_true', default=False,
|
||||
help='pretrained embeddings are tunable from the beginning (default False, i.e., static)')
|
||||
|
||||
parser.add_option('--logfile_gru', dest='logfile_gru', default='../log/log_gru_viewgenerator.csv')
|
||||
|
||||
parser.add_option('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)')
|
||||
|
||||
parser.add_option('--force', action='store_true', default=False,
|
||||
help='do not check if this experiment has already been run')
|
||||
|
||||
parser.add_option('--gruMuse', dest='gruMUSE', action='store_true', default=False,
|
||||
help='Deploy MUSE embedding as embedding layer of the GRU View Generator')
|
||||
|
||||
parser.add_option('--gruWce', dest='gruWCE', action='store_true', default=False,
|
||||
help='Deploy WCE embedding as embedding layer of the GRU View Generator')
|
||||
|
||||
parser.add_option('--gru-path', dest='gru_path', default=None,
|
||||
help='Set the path to a pretrained GRU model (aka, -G view generator)')
|
||||
|
||||
parser.add_option('--bert-path', dest='bert_path', default=None,
|
||||
help='Set the path to a pretrained mBERT model (aka, -B view generator)')
|
|
@ -0,0 +1,141 @@
|
|||
import torch
|
||||
from pytorch_lightning.metrics import Metric
|
||||
|
||||
from src.util.common import is_false, is_true
|
||||
|
||||
|
||||
def _update(pred, target, device):
|
||||
assert pred.shape == target.shape
|
||||
# preparing preds and targets for count
|
||||
true_pred = is_true(pred, device)
|
||||
false_pred = is_false(pred, device)
|
||||
true_target = is_true(target, device)
|
||||
false_target = is_false(target, device)
|
||||
|
||||
tp = torch.sum(true_pred * true_target, dim=0)
|
||||
tn = torch.sum(false_pred * false_target, dim=0)
|
||||
fp = torch.sum(true_pred * false_target, dim=0)
|
||||
fn = torch.sum(false_pred * true_target, dim=0)
|
||||
return tp, tn, fp, fn
|
||||
|
||||
|
||||
class CustomF1(Metric):
|
||||
def __init__(self, num_classes, device, average='micro'):
|
||||
"""
|
||||
Custom F1 metric.
|
||||
Scikit learn provides a full set of evaluation metrics, but they treat special cases differently.
|
||||
I.e., when the number of true positives, false positives, and false negatives amount to 0, all
|
||||
affected metrics (precision, recall, and thus f1) output 0 in Scikit learn.
|
||||
We adhere to the common practice of outputting 1 in this case since the classifier has correctly
|
||||
classified all examples as negatives.
|
||||
:param num_classes:
|
||||
:param device:
|
||||
:param average:
|
||||
"""
|
||||
super().__init__()
|
||||
self.num_classes = num_classes
|
||||
self.average = average
|
||||
self.device = 'cuda' if device else 'cpu'
|
||||
self.add_state('true_positive', default=torch.zeros(self.num_classes))
|
||||
self.add_state('true_negative', default=torch.zeros(self.num_classes))
|
||||
self.add_state('false_positive', default=torch.zeros(self.num_classes))
|
||||
self.add_state('false_negative', default=torch.zeros(self.num_classes))
|
||||
|
||||
def update(self, preds, target):
|
||||
true_positive, true_negative, false_positive, false_negative = _update(preds, target, self.device)
|
||||
|
||||
self.true_positive += true_positive
|
||||
self.true_negative += true_negative
|
||||
self.false_positive += false_positive
|
||||
self.false_negative += false_negative
|
||||
|
||||
def compute(self):
|
||||
if self.average == 'micro':
|
||||
num = 2.0 * self.true_positive.sum()
|
||||
den = 2.0 * self.true_positive.sum() + self.false_positive.sum() + self.false_negative.sum()
|
||||
if den > 0:
|
||||
return (num / den).to(self.device)
|
||||
return torch.FloatTensor([1.]).to(self.device)
|
||||
if self.average == 'macro':
|
||||
class_specific = []
|
||||
for i in range(self.num_classes):
|
||||
class_tp = self.true_positive[i]
|
||||
class_tn = self.true_negative[i]
|
||||
class_fp = self.false_positive[i]
|
||||
class_fn = self.false_negative[i]
|
||||
num = 2.0 * class_tp
|
||||
den = 2.0 * class_tp + class_fp + class_fn
|
||||
if den > 0:
|
||||
class_specific.append(num / den)
|
||||
else:
|
||||
class_specific.append(1.)
|
||||
average = torch.sum(torch.Tensor(class_specific))/self.num_classes
|
||||
return average.to(self.device)
|
||||
|
||||
|
||||
class CustomK(Metric):
|
||||
def __init__(self, num_classes, device, average='micro'):
|
||||
"""
|
||||
K metric. https://dl.acm.org/doi/10.1145/2808194.2809449
|
||||
:param num_classes:
|
||||
:param device:
|
||||
:param average:
|
||||
"""
|
||||
super().__init__()
|
||||
self.num_classes = num_classes
|
||||
self.average = average
|
||||
self.device = 'cuda' if device else 'cpu'
|
||||
self.add_state('true_positive', default=torch.zeros(self.num_classes))
|
||||
self.add_state('true_negative', default=torch.zeros(self.num_classes))
|
||||
self.add_state('false_positive', default=torch.zeros(self.num_classes))
|
||||
self.add_state('false_negative', default=torch.zeros(self.num_classes))
|
||||
|
||||
def update(self, preds, target):
|
||||
true_positive, true_negative, false_positive, false_negative = _update(preds, target, self.device)
|
||||
|
||||
self.true_positive += true_positive
|
||||
self.true_negative += true_negative
|
||||
self.false_positive += false_positive
|
||||
self.false_negative += false_negative
|
||||
|
||||
def compute(self):
|
||||
if self.average == 'micro':
|
||||
specificity, recall = 0., 0.
|
||||
absolute_negatives = self.true_negative.sum() + self.false_positive.sum()
|
||||
if absolute_negatives != 0:
|
||||
specificity = self.true_negative.sum()/absolute_negatives
|
||||
absolute_positives = self.true_positive.sum() + self.false_negative.sum()
|
||||
if absolute_positives != 0:
|
||||
recall = self.true_positive.sum()/absolute_positives
|
||||
|
||||
if absolute_positives == 0:
|
||||
return 2. * specificity - 1
|
||||
elif absolute_negatives == 0:
|
||||
return 2. * recall - 1
|
||||
else:
|
||||
return specificity + recall - 1
|
||||
|
||||
if self.average == 'macro':
|
||||
class_specific = []
|
||||
for i in range(self.num_classes):
|
||||
class_tp = self.true_positive[i]
|
||||
class_tn = self.true_negative[i]
|
||||
class_fp = self.false_positive[i]
|
||||
class_fn = self.false_negative[i]
|
||||
|
||||
specificity, recall = 0., 0.
|
||||
absolute_negatives = class_tn + class_fp
|
||||
if absolute_negatives != 0:
|
||||
specificity = class_tn / absolute_negatives
|
||||
absolute_positives = class_tp + class_fn
|
||||
if absolute_positives != 0:
|
||||
recall = class_tp / absolute_positives
|
||||
|
||||
if absolute_positives == 0:
|
||||
class_specific.append(2. * specificity - 1)
|
||||
elif absolute_negatives == 0:
|
||||
class_specific.append(2. * recall - 1)
|
||||
else:
|
||||
class_specific.append(specificity + recall - 1)
|
||||
average = torch.sum(torch.Tensor(class_specific)) / self.num_classes
|
||||
return average.to(self.device)
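

# Usage sketch (toy tensors, not part of the original module): the metrics are
# updated with hard 0/1 predictions and targets; the `device` argument is the
# boolean flag used by the constructors (True -> cuda, False -> cpu).
def _example_custom_metrics():
    import torch
    preds = torch.tensor([[1., 0., 1.], [0., 1., 0.]])
    target = torch.tensor([[1., 0., 0.], [0., 1., 0.]])

    microf1 = CustomF1(num_classes=3, device=False, average='micro')
    macrok = CustomK(num_classes=3, device=False, average='macro')
    microf1.update(preds, target)
    macrok.update(preds, target)
    return microf1.compute(), macrok.compute()  # micro-F1 here is 4/5 = 0.8
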
|
|
@ -1,21 +1,21 @@
|
|||
import os
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
class PolylingualClassificationResults:
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class CSVlog:
|
||||
def __init__(self, file, autoflush=True, verbose=False):
|
||||
self.file = file
|
||||
self.columns = ['method',
|
||||
'learner',
|
||||
'optimp',
|
||||
'setting',
|
||||
'optimc',
|
||||
'sif',
|
||||
'zscore',
|
||||
'l2',
|
||||
'wescaler',
|
||||
'pca',
|
||||
'id',
|
||||
'dataset',
|
||||
'time',
|
||||
'time_tr',
|
||||
'time_te',
|
||||
'lang',
|
||||
'macrof1',
|
||||
'microf1',
|
||||
|
@ -36,8 +36,11 @@ class PolylingualClassificationResults:
|
|||
def already_calculated(self, id):
|
||||
return (self.df['id'] == id).any()
|
||||
|
||||
def add_row(self, method, learner, optimp, sif, zscore, l2, wescaler, pca, id, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
|
||||
s = pd.Series([method, learner, optimp,sif, zscore, l2, wescaler, pca, id, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
|
||||
def add_row(self, method, setting, optimc, sif, zscore, l2, dataset, time_tr, time_te, lang,
|
||||
macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
|
||||
s = pd.Series([method, setting, optimc, sif, zscore, l2, dataset, time_tr, time_te, lang,
|
||||
macrof1, microf1, macrok, microk, notes],
|
||||
index=self.columns)
|
||||
self.df = self.df.append(s, ignore_index=True)
|
||||
if self.autoflush: self.flush()
|
||||
self.tell(s.to_string())
|
||||
|
@ -46,4 +49,5 @@ class PolylingualClassificationResults:
|
|||
self.df.to_csv(self.file, index=False, sep='\t')
|
||||
|
||||
def tell(self, msg):
|
||||
if self.verbose: print(msg)
|
||||
if self.verbose:
|
||||
print(msg)
|
|
@ -1,15 +1,20 @@
|
|||
import numpy as np
|
||||
|
||||
class StandardizeTransformer:
|
||||
|
||||
class StandardizeTransformer:
|
||||
def __init__(self, axis=0, range=None):
|
||||
"""
|
||||
|
||||
:param axis:
|
||||
:param range:
|
||||
"""
|
||||
assert range is None or isinstance(range, slice), 'wrong format for range, should either be None or a slice'
|
||||
self.axis = axis
|
||||
self.yetfit = False
|
||||
self.range = range
|
||||
|
||||
def fit(self, X):
|
||||
print('fitting Standardizer...')
|
||||
print('Applying z-score standardization...')
|
||||
std=np.std(X, axis=self.axis, ddof=1)
|
||||
self.std = np.clip(std, 1e-5, None)
|
||||
self.mean = np.mean(X, axis=self.axis)
|
||||
|
@ -28,4 +33,4 @@ class StandardizeTransformer:
|
|||
return (X - self.mean) / self.std
|
||||
|
||||
def fit_transform(self, X):
|
||||
return self.fit(X).transform(X)
|
||||
return self.fit(X).transform(X)
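

# Usage sketch (random toy matrices): fit the standardizer on training data and
# reuse the training statistics to standardize the test data.
def _example_standardize():
    import numpy as np
    Xtr = np.random.rand(100, 8)
    Xte = np.random.rand(20, 8)
    standardizer = StandardizeTransformer(axis=0)
    Ztr = standardizer.fit_transform(Xtr)  # ~zero mean, ~unit variance per column
    Zte = standardizer.transform(Xte)      # standardized with the training statistics
    return Ztr, Zte
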
|
|
@ -1,29 +0,0 @@
|
|||
from sklearn.svm import SVC
|
||||
from tqdm import tqdm
|
||||
import re
|
||||
import sys
|
||||
|
||||
|
||||
def mask_numbers(data, number_mask='numbermask'):
|
||||
mask = re.compile(r'\b[0-9][0-9.,-]*\b')
|
||||
masked = []
|
||||
for text in tqdm(data, desc='masking numbers'):
|
||||
masked.append(mask.sub(number_mask, text))
|
||||
return masked
|
||||
|
||||
|
||||
def fill_missing_classes(lXtr, lytr):
|
||||
pass
|
||||
|
||||
|
||||
def get_learner(calibrate=False, kernel='linear'):
|
||||
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto')
|
||||
|
||||
|
||||
def get_params(dense=False):
|
||||
if not op.optimc:
|
||||
return None
|
||||
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
|
||||
kernel = 'rbf' if dense else 'linear'
|
||||
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
|
||||
|
|
@ -1,110 +0,0 @@
|
|||
import numpy as np
|
||||
import sklearn
|
||||
# from sklearn.externals.joblib import Parallel, delayed
|
||||
from joblib import Parallel, delayed
|
||||
|
||||
class ESA(object):
|
||||
"""
|
||||
Implementation of Explicit Semantic Analysis (ESA) in its mono-lingual version, as a transformer
|
||||
"""
|
||||
supported_similarity = ['dot', 'cosine']
|
||||
|
||||
def __init__(self, similarity='dot', centered=False, post=None):
|
||||
"""
|
||||
:param similarity: the similarity measure between documents to be used
|
||||
:param centered: set to True to subtract the expected similarity due to randomness (experimental)
|
||||
:param post: any valid sklearn normalization method to be applied to the resulting doc embeddings, or None (default)
|
||||
"""
|
||||
assert similarity in self.supported_similarity, ("Similarity method %s is not supported" % similarity)
|
||||
self.similarity = similarity
|
||||
self.centered = centered
|
||||
self.post_processing = post
|
||||
self.W = None
|
||||
|
||||
def fit(self, W):
|
||||
"""
|
||||
:param W: doc-by-term already processed matrix of wikipedia documents
|
||||
:return: self
|
||||
"""
|
||||
self.W = W
|
||||
return self
|
||||
|
||||
def transform(self, X):
|
||||
"""
|
||||
:param X: doc-by-term matrix that is to be transformed into the ESA space.
|
||||
:return: the matrix X transformed into the ESA space in numpy format
|
||||
"""
|
||||
assert self.W is not None, 'transform method called before fit'
|
||||
|
||||
W = self.W
|
||||
assert X.shape[1] == W.shape[1], ('the feature spaces for X=%s and W=%s do not agree' % (str(X.shape), str(W.shape)))
|
||||
|
||||
if self.similarity in ['dot', 'cosine']:
|
||||
if self.similarity == 'cosine':
|
||||
X = sklearn.preprocessing.normalize(X, norm='l2', axis=1, copy=True)
|
||||
W = sklearn.preprocessing.normalize(W, norm='l2', axis=1, copy=True)
|
||||
|
||||
esa = (X.dot(W.T)).toarray()
|
||||
if self.centered:
|
||||
pX = (X > 0).sum(1) / float(X.shape[1])
|
||||
pW = (W > 0).sum(1) / float(W.shape[1])
|
||||
pXpW = np.sqrt(pX.dot(pW.transpose()))
|
||||
esa = esa - pXpW
|
||||
|
||||
if self.post_processing:
|
||||
esa = sklearn.preprocessing.normalize(esa, norm=self.post_processing, axis=1, copy=True)
|
||||
|
||||
return esa
|
||||
|
||||
def fit_transform(self, W, X, Y=None):
|
||||
self.fit(W)
|
||||
return self.transform(X)
|
||||
|
||||
def dimensionality(self):
|
||||
return self.W.shape[0]
|
||||
|
||||
|
||||
|
||||
class CLESA(ESA):
|
||||
"""
|
||||
Implementation of Cross-Lingual Explicit Semantic Analysis (CL-ESA) as a transformer
|
||||
"""
|
||||
|
||||
def __init__(self, similarity='dot', centered=False, post=False, n_jobs=-1):
|
||||
super(CLESA, self).__init__(similarity, centered, post)
|
||||
self.lESA = None
|
||||
self.langs = None
|
||||
self.n_jobs = n_jobs
|
||||
|
||||
def fit(self, lW):
|
||||
"""
|
||||
:param lW: a dictionary of {language: doc-by-term wiki matrix}
|
||||
:return: self
|
||||
"""
|
||||
assert len(np.unique([W.shape[0] for W in lW.values()])) == 1, "inconsistent dimensions across languages"
|
||||
|
||||
self.dimensions = list(lW.values())[0].shape[0]
|
||||
self.langs = list(lW.keys())
|
||||
self.lESA = {lang:ESA(self.similarity, self.centered, self.post_processing).fit(lW[lang]) for lang in self.langs}
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
"""
|
||||
:param lX: dictionary of {language : doc-by-term matrix} that is to be transformed into the CL-ESA space
|
||||
:return: a dictionary {language : doc-by-dim matrix} containing the matrix-transformed versions
|
||||
"""
|
||||
assert self.lESA is not None, 'transform method called before fit'
|
||||
assert set(lX.keys()).issubset(set(self.langs)), 'languages in lX are not in scope'
|
||||
langs = list(lX.keys())
|
||||
trans = Parallel(n_jobs=self.n_jobs)(delayed(self.lESA[lang].transform)(lX[lang]) for lang in langs)
|
||||
return {lang:trans[i] for i,lang in enumerate(langs)}
|
||||
|
||||
def fit_transform(self, lW, lX):
|
||||
return self.fit(lW).transform(lX)
|
||||
|
||||
def languages(self):
|
||||
return list(self.lESA.keys())
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,154 +0,0 @@
|
|||
import numpy as np
|
||||
from sklearn.preprocessing import normalize
|
||||
from scipy.sparse import csr_matrix, issparse
|
||||
from scipy.spatial.distance import cosine
|
||||
import operator
|
||||
import functools
|
||||
import math, sys
|
||||
# from sklearn.externals.joblib import Parallel, delayed
|
||||
from joblib import Parallel, delayed
|
||||
|
||||
|
||||
class DistributionalCorrespondenceIndexing:
|
||||
|
||||
prob_dcf = ['linear', 'pmi']
|
||||
vect_dcf = ['cosine']
|
||||
valid_dcf = prob_dcf + vect_dcf
|
||||
valid_post = ['normal', 'l2', None]
|
||||
|
||||
def __init__(self, dcf='cosine', post='normal', n_jobs=-1):
|
||||
"""
|
||||
:param dcf: a distributional correspondence function name (e.g., 'cosine') or a callable f(u,v) which measures
|
||||
the distributional correspondence between vectors u and v
|
||||
:param post: post-processing function to apply to document embeddings. Default is to standardize it into a
|
||||
normal distribution; other functions allowed are 'l2' or None
|
||||
"""
|
||||
if post not in self.valid_post:
|
||||
raise ValueError("unknown post processing function; valid ones are [%s]" % ', '.join(self.valid_post))
|
||||
|
||||
if isinstance(dcf, str):
|
||||
if dcf not in self.valid_dcf:
|
||||
raise ValueError("unknown dcf; use any in [%s]" % ', '.join(self.valid_dcf))
|
||||
self.dcf = getattr(DistributionalCorrespondenceIndexing, dcf)
|
||||
elif hasattr(dcf, '__call__'):
|
||||
self.dcf = dcf
|
||||
else:
|
||||
raise ValueError('param dcf should either be a valid dcf name in [%s] or a callable comparing two vectors')
|
||||
#self.dcf = lambda u,v:dcf(u,v)
|
||||
self.post = post
|
||||
self.domains = None
|
||||
self.dFP = None
|
||||
self.n_jobs = n_jobs
|
||||
|
||||
def fit(self, dU, dP):
|
||||
"""
|
||||
:param dU: a dictionary of {domain:dsm_matrix}, where dsm is a document-by-term matrix representing the
|
||||
distributional semantic model for a specific domain
|
||||
:param dP: a dictionary {domain:pivot_matrix} where domain is a string representing each domain,
|
||||
and pivot_matrix has shape (d,p) with d the dimensionality of the distributional space, and p the
|
||||
number of pivots
|
||||
:return: self
|
||||
"""
|
||||
self.domains = list(dP.keys())
|
||||
assert len(np.unique([P.shape[1] for P in dP.values()]))==1, "inconsistent number of pivots across domains"
|
||||
assert set(dU.keys())==set(self.domains), "inconsistent domains in dU and dP"
|
||||
assert not [1 for d in self.domains if dU[d].shape[0]!=dP[d].shape[0]], \
|
||||
"inconsistent dimensions between distributional and pivot spaces"
|
||||
self.dimensions = list(dP.values())[0].shape[1]
|
||||
# embed the feature space from each domain using the pivots of that domain
|
||||
#self.dFP = {d:self.dcf_dist(dU[d].transpose(), dP[d].transpose()) for d in self.domains}
|
||||
transformations = Parallel(n_jobs=self.n_jobs)(delayed(self.dcf_dist)(dU[d].transpose(),dP[d].transpose()) for d in self.domains)
|
||||
self.dFP = {d: transformations[i] for i, d in enumerate(self.domains)}
|
||||
|
||||
def _dom_transform(self, X, FP):
|
||||
_X = X.dot(FP)
|
||||
if self.post == 'l2':
|
||||
_X = normalize(_X, norm='l2', axis=1)
|
||||
elif self.post == 'normal':
|
||||
std = np.clip(np.std(_X, axis=0), 1e-5, None)
|
||||
_X = (_X - np.mean(_X, axis=0)) / std
|
||||
return _X
|
||||
|
||||
# dX is a dictionary of {domain:dsm}, where dsm (distributional semantic model) is, e.g., a document-by-term csr_matrix
|
||||
def transform(self, dX):
|
||||
assert self.dFP is not None, 'transform method called before fit'
|
||||
assert set(dX.keys()).issubset(self.domains), 'domains in dX are not in scope'
|
||||
domains = list(dX.keys())
|
||||
transformations = Parallel(n_jobs=self.n_jobs)(delayed(self._dom_transform)(dX[d], self.dFP[d]) for d in domains)
|
||||
return {d: transformations[i] for i, d in enumerate(domains)}
|
||||
|
||||
def fit_transform(self, dU, dP, dX):
|
||||
return self.fit(dU, dP).transform(dX)
|
||||
|
||||
def _prevalence(self, v):
|
||||
if issparse(v):
|
||||
return float(v.nnz) / functools.reduce(operator.mul, v.shape, 1) #this works for arrays of any rank
|
||||
elif isinstance(v, np.ndarray):
|
||||
return float(v[v>0].size) / v.size
|
||||
|
||||
def linear(self, u, v, D):
|
||||
tp, fp, fn, tn = self._get_4cellcounters(u, v, D)
|
||||
den1=tp+fn
|
||||
den2=tn+fp
|
||||
tpr = (tp*1./den1) if den1!=0 else 0.
|
||||
tnr = (tn*1./den2) if den2!=0 else 0.
|
||||
return tpr + tnr - 1
|
||||
|
||||
def pmi(self, u, v, D):
|
||||
tp, fp, fn, tn = self._get_4cellcounters(u, v, D)
|
||||
|
||||
Pxy = tp * 1. / D
|
||||
Pxny = fp * 1. / D
|
||||
Pnxy = fn * 1. / D
|
||||
Px = Pxy + Pxny
|
||||
Py = Pxy + Pnxy
|
||||
|
||||
if (Px == 0 or Py == 0 or Pxy == 0):
|
||||
return 0.0
|
||||
|
||||
score = math.log2(Pxy / (Px * Py))
|
||||
if np.isnan(score) or np.isinf(score):
|
||||
print('NAN')
|
||||
sys.exit()
|
||||
return score
|
||||
|
||||
def cosine(self, u, v):
|
||||
pu = self._prevalence(u)
|
||||
pv = self._prevalence(v)
|
||||
return cosine(u, v) - np.sqrt(pu * pv)
|
||||
|
||||
def _get_4cellcounters(self, u, v, D):
|
||||
"""
|
||||
:param u: a set of indexes with a non-zero value
|
||||
:param v: a set of indexes with a non-zero value
|
||||
:param D: the number of events (i.e., all possible indexes)
|
||||
:return: the 4-cell contingency values (tp, fp, fn, tn)
|
||||
"""
|
||||
common=u.intersection(v)
|
||||
tp = len(common)
|
||||
fp = len(u) - len(common)
|
||||
fn = len(v) - len(common)
|
||||
tn = D - (tp + fp + fn)
|
||||
return tp, fp, fn, tn
|
||||
|
||||
def dcf_dist(self, U, V):
|
||||
nU,D = U.shape
|
||||
nV = V.shape[0]
|
||||
if issparse(U): U = U.toarray()
|
||||
if issparse(V): V = V.toarray()
|
||||
|
||||
dists = np.zeros((nU, nV))
|
||||
if self.dcf.__name__ in self.prob_dcf:
|
||||
def hits_index(v):
|
||||
return set(np.argwhere(v>0).reshape(-1).tolist())
|
||||
Vhits = {i:hits_index(V[i]) for i in range(nV)}
|
||||
for i in range(nU):
|
||||
Ui_hits = hits_index(U[i])
|
||||
for j in range(nV):
|
||||
dists[i, j] = self.dcf(self, Ui_hits, Vhits[j], D)
|
||||
else:
|
||||
for i in range(nU):
|
||||
for j in range(nV):
|
||||
dists[i, j] = self.dcf(self, U[i], V[j])
|
||||
return dists
|
||||
|
|
@ -1,53 +0,0 @@
|
|||
import math
|
||||
import numpy as np
|
||||
from scipy.sparse import csr_matrix, issparse
|
||||
|
||||
class RandomIndexingBoC(object):
|
||||
|
||||
def __init__(self, latent_dimensions, non_zeros=2):
|
||||
self.latent_dimensions = latent_dimensions
|
||||
self.k = non_zeros
|
||||
self.ri_dict = None
|
||||
|
||||
def fit_transform(self, X):
|
||||
return self.fit(X).transform(X)
|
||||
|
||||
def fit(self, X):
|
||||
nF = X.shape[1]
|
||||
nL = self.latent_dimensions
|
||||
format = 'csr' if issparse(X) else 'np'
|
||||
self.ri_dict = _create_random_index_dictionary(shape=(nF, nL), k=self.k, normalized=True, format=format)
|
||||
return self
|
||||
|
||||
def transform(self, X):
|
||||
assert X.shape[1] == self.ri_dict.shape[0], 'feature space is inconsistent with the RI dictionary'
|
||||
if self.ri_dict is None:
|
||||
raise ValueError("Error: transform method called before fit.")
|
||||
P = X.dot(self.ri_dict)
|
||||
if issparse(P):
|
||||
P.sort_indices()
|
||||
return P
|
||||
|
||||
|
||||
def _create_random_index_dictionary(shape, k, normalized=False, format='csr', positive=False):
|
||||
assert format in ['csr', 'np'], 'Format should be in "[csr, np]"'
|
||||
nF, latent_dimensions = shape
|
||||
print("Creating the random index dictionary for |V|={} with {} dimensions".format(nF,latent_dimensions))
|
||||
val = 1.0 if not normalized else 1.0/math.sqrt(k)
|
||||
#ri_dict = csr_matrix((nF, latent_dimensions)) if format == 'csr' else np.zeros((nF, latent_dimensions))
|
||||
ri_dict = np.zeros((nF, latent_dimensions))
|
||||
|
||||
#TODO: optimize
|
||||
for t in range(nF):
|
||||
dims = np.zeros(k, dtype=np.int32)
|
||||
dims[0] = t % latent_dimensions  # the first dimension is chosen in a round-robin manner (prevents gaps)
|
||||
dims[1:] = np.random.choice(latent_dimensions, size=k-1, replace=False)
|
||||
values = (np.random.randint(0,2, size=k)*2.0-1.0) * val if not positive else np.array([+val]*k)
|
||||
ri_dict[t,dims]=values
|
||||
print("\rprogress [%.2f%% complete]" % (t * 100.0 / nF), end='')
|
||||
print('\nDone')
|
||||
|
||||
if format=='csr':
|
||||
ri_dict = csr_matrix(ri_dict)
|
||||
return ri_dict
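

# Minimal usage sketch (illustrative, not part of the original module): random indexing
# projects a sparse bag-of-words matrix onto a much smaller, randomly generated basis.
# The document-term matrix below is synthetic.
from scipy.sparse import random as sparse_random

X_boc = sparse_random(100, 10000, density=0.01, format='csr')   # fake (n_docs, |V|) matrix
ri = RandomIndexingBoC(latent_dimensions=300, non_zeros=2)
X_reduced = ri.fit_transform(X_boc)                             # sparse, shape (100, 300)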

@ -0,0 +1,388 @@
"""
This module contains the view generators that take care of computing the view-specific document embeddings:

- VanillaFunGen (-x): casts document representations encoded via TFIDF into posterior probabilities by means of SVMs.

- WordClassGen (-w): generates document representations via Word-Class Embeddings.
  Document embeddings are obtained via weighted sum of the documents' constituent embeddings.

- MuseGen (-m): generates document representations via MUSE embeddings.
  Document embeddings are obtained via weighted sum of the documents' constituent embeddings.

- RecurrentGen (-g): generates document embeddings by means of Gated Recurrent Units. The model can be
  initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, etc.).
  Output dimension is (n_docs, 512).

- BertGen (-b): generates document embeddings via the mBERT model.
"""
from abc import ABC, abstractmethod
# from time import time

from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks.lr_monitor import LearningRateMonitor

from src.data.datamodule import RecurrentDataModule, BertDataModule, tokenize
from src.models.learners import *
from src.models.pl_bert import BertModel
from src.models.pl_gru import RecurrentModel
from src.util.common import TfidfVectorizerMultilingual, _normalize, index
from src.util.embeddings_manager import MuseLoader, XdotM, wce_matrix
from src.util.file import create_if_not_exist
# TODO: (1) add model checkpointing and loading from checkpoint + training on validation after convergence is reached


class ViewGen(ABC):
    """
    Abstract class for ViewGen implementations. Every ViewGen should implement these three methods in order to
    be seamlessly integrated in the overall architecture.
    """
    @abstractmethod
    def fit(self, lX, ly):
        pass

    @abstractmethod
    def transform(self, lX):
        pass

    @abstractmethod
    def fit_transform(self, lX, ly):
        pass
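

# Illustrative sketch (not part of the original module): a minimal ViewGen that exposes
# L2-normalized TF-IDF vectors as its view, just to show the fit/transform/fit_transform
# contract expected by the rest of the architecture. It assumes that _normalize can handle
# the per-language matrices returned by TfidfVectorizerMultilingual.
class TfidfGen(ViewGen):
    def __init__(self):
        self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)

    def fit(self, lX, ly):
        self.vectorizer.fit(lX)
        return self

    def transform(self, lX):
        return _normalize(self.vectorizer.transform(lX), l2=True)

    def fit_transform(self, lX, ly):
        return self.fit(lX, ly).transform(lX)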


class VanillaFunGen(ViewGen):
    """
    View Generator (x): original funnelling architecture proposed by Moreo, Esuli and
    Sebastiani in DOI: https://doi.org/10.1145/3326065
    """
    def __init__(self, base_learner, first_tier_parameters=None, n_jobs=-1):
        """
        Init Posterior Probabilities embedder (i.e., VanillaFunGen).
        :param base_learner: naive monolingual learner to be deployed as first-tier learner. Should be able to
        return posterior probabilities.
        :param first_tier_parameters: optional parameters for the first-tier learners (passed to
        NaivePolylingualClassifier).
        :param n_jobs: int, number of concurrent workers.
        """
        super().__init__()
        self.learners = base_learner
        self.first_tier_parameters = first_tier_parameters
        self.n_jobs = n_jobs
        self.doc_projector = NaivePolylingualClassifier(base_learner=self.learners,
                                                        parameters=self.first_tier_parameters, n_jobs=self.n_jobs)
        self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)

    def fit(self, lX, lY):
        print('# Fitting VanillaFunGen (X)...')
        lX = self.vectorizer.fit_transform(lX)
        self.doc_projector.fit(lX, lY)
        return self

    def transform(self, lX):
        """
        (1) Vectorize documents; (2) project them according to the first-tier learners (SVMs); finally,
        (3) apply L2 normalization to the projection and return it.
        :param lX: dict {lang: indexed documents}
        :return: document projection to the common latent space.
        """
        lX = self.vectorizer.transform(lX)
        lZ = self.doc_projector.predict_proba(lX)
        lZ = _normalize(lZ, l2=True)
        return lZ

    def fit_transform(self, lX, ly):
        return self.fit(lX, ly).transform(lX)
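

# Illustrative sketch (assumes scikit-learn; this is not the project's get_learner helper):
# any base learner passed to VanillaFunGen must expose predict_proba, e.g. a linear SVM
# wrapped in probability calibration.
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

calibrated_svm = CalibratedClassifierCV(LinearSVC(), cv=5)   # exposes predict_proba after fitting
# posterior_view = VanillaFunGen(base_learner=calibrated_svm).fit_transform(lX, ly)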


class MuseGen(ViewGen):
    """
    View Generator (m): generates document representations via MUSE embeddings (fastText multilingual word
    embeddings). Document embeddings are obtained via weighted sum of the documents' constituent embeddings.
    """
    def __init__(self, muse_dir='../embeddings', n_jobs=-1):
        """
        Init the MuseGen.
        :param muse_dir: string, path to the folder containing the MUSE embeddings
        :param n_jobs: int, number of concurrent workers
        """
        super().__init__()
        self.muse_dir = muse_dir
        self.n_jobs = n_jobs
        self.langs = None
        self.lMuse = None
        self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)

    def fit(self, lX, ly):
        """
        (1) Vectorize documents; (2) load MUSE embeddings for the words encountered while vectorizing.
        :param lX: dict {lang: indexed documents}
        :param ly: dict {lang: target vectors}
        :return: self.
        """
        print('# Fitting MuseGen (M)...')
        self.vectorizer.fit(lX)
        self.langs = sorted(lX.keys())
        self.lMuse = MuseLoader(langs=self.langs, cache=self.muse_dir)
        lVoc = self.vectorizer.vocabulary()
        self.lMuse = self.lMuse.extract(lVoc)  # overwriting lMuse with dict {lang: embed_matrix} restricted to known words
        # TODO: featureweight.fit
        return self

    def transform(self, lX):
        """
        (1) Vectorize documents; (2) compute the weighted sum of MUSE embeddings at the document level;
        finally, (3) apply L2 normalization to the embedding and return it.
        :param lX: dict {lang: indexed documents}
        :return: document projection to the common latent space.
        """
        lX = self.vectorizer.transform(lX)
        XdotMUSE = Parallel(n_jobs=self.n_jobs)(
            delayed(XdotM)(lX[lang], self.lMuse[lang], sif=True) for lang in self.langs)
        lZ = {lang: XdotMUSE[i] for i, lang in enumerate(self.langs)}
        lZ = _normalize(lZ, l2=True)
        return lZ

    def fit_transform(self, lX, ly):
        return self.fit(lX, ly).transform(lX)
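

# Illustrative sketch (assumption: the repository's XdotM helper, called with sif=True,
# additionally applies SIF-style re-weighting): the core operation behind MuseGen and
# WordClassGen is a doc-term weight matrix multiplied by a word-embedding matrix, i.e.,
# a weighted average of the constituent word vectors, followed by L2 normalization.
import numpy as np

def weighted_doc_embeddings(X_weights, word_embeddings):
    # X_weights: (n_docs, |V|) TF-IDF weights (sparse or dense); word_embeddings: (|V|, dim)
    Z = X_weights.dot(word_embeddings)                    # weighted sum of word vectors
    norms = np.linalg.norm(Z, axis=1, keepdims=True)
    return Z / np.maximum(norms, 1e-12)                   # L2-normalized document vectors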


class WordClassGen(ViewGen):
    """
    View Generator (w): generates document representations via Word-Class Embeddings.
    Document embeddings are obtained via weighted sum of the documents' constituent embeddings.
    """
    def __init__(self, n_jobs=-1):
        """
        Init WordClassGen.
        :param n_jobs: int, number of concurrent workers
        """
        super().__init__()
        self.n_jobs = n_jobs
        self.langs = None
        self.lWce = None
        self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)

    def fit(self, lX, ly):
        """
        (1) Vectorize documents; (2) compute Word-Class Embeddings for the words encountered while vectorizing.
        :param lX: dict {lang: indexed documents}
        :param ly: dict {lang: target vectors}
        :return: self.
        """
        print('# Fitting WordClassGen (W)...')
        lX = self.vectorizer.fit_transform(lX)
        self.langs = sorted(lX.keys())
        wce = Parallel(n_jobs=self.n_jobs)(
            delayed(wce_matrix)(lX[lang], ly[lang]) for lang in self.langs)
        self.lWce = {l: wce[i] for i, l in enumerate(self.langs)}
        # TODO: featureweight.fit()
        return self

    def transform(self, lX):
        """
        (1) Vectorize documents; (2) compute the weighted sum of Word-Class Embeddings at the document level;
        finally, (3) apply L2 normalization to the embedding and return it.
        :param lX: dict {lang: indexed documents}
        :return: document projection to the common latent space.
        """
        lX = self.vectorizer.transform(lX)
        XdotWce = Parallel(n_jobs=self.n_jobs)(
            delayed(XdotM)(lX[lang], self.lWce[lang], sif=True) for lang in self.langs)
        lWce = {l: XdotWce[i] for i, l in enumerate(self.langs)}
        lWce = _normalize(lWce, l2=True)
        return lWce

    def fit_transform(self, lX, ly):
        return self.fit(lX, ly).transform(lX)


class RecurrentGen(ViewGen):
    """
    View Generator (G): generates document embeddings by means of Gated Recurrent Units. The model can be
    initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, etc.).
    Output dimension is (n_docs, 512). Training happens end-to-end. At inference time, the model returns
    the network internal state at the second feed-forward layer level. Training metrics are logged via TensorBoard.
    """
    def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, nepochs=50,
                 gpus=0, n_jobs=-1, patience=20, stored_path=None):
        """
        Init RecurrentGen.
        :param multilingualIndex: MultilingualIndex, a dictionary of training and test documents
        indexed by language code.
        :param pretrained_embeddings: dict {lang: tensor of embeddings}, the pretrained embeddings to use
        as embedding layer.
        :param wce: bool, whether to deploy Word-Class Embeddings (as proposed by A. Moreo). If True, supervised
        embeddings are concatenated to the deployed pretrained embeddings. WCE dimensionality is equal to
        the number of target classes.
        :param batch_size: int, number of samples in a batch.
        :param nepochs: int, maximum number of epochs to train the model.
        :param gpus: int, how many GPUs to use per node. If 0, computation will take place on CPU.
        :param n_jobs: int, number of concurrent workers (i.e., parallelizing data loading).
        :param patience: int, number of epochs with no improvement in val-macroF1 before early stopping.
        :param stored_path: str, path to a pretrained model. If None, the model will be trained from scratch.
        """
        super().__init__()
        self.multilingualIndex = multilingualIndex
        self.langs = multilingualIndex.langs
        self.batch_size = batch_size
        self.gpus = gpus
        self.n_jobs = n_jobs
        self.stored_path = stored_path
        self.nepochs = nepochs
        self.patience = patience

        # EMBEDDINGS to be deployed
        self.pretrained = pretrained_embeddings
        self.wce = wce

        self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1)
        self.multilingualIndex.embedding_matrices(self.pretrained, supervised=self.wce)
        self.model = self._init_model()
        self.logger = TensorBoardLogger(save_dir='../tb_logs', name='rnn', default_hp_metric=False)
        self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00,
                                                 patience=self.patience, verbose=False, mode='max')
        self.lr_monitor = LearningRateMonitor(logging_interval='epoch')

    def _init_model(self):
        if self.stored_path:
            lpretrained = self.multilingualIndex.l_embeddings()
            return RecurrentModel.load_from_checkpoint(self.stored_path, lPretrained=lpretrained)
        else:
            lpretrained = self.multilingualIndex.l_embeddings()
            langs = self.multilingualIndex.langs
            output_size = self.multilingualIndex.get_target_dim()
            hidden_size = 512
            lvocab_size = self.multilingualIndex.l_vocabsize()
            learnable_length = 0
            return RecurrentModel(
                lPretrained=lpretrained,
                langs=langs,
                output_size=output_size,
                hidden_size=hidden_size,
                lVocab_size=lvocab_size,
                learnable_length=learnable_length,
                drop_embedding_range=self.multilingualIndex.sup_range,
                drop_embedding_prop=0.5,
                gpus=self.gpus
            )

    def fit(self, lX, ly):
        """
        Train the neural network end-to-end.
        lX and ly are not directly used. We rather get them from the multilingual index used in the instantiation
        of the Dataset object (RecurrentDataset) in the GfunDataModule class.
        :param lX: dict {lang: indexed documents}
        :param ly: dict {lang: target vectors}
        :return: self.
        """
        print('# Fitting RecurrentGen (G)...')
        create_if_not_exist(self.logger.save_dir)
        recurrentDataModule = RecurrentDataModule(self.multilingualIndex, batchsize=self.batch_size, n_jobs=self.n_jobs)
        trainer = Trainer(gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger, max_epochs=self.nepochs,
                          callbacks=[self.early_stop_callback, self.lr_monitor], checkpoint_callback=False)

        # vanilla_torch_model = torch.load(
        #     '../_old_checkpoint/gru_viewgen_-rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle')
        # self.model.linear0 = vanilla_torch_model.linear0
        # self.model.linear1 = vanilla_torch_model.linear1
        # self.model.linear2 = vanilla_torch_model.linear2
        # self.model.rnn = vanilla_torch_model.rnn

        trainer.fit(self.model, datamodule=recurrentDataModule)
        trainer.test(self.model, datamodule=recurrentDataModule)
        return self

    def transform(self, lX):
        """
        Project documents to the common latent space. Output dimensionality is 512.
        :param lX: dict {lang: indexed documents}
        :return: documents projected to the common latent space.
        """
        data = {}
        for lang in lX.keys():
            indexed = index(data=lX[lang],
                            vocab=self.multilingualIndex.l_index[lang].word2index,
                            known_words=set(self.multilingualIndex.l_index[lang].word2index.keys()),
                            analyzer=self.multilingualIndex.l_vectorizer.get_analyzer(lang),
                            unk_index=self.multilingualIndex.l_index[lang].unk_index,
                            out_of_vocabulary=self.multilingualIndex.l_index[lang].out_of_vocabulary)
            data[lang] = indexed
        l_pad = self.multilingualIndex.l_pad()
        self.model.to('cuda' if self.gpus else 'cpu')
        self.model.eval()
        l_embeds = self.model.encode(data, l_pad, batch_size=256)
        return l_embeds

    def fit_transform(self, lX, ly):
        return self.fit(lX, ly).transform(lX)


class BertGen(ViewGen):
    """
    View Generator (b): generates document embeddings via the mBERT model. Training happens end-to-end.
    At inference time, the model returns the network internal state at the last original layer (i.e., the 12th).
    Document embeddings are the state associated with the "start" ([CLS]) token. Training metrics are logged
    via TensorBoard.
    """
    def __init__(self, multilingualIndex, batch_size=128, nepochs=50, gpus=0, n_jobs=-1, patience=5, stored_path=None):
        """
        Init BertGen.
        :param multilingualIndex: MultilingualIndex, a dictionary of training and test documents
        indexed by language code.
        :param batch_size: int, number of samples per batch.
        :param nepochs: int, maximum number of epochs to train the model.
        :param gpus: int, how many GPUs to use per node. If 0, computation will take place on CPU.
        :param patience: int, number of epochs with no improvement in val-macroF1 before early stopping.
        :param n_jobs: int, number of concurrent workers.
        :param stored_path: str, path to a pretrained model. If None, the model will be trained from scratch.
        """
        super().__init__()
        self.multilingualIndex = multilingualIndex
        self.nepochs = nepochs
        self.gpus = gpus
        self.batch_size = batch_size
        self.n_jobs = n_jobs
        self.stored_path = stored_path
        self.model = self._init_model()
        self.patience = patience
        self.logger = TensorBoardLogger(save_dir='../tb_logs', name='bert', default_hp_metric=False)
        self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00,
                                                 patience=self.patience, verbose=False, mode='max')

    def _init_model(self):
        output_size = self.multilingualIndex.get_target_dim()
        return BertModel(output_size=output_size, stored_path=self.stored_path, gpus=self.gpus)

    def fit(self, lX, ly):
        """
        Train the neural network end-to-end.
        lX and ly are not directly used. We rather get them from the multilingual index used in the instantiation
        of the Dataset object (RecurrentDataset) in the GfunDataModule class.
        :param lX: dict {lang: indexed documents}
        :param ly: dict {lang: target vectors}
        :return: self.
        """
        print('# Fitting BertGen (B)...')
        create_if_not_exist(self.logger.save_dir)
        self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1)
        bertDataModule = BertDataModule(self.multilingualIndex, batchsize=self.batch_size, max_len=512)
        trainer = Trainer(gradient_clip_val=1e-1, max_epochs=self.nepochs, gpus=self.gpus,
                          logger=self.logger, callbacks=[self.early_stop_callback], checkpoint_callback=False)
        trainer.fit(self.model, datamodule=bertDataModule)
        trainer.test(self.model, datamodule=bertDataModule)
        return self

    def transform(self, lX):
        """
        Project documents to the common latent space. Output dimensionality is 768.
        :param lX: dict {lang: indexed documents}
        :return: documents projected to the common latent space.
        """
        data = tokenize(lX, max_len=512)
        self.model.to('cuda' if self.gpus else 'cpu')
        self.model.eval()
        l_embeds = self.model.encode(data, batch_size=64)
        return l_embeds

    def fit_transform(self, lX, ly):
        # we can assume that data for transform() has already been indexed, since fit() is called first
        return self.fit(lX, ly).transform(lX)
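

# Illustrative sketch (assumes the HuggingFace `transformers` package; this is not the
# project's BertModel wrapper): the 768-dimensional document embedding corresponds to the
# last-layer hidden state of the first ([CLS]) token.
if __name__ == '__main__':
    import torch
    from transformers import AutoModel, AutoTokenizer

    tok = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
    mbert = AutoModel.from_pretrained('bert-base-multilingual-cased')
    batch = tok(['a short document', 'another document'], padding=True, truncation=True,
                max_length=512, return_tensors='pt')
    with torch.no_grad():
        cls_embeddings = mbert(**batch).last_hidden_state[:, 0, :]   # shape: (2, 768)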