merged devel

andrea 2021-02-03 11:20:08 +01:00
commit e78b1f8a30
71 changed files with 2163 additions and 4385 deletions

main.py Normal file

@@ -0,0 +1,190 @@
import time
import numpy as np
from argparse import ArgumentParser
from src.data.dataset_builder import MultilingualDataset
from src.funnelling import *
from src.util.common import MultilingualIndex, get_params, get_method_name
from src.util.evaluation import evaluate
from src.util.results_csv import CSVlog
from src.view_generators import *
def main(args):
assert args.post_embedder or args.muse_embedder or args.wce_embedder or args.gru_embedder or args.bert_embedder, \
'empty set of document embeddings is not allowed!'
print('Running generalized funnelling...')
data = MultilingualDataset.load(args.dataset)
# data.set_view(languages=['it', 'da'])
data.show_dimensions()
lX, ly = data.training()
lXte, lyte = data.test()
# Init multilingualIndex - mandatory when deploying Neural View Generators...
if args.gru_embedder or args.bert_embedder:
multilingualIndex = MultilingualIndex()
lMuse = MuseLoader(langs=sorted(lX.keys()), cache=args.muse_dir)
multilingualIndex.index(lX, ly, lXte, lyte, l_pretrained_vocabulary=lMuse.vocabulary())
# Init ViewGenerators and append them to embedder_list
embedder_list = []
if args.post_embedder:
posteriorEmbedder = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=args.n_jobs)
embedder_list.append(posteriorEmbedder)
if args.muse_embedder:
museEmbedder = MuseGen(muse_dir=args.muse_dir, n_jobs=args.n_jobs)
embedder_list.append(museEmbedder)
if args.wce_embedder:
wceEmbedder = WordClassGen(n_jobs=args.n_jobs)
embedder_list.append(wceEmbedder)
if args.gru_embedder:
rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=args.rnn_wce,
batch_size=args.batch_rnn, nepochs=args.nepochs_rnn, patience=args.patience_rnn,
gpus=args.gpus, n_jobs=args.n_jobs)
embedder_list.append(rnnEmbedder)
if args.bert_embedder:
bertEmbedder = BertGen(multilingualIndex, batch_size=args.batch_bert, nepochs=args.nepochs_bert,
patience=args.patience_bert, gpus=args.gpus, n_jobs=args.n_jobs)
bertEmbedder.transform(lX)
embedder_list.append(bertEmbedder)
# Init DocEmbedderList (i.e., first-tier learners or view generators) and metaclassifier
docEmbedders = DocEmbedderList(embedder_list=embedder_list, probabilistic=True)
meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'),
meta_parameters=get_params(optimc=args.optimc))
# Init Funnelling Architecture
gfun = Funnelling(first_tier=docEmbedders, meta_classifier=meta)
# Training ---------------------------------------
print('\n[Training Generalized Funnelling]')
time_init = time.time()
gfun.fit(lX, ly)
time_tr = round(time.time() - time_init, 3)
print(f'Training completed in {time_tr} seconds!')
# Testing ----------------------------------------
print('\n[Testing Generalized Funnelling]')
time_te = time.time()
ly_ = gfun.predict(lXte)
l_eval = evaluate(ly_true=lyte, ly_pred=ly_)
time_te = round(time.time() - time_te, 3)
print(f'Testing completed in {time_te} seconds!')
# Logging ---------------------------------------
print('\n[Results]')
results = CSVlog(args.csv_dir)
metrics = []
for lang in lXte.keys():
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
print(f'Lang {lang}: macro-F1 = {macrof1:.3f} micro-F1 = {microf1:.3f}')
if results is not None:
_id, _dataset = get_method_name(args)
results.add_row(method='gfun',
setting=_id,
optimc=args.optimc,
sif='True',
zscore='True',
l2='True',
dataset=_dataset,
time_tr=time_tr,
time_te=time_te,
lang=lang,
macrof1=macrof1,
microf1=microf1,
macrok=macrok,
microk=microk,
notes='')
print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3))
overall_time = round(time.time() - time_init, 3)
exit(f'\nExecuted in: {overall_time} seconds!')
if __name__ == '__main__':
parser = ArgumentParser(description='Run generalized funnelling, A. Moreo, A. Pedrotti and F. Sebastiani')
parser.add_argument('dataset', help='Path to the dataset')
parser.add_argument('-o', '--output', dest='csv_dir', metavar='',
help='Result file (default csv_logs/gfun/gfun_results.csv)', type=str,
default='csv_logs/gfun/gfun_results.csv')
parser.add_argument('-x', '--post_embedder', dest='post_embedder', action='store_true',
help='deploy posterior probabilities embedder to compute document embeddings',
default=False)
parser.add_argument('-w', '--wce_embedder', dest='wce_embedder', action='store_true',
help='deploy (supervised) Word-Class embedder to compute document embeddings',
default=False)
parser.add_argument('-m', '--muse_embedder', dest='muse_embedder', action='store_true',
help='deploy (pretrained) MUSE embedder to compute document embeddings',
default=False)
parser.add_argument('-b', '--bert_embedder', dest='bert_embedder', action='store_true',
help='deploy multilingual Bert to compute document embeddings',
default=False)
parser.add_argument('-g', '--gru_embedder', dest='gru_embedder', action='store_true',
help='deploy a GRU in order to compute document embeddings (a.k.a., RecurrentGen)',
default=False)
parser.add_argument('-c', '--c_optimize', dest='optimc', action='store_true',
help='optimize the C hyperparameter of the SVM at the meta-classifier level',
default=False)
parser.add_argument('-j', '--n_jobs', dest='n_jobs', type=int, metavar='',
help='number of parallel jobs (default is -1, all)',
default=-1)
parser.add_argument('--nepochs_rnn', dest='nepochs_rnn', type=int, metavar='',
help='number of max epochs to train Recurrent embedder (i.e., -g), default 150',
default=150)
parser.add_argument('--nepochs_bert', dest='nepochs_bert', type=int, metavar='',
help='number of max epochs to train Bert model (i.e., -b), default 10',
default=10)
parser.add_argument('--patience_rnn', dest='patience_rnn', type=int, metavar='',
help='set early stop patience for the RecurrentGen, default 25',
default=25)
parser.add_argument('--patience_bert', dest='patience_bert', type=int, metavar='',
help='set early stop patience for the BertGen, default 5',
default=5)
parser.add_argument('--batch_rnn', dest='batch_rnn', type=int, metavar='',
help='set batchsize for the RecurrentGen, default 64',
default=64)
parser.add_argument('--batch_bert', dest='batch_bert', type=int, metavar='',
help='set batchsize for the BertGen, default 4',
default=4)
parser.add_argument('--muse_dir', dest='muse_dir', type=str, metavar='',
help='Path to the MUSE polylingual word embeddings (default embeddings/)',
default='embeddings/')
parser.add_argument('--rnn_wce', dest='rnn_wce', action='store_true',
help='Deploy WCE embedding as embedding layer of the RecurrentGen',
default=False)
parser.add_argument('--rnn_dir', dest='rnn_dir', type=str, metavar='',
help='Set the path to a pretrained RNN model (i.e., -g view generator)',
default=None)
parser.add_argument('--bert_dir', dest='bert_dir', type=str, metavar='',
help='Set the path to a pretrained mBERT model (i.e., -b view generator)',
default=None)
parser.add_argument('--gpus', metavar='', help='specifies how many GPUs to use per node',
default=None)
args = parser.parse_args()
main(args)

readme.md Normal file

@@ -0,0 +1,52 @@
# Generalized Funnelling (gFun)
## Requirements
```commandline
transformers==2.11.0
pandas==0.25.3
numpy==1.17.4
joblib==0.14.0
tqdm==4.50.2
pytorch_lightning==1.1.2
torch==1.3.1
nltk==3.4.5
scipy==1.3.3
rdflib==4.2.2
torchtext==0.4.0
scikit_learn==0.24.1
```
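The same pins are listed in `requirements.txt`; a typical setup (assuming a fresh Python 3 virtual environment at the repository root) is:
```commandline
pip install -r requirements.txt
```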
## Usage
```commandline
usage: main.py [-h] [-o CSV_DIR] [-x] [-w] [-m] [-b] [-g] [-c] [-j N_JOBS]
               [--nepochs_rnn NEPOCHS_RNN] [--nepochs_bert NEPOCHS_BERT]
               [--patience_rnn PATIENCE_RNN] [--patience_bert PATIENCE_BERT]
               [--batch_rnn BATCH_RNN] [--batch_bert BATCH_BERT]
               [--muse_dir MUSE_DIR] [--rnn_wce] [--rnn_dir RNN_DIR]
               [--bert_dir BERT_DIR] [--gpus GPUS]
               dataset
Run generalized funnelling, A. Moreo, A. Pedrotti and F. Sebastiani (2020).
positional arguments:
dataset Path to the dataset
optional arguments:
-h, --help show this help message and exit
-o, --output result file (default csv_logs/gfun/gfun_results.csv)
-x, --post_embedder deploy posterior probabilities embedder to compute document embeddings
-w, --wce_embedder deploy (supervised) Word-Class embedder to compute document embeddings
-m, --muse_embedder deploy (pretrained) MUSE embedder to compute document embeddings
-b, --bert_embedder deploy multilingual Bert to compute document embeddings
-g, --gru_embedder deploy a GRU in order to compute document embeddings
-c, --c_optimize optimize the C hyperparameter of the SVM at the meta-classifier level
-j, --n_jobs number of parallel jobs, default is -1 i.e., all
--nepochs_rnn number of max epochs to train Recurrent embedder (i.e., -g), default 150
--nepochs_bert number of max epochs to train Bert model (i.e., -b), default 10
--patience_rnn set early stop patience for the RecurrentGen, default 25
--patience_bert set early stop patience for the BertGen, default 5
--batch_rnn set batchsize for the RecurrentGen, default 64
--batch_bert set batchsize for the BertGen, default 4
--muse_dir path to the MUSE polylingual word embeddings (default embeddings/)
--rnn_wce deploy WCE embedding as embedding layer of the RecurrentGen
--rnn_dir set the path to a pretrained RNN model (i.e., -g view generator)
--bert_dir set the path to a pretrained mBERT model (i.e., -b view generator)
--gpus specifies how many GPUs to use per node
```
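For example, the following command (the dataset path is a placeholder) runs gFun with the posterior-probability, MUSE and Word-Class view generators, optimizing the meta-classifier C parameter:
```commandline
python main.py path/to/rcv1-2_processed_run0.pickle -x -m -w -c
```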

requirements.txt Normal file

@@ -0,0 +1,12 @@
transformers==2.11.0
pandas==0.25.3
numpy==1.17.4
joblib==0.14.0
tqdm==4.50.2
pytorch_lightning==1.1.2
torch==1.3.1
nltk==3.4.5
scipy==1.3.3
rdflib==4.2.2
torchtext==0.4.0
scikit_learn==0.24.1

run.sh Normal file

@@ -0,0 +1,8 @@
#!/usr/bin/env bash
python main.py /home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -g --gpus 0
#for i in {0..10..1}
#do
# python main.py --gpus 0
#done

src/data/datamodule.py Normal file

@@ -0,0 +1,222 @@
import numpy as np
import pytorch_lightning as pl
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
N_WORKERS = 8
class RecurrentDataset(Dataset):
def __init__(self, lX, ly, lPad_index):
"""
:param lX: dict {lang_id: np.ndarray}, indexed documents per language.
:param ly: dict {lang_id: np.ndarray}, target label matrix per language, aligned with lX.
:param lPad_index: dict {lang_id: int}, padding index per language.
"""
self.lX = []
self.ly = []
self.lOffset = {}
self.lPad_index = lPad_index
for lang, data in lX.items():
offset = [len(self.lX)]
self.lX.extend(data)
offset.append(len(self.lX))
self.lOffset[lang] = offset
for lang, target in ly.items():
self.ly.extend(target)
def __len__(self):
return len(self.lX)
def __getitem__(self, index):
X = self.lX[index]
y = self.ly[index]
return X, y, index, self._get_lang(index)
def _get_lang(self, index):
for lang, l_range in self.lOffset.items():
if index in range(l_range[0], l_range[1]):
return lang
def collate_fn(self, data):
"""
Pads the batch and checks the language consistency of its items, grouping the samples drawn
from the Dataset into per-language batches.
:param data: list of (X, y, index, lang) tuples as returned by __getitem__.
:return: tuple (lX_batch, ly_batch) of dicts {lang: Tensor}.
"""
lX_batch = {}
ly_batch = {}
current_lang = data[0][-1]
for d in data:
if d[-1] == current_lang:
if current_lang not in lX_batch.keys():
lX_batch[current_lang] = []
ly_batch[current_lang] = []
lX_batch[current_lang].append(d[0])
ly_batch[current_lang].append(d[1])
else:
current_lang = d[-1]
lX_batch[current_lang] = []
ly_batch[current_lang] = []
lX_batch[current_lang].append(d[0])
ly_batch[current_lang].append(d[1])
for lang in lX_batch.keys():
lX_batch[lang] = self.pad(lX_batch[lang], pad_index=self.lPad_index[lang],
max_pad_length=self.define_pad_length(lX_batch[lang]))
lX_batch[lang] = torch.LongTensor(lX_batch[lang])
ly_batch[lang] = torch.FloatTensor(ly_batch[lang])
return lX_batch, ly_batch
@staticmethod
def define_pad_length(index_list):
lengths = [len(index) for index in index_list]
return int(np.mean(lengths) + np.std(lengths))
@staticmethod
def pad(index_list, pad_index, max_pad_length=None):
pad_length = np.max([len(index) for index in index_list])
if max_pad_length is not None:
pad_length = min(pad_length, max_pad_length)
for i, indexes in enumerate(index_list):
index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length]
return index_list
class RecurrentDataModule(pl.LightningDataModule):
"""
Pytorch Lightning Datamodule to be deployed with RecurrentGen.
https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html
"""
def __init__(self, multilingualIndex, batchsize=64, n_jobs=-1):
"""
Init RecurrentDataModule.
:param multilingualIndex: MultilingualIndex, a dictionary of training and test documents
indexed by language code.
:param batchsize: int, number of samples per batch.
:param n_jobs: int, number of concurrent workers deployed to parallelize data loading.
"""
self.multilingualIndex = multilingualIndex
self.batchsize = batchsize
self.n_jobs = n_jobs
super().__init__()
def prepare_data(self, *args, **kwargs):
pass
def setup(self, stage=None):
if stage == 'fit' or stage is None:
l_train_index, l_train_target = self.multilingualIndex.l_train()
# Debug settings: reducing number of samples
# l_train_index = {l: train[:5] for l, train in l_train_index.items()}
# l_train_target = {l: target[:5] for l, target in l_train_target.items()}
self.training_dataset = RecurrentDataset(l_train_index, l_train_target,
lPad_index=self.multilingualIndex.l_pad())
l_val_index, l_val_target = self.multilingualIndex.l_val()
# Debug settings: reducing number of samples
# l_val_index = {l: train[:5] for l, train in l_val_index.items()}
# l_val_target = {l: target[:5] for l, target in l_val_target.items()}
self.val_dataset = RecurrentDataset(l_val_index, l_val_target,
lPad_index=self.multilingualIndex.l_pad())
if stage == 'test' or stage is None:
l_test_index, l_test_target = self.multilingualIndex.l_test()
# Debug settings: reducing number of samples
# l_test_index = {l: train[:5] for l, train in l_test_index.items()}
# l_test_target = {l: target[:5] for l, target in l_test_target.items()}
self.test_dataset = RecurrentDataset(l_test_index, l_test_target,
lPad_index=self.multilingualIndex.l_pad())
def train_dataloader(self):
return DataLoader(self.training_dataset, batch_size=self.batchsize, num_workers=N_WORKERS,
collate_fn=self.training_dataset.collate_fn)
def val_dataloader(self):
return DataLoader(self.val_dataset, batch_size=self.batchsize, num_workers=N_WORKERS,
collate_fn=self.val_dataset.collate_fn)
def test_dataloader(self):
return DataLoader(self.test_dataset, batch_size=self.batchsize, num_workers=N_WORKERS,
collate_fn=self.test_dataset.collate_fn)
def tokenize(l_raw, max_len):
"""
Runs Bert tokenization on a dict {lang: list of raw documents}.
:param l_raw: dict {lang: list of str}, raw documents per language.
:param max_len: int, maximum number of tokens per document.
:return: dict {lang: list of input_ids}, the tokenized documents.
"""
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
l_tokenized = {}
for lang in l_raw.keys():
output_tokenizer = tokenizer(l_raw[lang], truncation=True, max_length=max_len, padding='max_length')
l_tokenized[lang] = output_tokenizer['input_ids']
return l_tokenized
class BertDataModule(RecurrentDataModule):
"""
Pytorch Lightning Datamodule to be deployed with BertGen.
https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html
"""
def __init__(self, multilingualIndex, batchsize=64, max_len=512):
"""
Init BertDataModule.
:param multilingualIndex: MultilingualIndex, a dictionary of training and test documents
indexed by language code.
:param batchsize: int, number of samples per batch.
:param max_len: int, maximum number of tokens per document. The absolute cap is 512.
"""
super().__init__(multilingualIndex, batchsize)
self.max_len = max_len
def setup(self, stage=None):
if stage == 'fit' or stage is None:
l_train_raw, l_train_target = self.multilingualIndex.l_train_raw()
# Debug settings: reducing number of samples
# l_train_raw = {l: train[:5] for l, train in l_train_raw.items()}
# l_train_target = {l: target[:5] for l, target in l_train_target.items()}
l_train_index = tokenize(l_train_raw, max_len=self.max_len)
self.training_dataset = RecurrentDataset(l_train_index, l_train_target,
lPad_index=self.multilingualIndex.l_pad())
l_val_raw, l_val_target = self.multilingualIndex.l_val_raw()
# Debug settings: reducing number of samples
# l_val_raw = {l: train[:5] for l, train in l_val_raw.items()}
# l_val_target = {l: target[:5] for l, target in l_val_target.items()}
l_val_index = tokenize(l_val_raw, max_len=self.max_len)
self.val_dataset = RecurrentDataset(l_val_index, l_val_target,
lPad_index=self.multilingualIndex.l_pad())
if stage == 'test' or stage is None:
l_test_raw, l_test_target = self.multilingualIndex.l_test_raw()
# Debug settings: reducing number of samples
# l_test_raw = {l: train[:5] for l, train in l_test_raw.items()}
# l_test_target = {l: target[:5] for l, target in l_test_target.items()}
l_test_index = tokenize(l_test_raw, max_len=self.max_len)
self.test_dataset = RecurrentDataset(l_test_index, l_test_target,
lPad_index=self.multilingualIndex.l_pad())
def train_dataloader(self):
"""
NB: setting num_workers > 0 here causes "OSError: [Errno 24] Too many open files",
so the DataLoader default (num_workers=0) is used.
:return: DataLoader over the training dataset.
"""
return DataLoader(self.training_dataset, batch_size=self.batchsize)
def val_dataloader(self):
return DataLoader(self.val_dataset, batch_size=self.batchsize)
def test_dataloader(self):
return DataLoader(self.test_dataset, batch_size=self.batchsize)
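A minimal usage sketch of the datamodule (not part of this commit; the already-populated MultilingualIndex and the view-generator LightningModule are assumptions based on main.py and the docstrings above):
```python
import pytorch_lightning as pl
from src.data.datamodule import RecurrentDataModule

def train_recurrent_view_generator(multilingual_index, model: pl.LightningModule):
    # batchsize and n_jobs mirror the defaults used in main.py (--batch_rnn 64, --n_jobs -1)
    datamodule = RecurrentDataModule(multilingual_index, batchsize=64, n_jobs=-1)
    # pytorch_lightning==1.1.2 Trainer; gpus/max_epochs values are illustrative
    trainer = pl.Trainer(gpus=1, max_epochs=150)
    trainer.fit(model, datamodule=datamodule)   # uses train_dataloader()/val_dataloader() above
    trainer.test(model, datamodule=datamodule)  # uses test_dataloader()
```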

@@ -1,19 +1,20 @@
from os.path import join, exists
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from data.reader.jrcacquis_reader import *
from data.languages import lang_set, NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING
from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2, fetch_topic_hierarchy
from data.text_preprocessor import NLTKStemTokenizer, preprocess_documents
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.sparse import issparse
import itertools
from tqdm import tqdm
import re
from os.path import exists
import numpy as np
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from scipy.sparse import issparse
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm
from src.data.languages import NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING
from src.data.reader.jrcacquis_reader import *
from src.data.reader.rcv_reader import fetch_RCV1, fetch_RCV2
from src.data.text_preprocessor import NLTKStemTokenizer, preprocess_documents
class MultilingualDataset:

@@ -1,19 +1,22 @@
from __future__ import print_function
import os, sys
from os.path import join
import os
import pickle
import sys
import tarfile
import xml.etree.ElementTree as ET
from sklearn.datasets import get_data_home
import pickle
from util.file import download_file, list_dirs, list_files
import zipfile
from collections import Counter
from os.path import join
from random import shuffle
import rdflib
from rdflib.namespace import RDF, SKOS
from rdflib import URIRef
import zipfile
from data.languages import JRC_LANGS
from collections import Counter
from random import shuffle
from data.languages import lang_set
from sklearn.datasets import get_data_home
from src.data.languages import JRC_LANGS
from src.data.languages import lang_set
from src.util.file import download_file, list_dirs, list_files
"""
JRC Acquis' Nomenclature:

@@ -1,15 +1,12 @@
from zipfile import ZipFile
import xml.etree.ElementTree as ET
from data.languages import RCV2_LANGS_WITH_NLTK_STEMMING, RCV2_LANGS
from util.file import list_files
from sklearn.datasets import get_data_home
import gzip
from os.path import join, exists
from util.file import download_file_if_not_exists
import re
from collections import Counter
import xml.etree.ElementTree as ET
from os.path import join, exists
from zipfile import ZipFile
import numpy as np
import sys
from src.util.file import download_file_if_not_exists
from src.util.file import list_files
"""
RCV2's Nomenclature:

@@ -1,15 +1,17 @@
from __future__ import print_function
# import ijson
# from ijson.common import ObjectBuilder
import os, sys
from os.path import join
from bz2 import BZ2File
import os
import pickle
from util.file import list_dirs, list_files, makedirs_if_not_exist
from itertools import islice
import re
from bz2 import BZ2File
from itertools import islice
from os.path import join
from xml.sax.saxutils import escape
import numpy as np
from util.file import list_dirs, list_files
policies = ["IN_ALL_LANGS", "IN_ANY_LANG"]

@@ -1,8 +1,9 @@
from nltk.corpus import stopwords
from data.languages import NLTK_LANGMAP
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from src.data.languages import NLTK_LANGMAP
def preprocess_documents(documents, lang):
tokens = NLTKStemTokenizer(lang, verbose=True)

@@ -1,8 +1,9 @@
import math
import numpy as np
from scipy.stats import t
from joblib import Parallel, delayed
from scipy.sparse import csr_matrix, csc_matrix
from scipy.stats import t
def get_probs(tpr, fpr, pc):

@@ -1,66 +0,0 @@
import os
from torchtext.vocab import Vectors
import torch
from abc import ABC, abstractmethod
from util.SIF_embed import *
class PretrainedEmbeddings(ABC):
def __init__(self):
super().__init__()
@abstractmethod
def vocabulary(self): pass
@abstractmethod
def dim(self): pass
@classmethod
def reindex(cls, words, word2index):
if isinstance(words, dict):
words = list(zip(*sorted(words.items(), key=lambda x: x[1])))[0]
source_idx, target_idx = [], []
for i, word in enumerate(words):
if word not in word2index: continue
j = word2index[word]
source_idx.append(i)
target_idx.append(j)
source_idx = np.asarray(source_idx)
target_idx = np.asarray(target_idx)
return source_idx, target_idx
class FastTextWikiNews(Vectors):
url_base = 'Cant auto-download MUSE embeddings'
path = '../embeddings/wiki.multi.{}.vec'
_name = '/wiki.multi.{}.vec'
def __init__(self, cache, language="en", **kwargs):
url = self.url_base.format(language)
name = cache + self._name.format(language)
super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)
class FastTextMUSE(PretrainedEmbeddings):
def __init__(self, path, lang, limit=None):
super().__init__()
assert os.path.exists(path), print(f'pre-trained vectors not found in {path}')
self.embed = FastTextWikiNews(path, lang, max_vectors=limit)
def vocabulary(self):
return set(self.embed.stoi.keys())
def dim(self):
return self.embed.dim
def extract(self, words):
source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
extraction = torch.zeros((len(words), self.dim()))
extraction[source_idx] = self.embed.vectors[target_idx]
return extraction

@@ -1,102 +0,0 @@
from abc import ABC, abstractmethod
import torch, torchtext
# import gensim
# import os
import numpy as np
# class KeyedVectors:
#
# def __init__(self, word2index, weights):
# assert len(word2index)==weights.shape[0], 'wrong number of dimensions'
# index2word = {i:w for w,i in word2index.items()}
# assert len([i for i in range(len(index2word)) if i not in index2word])==0, 'gaps in indexing not allowed'
# self.word2index = word2index
# self.index2word = index2word
# self.weights = weights
#
# def extract(self, words):
# dim = self.weights.shape[1]
# v_size = len(words)
#
# source_idx, target_idx = [], []
# for i,word in enumerate(words):
# if word not in self.word2index: continue
# j = self.word2index[word]
# source_idx.append(i)
# target_idx.append(j)
#
# extraction = np.zeros((v_size, dim))
# extraction[np.asarray(source_idx)] = self.weights[np.asarray(target_idx)]
#
# return extraction
# class PretrainedEmbeddings(ABC):
#
# def __init__(self):
# super().__init__()
#
# @abstractmethod
# def vocabulary(self): pass
#
# @abstractmethod
# def dim(self): pass
#
# @classmethod
# def reindex(cls, words, word2index):
# source_idx, target_idx = [], []
# for i, word in enumerate(words):
# if word not in word2index: continue
# j = word2index[word]
# source_idx.append(i)
# target_idx.append(j)
# source_idx = np.asarray(source_idx)
# target_idx = np.asarray(target_idx)
# return source_idx, target_idx
# class GloVe(PretrainedEmbeddings):
#
# def __init__(self, setname='840B', path='./vectors_cache', max_vectors=None):
# super().__init__()
# print(f'Loading GloVe pretrained vectors from torchtext')
# self.embed = torchtext.vocab.GloVe(setname, cache=path, max_vectors=max_vectors)
# print('Done')
#
# def vocabulary(self):
# return set(self.embed.stoi.keys())
#
# def dim(self):
# return self.embed.dim
#
# def extract(self, words):
# source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
# extraction = torch.zeros((len(words), self.dim()))
# extraction[source_idx] = self.embed.vectors[target_idx]
# return extraction
# class Word2Vec(PretrainedEmbeddings):
#
# def __init__(self, path, limit=None):
# super().__init__()
# print(f'Loading word2vec pretrained vectors from {path}')
# assert os.path.exists(path), print(f'pre-trained keyed vectors not found in {path}')
# self.embed = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True, limit=limit)
# self.word2index={w:i for i,w in enumerate(self.embed.index2word)}
# print('Done')
#
# def vocabulary(self):
# return set(self.word2index.keys())
#
# def dim(self):
# return self.embed.vector_size
#
# def extract(self, words):
# source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.word2index)
# extraction = np.zeros((len(words), self.dim()))
# extraction[source_idx] = self.embed.vectors[target_idx]
# extraction = torch.from_numpy(extraction).float()
# return extraction

@@ -1,74 +0,0 @@
from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square
import numpy as np
def zscores(x, axis=0): #scipy.stats.zscores does not avoid division by 0, which can indeed occur
std = np.clip(np.std(x, ddof=1, axis=axis), 1e-5, None)
mean = np.mean(x, axis=axis)
return (x - mean) / std
def supervised_embeddings_tfidf(X,Y):
tfidf_norm = X.sum(axis=0)
tfidf_norm[tfidf_norm==0] = 1
F = (X.T).dot(Y) / tfidf_norm.T
return F
def supervised_embeddings_ppmi(X,Y):
Xbin = X>0
D = X.shape[0]
Pxy = (Xbin.T).dot(Y)/D
Px = Xbin.sum(axis=0)/D
Py = Y.sum(axis=0)/D
F = np.asarray(Pxy/(Px.T*Py))
F = np.maximum(F, 1.0)
F = np.log(F)
return F
def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents=25000):
D = X.shape[0]
if D>max_documents:
print(f'sampling {max_documents}')
random_sample = np.random.permutation(D)[:max_documents]
X = X[random_sample]
Y = Y[random_sample]
cell_matrix = get_supervised_matrix(X, Y)
F = get_tsr_matrix(cell_matrix, tsr_score_funtion=tsr_function).T
return F
def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, lang='None', binary_structural_problems=-1, method='dotn', dozscore=True):
if max_label_space != 0:
print('computing supervised embeddings...')
nC = Y.shape[1]
if method=='ppmi':
F = supervised_embeddings_ppmi(X, Y)
elif method == 'dotn':
F = supervised_embeddings_tfidf(X, Y)
elif method == 'ig':
F = supervised_embeddings_tsr(X, Y, information_gain)
elif method == 'chi2':
F = supervised_embeddings_tsr(X, Y, chi_square)
if dozscore:
F = zscores(F, axis=0)
# Dumping F-matrix for further studies
dump_it = False
if dump_it:
with open(f'/home/andreapdr/funneling_pdr/src/dumps/WCE_{lang}.tsv', 'w') as outfile:
np.savetxt(outfile, F, delimiter='\t')
with open(f'/home/andreapdr/funneling_pdr/src/dumps/dict_WCE_{lang}.tsv', 'w') as outfile:
for token in voc.keys():
outfile.write(token+'\n')
return F

@@ -1,11 +0,0 @@
#!/usr/bin/env bash
dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
logfile=../log/log10run_dl_jrc.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
done

@@ -1,11 +0,0 @@
#!/usr/bin/env bash
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
logfile=../log/log10run_dl_rcv.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
done

@@ -1,12 +0,0 @@
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
logfile=./results/10run_jrc_final_results.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
python main_multimodal_cls.py $dataset -o $logfile -P -z -c --l2
python main_multimodal_cls.py $dataset -o $logfile -S -z -c --l2
python main_multimodal_cls.py $dataset -o $logfile -U -z -c --l2
done

@@ -1,16 +0,0 @@
dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
logfile=./results/funnelling_10run_jrc_CIKM.csv
runs='6 7 8 9' #0 1 2 3 4 5
for run in $runs
do
dataset=$dataset_path$run.pickle
#python main_gFun.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 Pr(views) concatenated (done up to run5)
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 views concatenated
#python main_gFun.py $dataset -o $logfile -P -U -S -c -r -a -z --l2 --allprob
#python main_gFun.py $dataset -o $logfile -P -U -c -r -a -z --l2 --allprob
#python main_gFun.py $dataset -o $logfile -P -S -c -r -z --l2 --allprob
#python main_gFun.py $dataset -o $logfile -P -U -c -r -z --l2 --allprob
#python main_gFun.py $dataset -o $logfile -c -P -U -r -z --l2
#python main_gFun.py $dataset -o $logfile -c -P -U -S -r -z --l2
done

@@ -1,15 +0,0 @@
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
logfile=./results/10run_rcv_final_results.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
python main_multimodal_cls.py $dataset -o $logfile -P -z -c --l2
python main_multimodal_cls.py $dataset -o $logfile -S -z -c --l2
python main_multimodal_cls.py $dataset -o $logfile -U -z -c --l2
done

@@ -1,16 +0,0 @@
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
logfile=./results/funnelling_10run_rcv_CIKM_allprob_concatenated.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
#python main_gFun.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 Pr(views) concatenated
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 views concatenated
#python main_gFun.py $dataset -o $logfile -P -U -c -r -a -z --l2 --allprob
#python main_gFun.py $dataset -o $logfile -P -U -S -c -r -a -z --l2 --allprob
#python main_gFun.py $dataset -o $logfile -P -S -c -r -z --l2 --allprob
#python main_gFun.py $dataset -o $logfile -P -U -c -r -z --l2 --allprob
#python main_gFun.py $dataset -o $logfile -c -P -U -r -z --l2
#python main_gFun.py $dataset -o $logfile -c -P -U -S -r -z --l2
done

@@ -1,14 +0,0 @@
#!/usr/bin/env bash
dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run#
runs='1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
modelpath=/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run$runs
python main_mbert_extractor.py --dataset $dataset --modelpath $modelpath
done
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle
python main_mbert_extractor.py --dataset $dataset --modelpath $modelpath

@@ -1,329 +0,0 @@
import argparse
import torch.nn as nn
from torch.optim.lr_scheduler import StepLR
from dataset_builder import MultilingualDataset
from learning.transformers import load_muse_embeddings
from models.lstm_class import RNNMultilingualClassifier
from util.csv_log import CSVLog
from util.early_stop import EarlyStopping
from util.common import *
from util.file import create_if_not_exist
from time import time
from tqdm import tqdm
from util.evaluation import evaluate
from util.file import get_file_name
# import pickle
allowed_nets = {'rnn'}
# instantiates the net, initializes the model parameters, and sets embeddings trainable if requested
def init_Net(nC, multilingual_index, xavier_uniform=True):
net=opt.net
assert net in allowed_nets, f'{net} not supported, valid ones are={allowed_nets}'
# instantiate the required net
if net=='rnn':
only_post = opt.posteriors and (not opt.pretrained) and (not opt.supervised)
if only_post:
print('working on ONLY POST mode')
model = RNNMultilingualClassifier(
output_size=nC,
hidden_size=opt.hidden,
lvocab_size=multilingual_index.l_vocabsize(),
learnable_length=opt.learnable,
lpretrained=multilingual_index.l_embeddings(),
drop_embedding_range=multilingual_index.sup_range,
drop_embedding_prop=opt.sup_drop,
post_probabilities=opt.posteriors,
only_post=only_post,
bert_embeddings=opt.mbert
)
# weight initialization
if xavier_uniform:
for p in model.parameters():
if p.dim() > 1 and p.requires_grad:
nn.init.xavier_uniform_(p)
if opt.tunable:
# this has to be performed *after* Xavier initialization is done,
# otherwise the pretrained embedding parameters will be overrided
model.finetune_pretrained()
return model.cuda()
def set_method_name():
method_name = f'{opt.net}(H{opt.hidden})'
if opt.pretrained:
method_name += f'-Muse'
if opt.supervised:
method_name += f'-WCE'
if opt.posteriors:
method_name += f'-Posteriors'
if opt.mbert:
method_name += f'-mBert'
if (opt.pretrained or opt.supervised) and opt.tunable:
method_name += '-(trainable)'
else:
method_name += '-(static)'
if opt.learnable > 0:
method_name += f'-Learnable{opt.learnable}'
return method_name
def init_optimizer(model, lr):
return torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr, weight_decay=opt.weight_decay)
def init_logfile(method_name, opt):
logfile = CSVLog(opt.log_file, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse'])
logfile.set_default('dataset', opt.dataset)
logfile.set_default('run', opt.seed)
logfile.set_default('method', method_name)
assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} ' \
f'and run {opt.seed} already calculated'
return logfile
# loads the MUSE embeddings if requested, or returns empty dictionaries otherwise
def load_pretrained_embeddings(we_path, langs):
lpretrained = lpretrained_vocabulary = none_dict(langs)
if opt.pretrained:
lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1)
lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs}
return lpretrained, lpretrained_vocabulary
def get_lr(optimizer):
for param_group in optimizer.param_groups:
return param_group['lr']
def train(model, batcher, ltrain_index, ltrain_posteriors, ltrain_bert, lytr, tinit, logfile, criterion, optim, epoch, method_name):
_dataset_path = opt.dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1]
loss_history = []
model.train()
for idx, (batch, post, bert_emb, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, ltrain_bert, lytr)):
optim.zero_grad()
# _out = model(batch, post, bert_emb, lang)
loss = criterion(model(batch, post, bert_emb, lang), target)
loss.backward()
clip_gradient(model)
optim.step()
loss_history.append(loss.item())
if idx % opt.log_interval == 0:
interval_loss = np.mean(loss_history[-opt.log_interval:])
print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
mean_loss = np.mean(interval_loss)
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
return mean_loss
def test(model, batcher, ltest_index, ltest_posteriors, lte_bert, lyte, tinit, epoch, logfile, criterion, measure_prefix):
loss_history = []
model.eval()
langs = sorted(ltest_index.keys())
predictions = {l:[] for l in langs}
yte_stacked = {l:[] for l in langs}
batcher.init_offset()
for batch, post, bert_emb, target, lang in tqdm(batcher.batchify(ltest_index, ltest_posteriors, lte_bert, lyte), desc='evaluation: '):
logits = model(batch, post, bert_emb, lang)
loss = criterion(logits, target).item()
prediction = predict(logits)
predictions[lang].append(prediction)
yte_stacked[lang].append(target.detach().cpu().numpy())
loss_history.append(loss)
ly = {l:np.vstack(yte_stacked[l]) for l in langs}
ly_ = {l:np.vstack(predictions[l]) for l in langs}
l_eval = evaluate(ly, ly_)
metrics = []
for lang in langs:
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
if measure_prefix == 'te':
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
mean_loss = np.mean(loss_history)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit)
return Mf1
# ----------------------------------------------------------------------------------------------------------------------
def main():
DEBUGGING = False
method_name = set_method_name()
logfile = init_logfile(method_name, opt)
# Loading the dataset
data = MultilingualDataset.load(opt.dataset)
# data.set_view(languages=['it', 'fr']) # Testing with less langs
data.show_dimensions()
langs = data.langs()
l_devel_raw, l_devel_target = data.training(target_as_csr=True)
l_test_raw, l_test_target = data.test(target_as_csr=True)
# Loading the MUSE pretrained embeddings (only if requested)
lpretrained, lpretrained_vocabulary = load_pretrained_embeddings(opt.we_path, langs)
# lpretrained_vocabulary = none_dict(langs) # do not keep track of words known in pretrained embeddings vocabulary that are also present in test set
# Data preparation: indexing / splitting / embedding matrices (pretrained + supervised) / posterior probs
multilingual_index = MultilingualIndex()
multilingual_index.index(l_devel_raw, l_devel_target, l_test_raw, lpretrained_vocabulary)
multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=opt.seed)
multilingual_index.embedding_matrices(lpretrained, opt.supervised)
if opt.posteriors:
if DEBUGGING:
import pickle
with open('/home/andreapdr/funneling_pdr/dumps/posteriors_jrc_run0.pickle', 'rb') as infile:
data_post = pickle.load(infile)
lPtr = data_post[0]
lPva = data_post[1]
lPte = data_post[2]
print('## DEBUGGING MODE: loaded dumped posteriors for jrc run0')
else:
lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=5000)
else:
lPtr, lPva, lPte = None, None, None
if opt.mbert:
_dataset_path = opt.dataset.split('/')[-1].split('_')
_model_folder = _dataset_path[0] + '_' + _dataset_path[-1].replace('.pickle', '')
# print(f'Model Folder: {_model_folder}')
if DEBUGGING:
with open('/home/andreapdr/funneling_pdr/dumps/mBert_jrc_run0.pickle', 'rb') as infile:
data_embed = pickle.load(infile)
tr_bert_embeddings = data_embed[0]
va_bert_embeddings = data_embed[1]
te_bert_embeddings = data_embed[2]
print('## DEBUGGING MODE: loaded dumped mBert embeddings for jrc run0')
else:
tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings \
= multilingual_index.bert_embeddings(f'/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-{_model_folder}/')
else:
tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings = None, None, None
# Model initialization
model = init_Net(data.num_categories(), multilingual_index)
optim = init_optimizer(model, lr=opt.lr)
criterion = torch.nn.BCEWithLogitsLoss().cuda()
lr_scheduler = StepLR(optim, step_size=25, gamma=0.5)
batcher_train = Batch(opt.batch_size, batches_per_epoch=10, languages=langs, lpad=multilingual_index.l_pad())
batcher_eval = Batch(opt.batch_size, batches_per_epoch=-1, languages=langs, lpad=multilingual_index.l_pad())
tinit = time()
create_if_not_exist(opt.checkpoint_dir)
early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience,
checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}')
l_train_index, l_train_target = multilingual_index.l_train()
l_val_index, l_val_target = multilingual_index.l_val()
l_test_index = multilingual_index.l_test_index()
print('-'*80)
print('Start training')
for epoch in range(1, opt.nepochs + 1):
train(model, batcher_train, l_train_index, lPtr, tr_bert_embeddings, l_train_target, tinit, logfile, criterion, optim, epoch, method_name)
lr_scheduler.step() # reduces the learning rate
# validation
macrof1 = test(model, batcher_eval, l_val_index, lPva, va_bert_embeddings, l_val_target, tinit, epoch, logfile, criterion, 'va')
early_stop(macrof1, epoch)
if opt.test_each>0:
if (opt.plotmode and (epoch==1 or epoch%opt.test_each==0)) or (not opt.plotmode and epoch%opt.test_each==0 and epoch<opt.nepochs):
test(model, batcher_eval, l_test_index, lPte, l_test_target, tinit, epoch, logfile, criterion, 'te')
if early_stop.STOP:
print('[early-stop] STOP')
if not opt.plotmode: # with plotmode activated, early-stop is ignored
break
# training is over
# restores the best model according to the Mf1 of the validation set (only when plotmode==False)
# stoptime = early_stop.stop_time - tinit
# stopepoch = early_stop.best_epoch
# logfile.add_row(epoch=stopepoch, measure=f'early-stop', value=early_stop.best_score, timelapse=stoptime)
if opt.plotmode==False:
print('-' * 80)
print('Training over. Performing final evaluation')
# torch.cuda.empty_cache()
model = early_stop.restore_checkpoint()
if opt.val_epochs>0:
print(f'running last {opt.val_epochs} training epochs on the validation set')
for val_epoch in range(1, opt.val_epochs + 1):
batcher_train.init_offset()
train(model, batcher_train, l_val_index, lPva, va_bert_embeddings, l_val_target, tinit, logfile, criterion, optim, epoch+val_epoch, method_name)
# final test
print('Training complete: testing')
test(model, batcher_eval, l_test_index, lPte, te_bert_embeddings, l_test_target, tinit, epoch, logfile, criterion, 'te')
# ----------------------------------------------------------------------------------------------------------------------
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Neural text classification with Word-Class Embeddings')
parser.add_argument('dataset', type=str, metavar='datasetpath', help=f'path to the pickled dataset')
parser.add_argument('--batch-size', type=int, default=50, metavar='int', help='input batch size (default: 100)')
parser.add_argument('--batch-size-test', type=int, default=250, metavar='int', help='batch size for testing (default: 250)')
parser.add_argument('--nepochs', type=int, default=200, metavar='int', help='number of epochs (default: 200)')
parser.add_argument('--patience', type=int, default=10, metavar='int', help='patience for early-stop (default: 10)')
parser.add_argument('--plotmode', action='store_true', default=False, help='in plot mode executes a long run in order '
'to generate enough data to produce trend plots (test-each should be >0. This mode is '
'used to produce plots, and does not perform an evaluation on the test set.')
parser.add_argument('--hidden', type=int, default=512, metavar='int', help='hidden lstm size (default: 512)')
parser.add_argument('--lr', type=float, default=1e-3, metavar='float', help='learning rate (default: 1e-3)')
parser.add_argument('--weight_decay', type=float, default=0, metavar='float', help='weight decay (default: 0)')
parser.add_argument('--sup-drop', type=float, default=0.5, metavar='[0.0, 1.0]', help='dropout probability for the supervised matrix (default: 0.5)')
parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)')
parser.add_argument('--svm-max-docs', type=int, default=1000, metavar='int', help='maximum number of documents by '
'language used to train the calibrated SVMs (only used if --posteriors is active)')
parser.add_argument('--log-interval', type=int, default=10, metavar='int', help='how many batches to wait before printing training status')
parser.add_argument('--log-file', type=str, default='../log/log.csv', metavar='str', help='path to the log csv file')
parser.add_argument('--test-each', type=int, default=0, metavar='int', help='how many epochs to wait before invoking test (default: 0, only at the end)')
parser.add_argument('--checkpoint-dir', type=str, default='../checkpoint', metavar='str', help='path to the directory containing checkpoints')
parser.add_argument('--net', type=str, default='rnn', metavar='str', help=f'net, one in {allowed_nets}')
parser.add_argument('--pretrained', action='store_true', default=False, help='use MUSE pretrained embeddings')
parser.add_argument('--supervised', action='store_true', default=False, help='use supervised embeddings')
parser.add_argument('--posteriors', action='store_true', default=False, help='concatenate posterior probabilities to doc embeddings')
parser.add_argument('--learnable', type=int, default=0, metavar='int', help='dimension of the learnable embeddings (default 0)')
parser.add_argument('--val-epochs', type=int, default=1, metavar='int', help='number of training epochs to perform on the '
'validation set once training is over (default 1)')
parser.add_argument('--we-path', type=str, default='../embeddings', metavar='str',
help=f'path to MUSE pretrained embeddings')
parser.add_argument('--max-label-space', type=int, default=300, metavar='int', help='larger dimension allowed for the '
'feature-label embedding (if larger, then PCA with this number of components is applied '
'(default 300)')
parser.add_argument('--force', action='store_true', default=False, help='do not check if this experiment has already been run')
parser.add_argument('--tunable', action='store_true', default=False,
help='pretrained embeddings are tunable from the beginning (default False, i.e., static)')
parser.add_argument('--mbert', action='store_true', default=False,
help='use mBert embeddings')
opt = parser.parse_args()
assert torch.cuda.is_available(), 'CUDA not available'
assert not opt.plotmode or opt.test_each > 0, 'plot mode implies --test-each>0'
# if opt.pickle_dir: opt.pickle_path = join(opt.pickle_dir, f'{opt.dataset}.pickle')
torch.manual_seed(opt.seed)
main()

@@ -1,127 +0,0 @@
import os
from dataset_builder import MultilingualDataset
from util.evaluation import *
from optparse import OptionParser
from util.file import exists
from util.results import PolylingualClassificationResults
from util.util import get_learner, get_params
parser = OptionParser()
parser.add_option("-d", "--dataset", dest="dataset",
help="Path to the multilingual dataset processed and stored in .pickle format",
default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
parser.add_option("-o", "--output", dest="output",
help="Result file", type=str, default='./results/results.csv')
parser.add_option("-e", "--mode-embed", dest="mode_embed",
help="Set the embedding to be used [none, unsupervised, supervised, both]", type=str, default='none')
parser.add_option("-w", "--we-path", dest="we_path",
help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/')
parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str,
default='MUSE')
parser.add_option("-s", "--set_c", dest="set_c",type=float,
help="Set the C parameter", default=1)
parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
help="Optimize hyperparameters", default=False)
parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,
help="Number of parallel jobs (default is -1, all)", default=-1)
parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
help="If smaller than number of target classes, PCA will be applied to supervised matrix. "
"If set to 0 it will automatically search for the best number of components. "
"If set to -1 it will apply PCA to the vstacked supervised matrix (PCA dim set to 50 atm)",
default=300)
parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix."
" If set to 0 it will automatically search for the best number of components", default=300)
parser.add_option("-l", dest="lang", type=str)
if __name__ == '__main__':
(op, args) = parser.parse_args()
assert exists(op.dataset), 'Unable to find file '+str(op.dataset)
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
dataset_file = os.path.basename(op.dataset)
results = PolylingualClassificationResults('./results/PLE_results.csv')
data = MultilingualDataset.load(op.dataset)
data.show_dimensions()
# data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10)))
# data.set_view(languages=[op.lang])
# data.set_view(categories=list(range(10)))
lXtr, lytr = data.training()
lXte, lyte = data.test()
if op.set_c != -1:
meta_parameters = None
else:
meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}]
# Embeddings and WCE config
_available_mode = ['none', 'unsupervised', 'supervised', 'both']
_available_type = ['MUSE', 'FastText']
assert op.mode_embed in _available_mode, f'{op.mode_embed} not in {_available_mode}'
assert op.we_type in _available_type, f'{op.we_type} not in {_available_type}'
if op.mode_embed == 'none':
config = {'unsupervised': False,
'supervised': False,
'we_type': None}
_config_id = 'None'
elif op.mode_embed == 'unsupervised':
config = {'unsupervised': True,
'supervised': False,
'we_type': op.we_type}
_config_id = 'M'
elif op.mode_embed == 'supervised':
config = {'unsupervised': False,
'supervised': True,
'we_type': None}
_config_id = 'F'
elif op.mode_embed == 'both':
config = {'unsupervised': True,
'supervised': True,
'we_type': op.we_type}
_config_id = 'M+F'
config['reduction'] = 'PCA'
config['max_label_space'] = op.max_labels_S
config['dim_reduction_unsupervised'] = op.max_labels_U
# config['post_pca'] = op.post_pca
# config['plot_covariance_matrices'] = True
result_id = dataset_file + 'MLE_andrea' + _config_id + ('_optimC' if op.optimc else '')
ple = PolylingualEmbeddingsClassifier(wordembeddings_path='/home/andreapdr/CLESA/',
config = config,
learner=get_learner(calibrate=False),
c_parameters=get_params(dense=False),
n_jobs=op.n_jobs)
print('# Fitting ...')
ple.fit(lXtr, lytr)
print('# Evaluating ...')
ple_eval = evaluate_method(ple, lXte, lyte)
metrics = []
for lang in lXte.keys():
macrof1, microf1, macrok, microk = ple_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
results.add_row('MLE', 'svm', _config_id, config['we_type'],
'no','no', op.optimc, op.dataset.split('/')[-1], ple.time,
lang, macrof1, microf1, macrok, microk, '')
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))

@@ -1,155 +0,0 @@
import os
from dataset_builder import MultilingualDataset
# from learning.learners import *
# from learning.learners import FunnellingMultimodal
from learning.transformers import PosteriorProbabilitiesEmbedder, TfidfVectorizerMultilingual, WordClassEmbedder, MuseEmbedder, FeatureSet2Posteriors, Voting
from util.evaluation import *
from optparse import OptionParser
from util.file import exists
from util.results import PolylingualClassificationResults
from sklearn.svm import SVC
parser = OptionParser()
# parser.add_option("-d", "--dataset", dest="dataset",
# help="Path to the multilingual dataset processed and stored in .pickle format",
# default="../rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
parser.add_option("-o", "--output", dest="output",
help="Result file", type=str, default='./results/results.csv')
parser.add_option("-P", "--probs", dest="posteriors", action='store_true',
help="Add posterior probabilities to the document embedding representation", default=False)
parser.add_option("-S", "--supervised", dest="supervised", action='store_true',
help="Add supervised (Word-Class Embeddings) to the document embedding representation", default=False)
parser.add_option("-U", "--pretrained", dest="pretrained", action='store_true',
help="Add pretrained MUSE embeddings to the document embedding representation", default=False)
parser.add_option("-w", "--we-path", dest="we_path",
help="Path to the MUSE polylingual word embeddings", default='../embeddings')
parser.add_option("-s", "--set_c", dest="set_c",type=float,
help="Set the C parameter", default=1)
parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
help="Optimize hyperparameters", default=False)
parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,
help="Number of parallel jobs (default is -1, all)", default=-1)
parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
default=300)
parser.add_option("-r", "--remove-pc", dest="sif", action='store_true',
help="Remove common component when computing dot product of word embedding matrices", default=False)
# parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
# help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix."
# " If set to 0 it will automatically search for the best number of components", default=300)
# parser.add_option("-a", dest="post_pca",
# help="If set to True, will apply PCA to the z-space (posterior probabilities stacked along with "
# "embedding space", default=False)
def get_learner(calibrate=False, kernel='linear'):
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, gamma='auto')
def get_params(dense=False):
if not op.optimc:
return None
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
kernel = 'rbf' if dense else 'linear'
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
#######################################################################################################################
if __name__ == '__main__':
(op, args) = parser.parse_args()
assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)'
dataset = args[0]
assert exists(dataset), 'Unable to find file '+str(dataset)
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
dataset_file = os.path.basename(dataset)
results = PolylingualClassificationResults(op.output)
data = MultilingualDataset.load(dataset)
data.show_dimensions()
lXtr, lytr = data.training()
lXte, lyte = data.test()
meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
# result_id = f'{dataset_file}_Prob{op.posteriors}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}'
result_id = f'{dataset_file}_ProbPost={op.posteriors}_WCE={op.supervised}(PCA={op.max_labels_S})_' \
f'MUSE={op.pretrained}_weight={"todo"}_l2={"todo"}_zscore={"todo"}{"_optimC" if op.optimc else ""}'
print(f'{result_id}')
# text preprocessing
tfidfvectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
lXtr = tfidfvectorizer.fit_transform(lXtr, lytr)
lXte = tfidfvectorizer.transform(lXte)
lV = tfidfvectorizer.vocabulary()
classifiers = []
if op.posteriors:
classifiers.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None))
if op.supervised:
classifiers.append(FeatureSet2Posteriors(WordClassEmbedder(max_label_space=op.max_labels_S)))
if op.pretrained:
classifiers.append(FeatureSet2Posteriors(MuseEmbedder(op.we_path, lV=lV)))
classifier = Voting(*classifiers)
print('# Fitting ...')
classifier.fit(lXtr, lytr)
print('\n# Evaluating ...')
l_eval = evaluate_method(classifier, lXte, lyte)
# renaming arguments to be printed on log
_id = ''
_id_conf = [op.posteriors, op.supervised, op.pretrained]
_id_name = ['+P', '+W', '+M']
for i, conf in enumerate(_id_conf):
if conf:
_id += _id_name[i]
_id = _id.lstrip('+')
_dataset_path = dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1]
metrics = []
for lang in lXte.keys():
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
results.add_row(method='Voting',
learner='svm',
optimp=op.optimc,
sif=op.sif,
zscore='todo',
l2='todo',
wescaler='todo',
pca=op.max_labels_S,
id=_id,
dataset=dataset_id,
time='todo',
lang=lang,
macrof1=macrof1,
microf1=microf1,
macrok=macrok,
microk=microk,
notes='')
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))

View File

@ -1,390 +0,0 @@
from dataset_builder import MultilingualDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch
from util.common import predict
from time import time
from util.csv_log import CSVLog
from util.evaluation import evaluate
from util.early_stop import EarlyStopping
from torch.optim.lr_scheduler import StepLR
from sklearn.model_selection import train_test_split
from copy import deepcopy
import argparse
# from torch.utils.tensorboard import SummaryWriter
def check_sentences(sentences):
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
for sentence in sentences:
converted = tokenizer.convert_ids_to_tokens([token for token in sentence.numpy() if token != 0])  # skip padding (id 0)
print(converted)
return
def get_model(n_out):
print('# Initializing model ...')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=n_out)
return model
def set_method_name():
return 'mBERT'
def init_optimizer(model, lr):
# return AdamW(model.parameters(), lr=lr, weight_decay=opt.weight_decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in model.named_parameters()
if not any(nd in n for nd in no_decay)],
'weight_decay': opt.weight_decay},
{'params': [p for n, p in model.named_parameters()
if any(nd in n for nd in no_decay)],
'weight_decay': 0.0}  # bias and LayerNorm parameters are exempt from weight decay
]
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
return optimizer
def init_logfile(method_name, opt):
logfile = CSVLog(opt.log_file, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse'])
logfile.set_default('dataset', opt.dataset)
logfile.set_default('run', opt.seed)
logfile.set_default('method', method_name)
assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} ' \
f'and run {opt.seed} already calculated'
return logfile
def get_lr(optimizer):
for param_group in optimizer.param_groups:
return param_group['lr']
def get_dataset_name(datapath):
possible_splits = [str(i) for i in range(10)]
splitted = datapath.split('_')
id_split = splitted[-1].split('.')[0][-1]
if id_split in possible_splits:
dataset_name = splitted[0].split('/')[-1]
return f'{dataset_name}_run{id_split}'
elif splitted[-2].split('.')[0] == 'full':
dataset_name = splitted[0].split('/')[-1]
return f'{dataset_name}_fullrun'
def load_datasets(datapath):
data = MultilingualDataset.load(datapath)
# data.set_view(languages=['it']) #, categories=[0, 1, 2, 3, 4]) # Testing with less langs
data.show_dimensions()
l_devel_raw, l_devel_target = data.training(target_as_csr=False)
l_test_raw, l_test_target = data.test(target_as_csr=False)
return l_devel_raw, l_devel_target, l_test_raw, l_test_target
def do_tokenization(l_dataset, max_len=512, verbose=True):
if verbose:
print('# Starting Tokenization ...')
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
langs = l_dataset.keys()
l_tokenized = {}
for lang in langs:
l_tokenized[lang] = tokenizer(l_dataset[lang],
truncation=True,
max_length=max_len,
padding='max_length')
return l_tokenized
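# Illustrative toy usage (not part of the original script):
#   l_tok = do_tokenization({'en': ['a short document'], 'it': ['un breve documento']}, max_len=16)
#   l_tok['en']['input_ids']  # -> list of token-id lists, padded/truncated to max_len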
class TrainingDataset(Dataset):
"""
data: dict of lang specific tokenized data
labels: dict of lang specific targets
"""
def __init__(self, data, labels):
self.langs = data.keys()
self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)}
for i, lang in enumerate(self.langs):
_data = data[lang]['input_ids']
_data = np.array(_data)
_labels = labels[lang]
_lang_value = np.full(len(_data), self.lang_ids[lang])
if i == 0:
self.data = _data
self.labels = _labels
self.lang_index = _lang_value
else:
self.data = np.vstack((self.data, _data))
self.labels = np.vstack((self.labels, _labels))
self.lang_index = np.concatenate((self.lang_index, _lang_value))
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
x = self.data[idx]
y = self.labels[idx]
lang = self.lang_index[idx]
return x, torch.tensor(y, dtype=torch.float), lang
def get_lang_ids(self):
return self.lang_ids
def get_nclasses(self):
if hasattr(self, 'labels'):
return len(self.labels[0])
else:
print('Method called before init!')
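# Note: each item returned by TrainingDataset.__getitem__ is a triple
# (input_ids row, multi-hot label vector as a float tensor, integer language id),
# so a DataLoader over it yields language-mixed batches that can be routed back to
# their language via get_lang_ids().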
def freeze_encoder(model):
for param in model.base_model.parameters():
param.requires_grad = False
return model
def check_param_grad_status(model):
print('#' * 50)
print('Model parameter status:')
for name, child in model.named_children():
trainable = False
for param in child.parameters():
if param.requires_grad:
trainable = True
if not trainable:
print(f'{name} is frozen')
else:
print(f'{name} is not frozen')
print('#' * 50)
def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile, writer):
_dataset_path = opt.dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1]
loss_history = []
model.train()
for idx, (batch, target, lang_idx) in enumerate(train_dataloader):
optim.zero_grad()
out = model(batch.cuda())
logits = out[0]
loss = criterion(logits, target.cuda())
loss.backward()
# clip_gradient(model)
optim.step()
loss_history.append(loss.item())
if writer is not None:
_n_step = (epoch - 1) * (len(train_dataloader)) + idx
writer.add_scalar('Loss_step/Train', loss, _n_step)
# Check tokenized sentences consistency
# check_sentences(batch.cpu())
if idx % opt.log_interval == 0:
interval_loss = np.mean(loss_history[-opt.log_interval:])
print(
f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
mean_loss = np.mean(loss_history)  # mean training loss over the whole epoch
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
return mean_loss
def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, measure_prefix, writer):
print('# Validating model ...')
loss_history = []
model.eval()
langs = lang_ids.keys()
id_2_lang = {v: k for k, v in lang_ids.items()}
predictions = {l: [] for l in langs}
yte_stacked = {l: [] for l in langs}
for batch, target, lang_idx in test_dataloader:
out = model(batch.cuda())
logits = out[0]
loss = criterion(logits, target.cuda()).item()
prediction = predict(logits)
loss_history.append(loss)
# Assigning prediction to dict in predictions and yte_stacked according to lang_idx
for i, pred in enumerate(prediction):
lang_pred = id_2_lang[lang_idx.numpy()[i]]
predictions[lang_pred].append(pred)
yte_stacked[lang_pred].append(target[i].detach().cpu().numpy())
ly = {l: np.vstack(yte_stacked[l]) for l in langs}
ly_ = {l: np.vstack(predictions[l]) for l in langs}
l_eval = evaluate(ly, ly_)
metrics = []
for lang in langs:
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
if measure_prefix == 'te':
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
if writer is not None:
writer.add_scalars('Eval Metrics', {'Mf1': Mf1, 'mF1': mF1, 'MK': MK, 'mk':mk}, epoch)
mean_loss = np.mean(loss_history)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit)
return Mf1
def get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop, max_val, seed):
l_split_va = deepcopy(l_tokenized_tr)
l_split_val_target = {l: [] for l in l_tokenized_tr.keys()}
l_split_tr = deepcopy(l_tokenized_tr)
l_split_tr_target = {l: [] for l in l_tokenized_tr.keys()}
for lang in l_tokenized_tr.keys():
val_size = int(min(len(l_tokenized_tr[lang]['input_ids']) * val_prop, max_val))
l_split_tr[lang]['input_ids'], l_split_va[lang]['input_ids'], l_split_tr_target[lang], l_split_val_target[
lang] = \
train_test_split(l_tokenized_tr[lang]['input_ids'], l_devel_target[lang], test_size=val_size,
random_state=seed, shuffle=True)
return l_split_tr, l_split_tr_target, l_split_va, l_split_val_target
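# Worked example: with val_prop=0.2 and max_val=2000, a language with 1,000 training documents
# gets a 200-document validation split, while a language with 50,000 documents is capped at 2,000.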
def main():
print('Running main ...')
DATAPATH = opt.dataset
MAX_LEN = 512
method_name = set_method_name()
logfile = init_logfile(method_name, opt)
l_devel_raw, l_devel_target, l_test_raw, l_test_target = load_datasets(DATAPATH)
l_tokenized_tr = do_tokenization(l_devel_raw, max_len=MAX_LEN)
l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = get_tr_val_split(l_tokenized_tr, l_devel_target,
val_prop=0.2, max_val=2000,
seed=opt.seed)
l_tokenized_te = do_tokenization(l_test_raw, max_len=MAX_LEN)
tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target)
va_dataset = TrainingDataset(l_split_va, l_split_val_target)
te_dataset = TrainingDataset(l_tokenized_te, l_test_target)
tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True)
va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=True)
te_dataloader = DataLoader(te_dataset, batch_size=2, shuffle=False)
# Initializing model
nC = tr_dataset.get_nclasses()
model = get_model(nC)
model = model.cuda()
criterion = torch.nn.BCEWithLogitsLoss().cuda()
optim = init_optimizer(model, lr=opt.lr)
lr_scheduler = StepLR(optim, step_size=25, gamma=0.1)
early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience,
checkpoint=f'/home/andreapdr/funneling_pdr/hug_checkpoint/{method_name}-{get_dataset_name(opt.dataset)}',
is_bert=True)
# Freezing encoder
# model = freeze_encoder(model)
check_param_grad_status(model)
# Tensorboard logger
# writer = SummaryWriter('../log/tensorboard_logs/')
# Training loop
tinit = time()
lang_ids = va_dataset.lang_ids
for epoch in range(1, opt.nepochs + 1):
print('# Start Training ...')
train(model, tr_dataloader, epoch, criterion, optim, method_name, tinit, logfile, writer=None)
lr_scheduler.step() # reduces the learning rate
# Validation
macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va', writer=None)
early_stop(macrof1, epoch)
if opt.test_each > 0:
if (opt.plotmode and (epoch == 1 or epoch % opt.test_each == 0)) or (
not opt.plotmode and epoch % opt.test_each == 0 and epoch < opt.nepochs):
test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te', writer=None)
if early_stop.STOP:
print('[early-stop] STOP')
if not opt.plotmode:
break
if not opt.plotmode:
print('-' * 80)
print('Training over. Performing final evaluation')
model = early_stop.restore_checkpoint()
model = model.cuda()
if opt.val_epochs > 0:
print(f'running last {opt.val_epochs} training epochs on the validation set')
for val_epoch in range(1, opt.val_epochs + 1):
train(model, va_dataloader, epoch + val_epoch, criterion, optim, method_name, tinit, logfile, writer=None)
# final test
print('Training complete: testing')
test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te', writer=None)
# writer.flush()
# writer.close()
exit('Code Executed!')
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Neural text classification with Word-Class Embeddings - mBert model')
parser.add_argument('--dataset', type=str,
default='/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle',
metavar='datasetpath', help=f'path to the pickled dataset')
parser.add_argument('--nepochs', type=int, default=200, metavar='int',
help='number of epochs (default: 200)')
parser.add_argument('--lr', type=float, default=2e-5, metavar='float',
help='learning rate (default: 2e-5)')
parser.add_argument('--weight_decay', type=float, default=0, metavar='float',
help='weight decay (default: 0)')
parser.add_argument('--patience', type=int, default=10, metavar='int',
help='patience for early-stop (default: 10)')
parser.add_argument('--log-interval', type=int, default=20, metavar='int',
help='how many batches to wait before printing training status')
parser.add_argument('--log-file', type=str, default='../log/log_mBert.csv', metavar='str',
help='path to the log csv file')
parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)')
parser.add_argument('--force', action='store_true', default=False,
help='do not check if this experiment has already been run')
parser.add_argument('--checkpoint-dir', type=str, default='../checkpoint', metavar='str',
help='path to the directory containing checkpoints')
parser.add_argument('--plotmode', action='store_true', default=False,
help='in plot mode, executes a long run in order '
'to generate enough data to produce trend plots (test-each should be > 0). This mode is '
'used to produce plots and does not perform an evaluation on the test set.')
parser.add_argument('--test-each', type=int, default=0, metavar='int',
help='how many epochs to wait before invoking test (default: 0, only at the end)')
parser.add_argument('--val-epochs', type=int, default=1, metavar='int',
help='number of training epochs to perform on the validation set once training is over (default 1)')
opt = parser.parse_args()
# Testing different parameters ...
opt.weight_decay = 0.01
opt.lr = 1e-5
opt.patience = 5
main()
# TODO: refactor .cuda() -> .to(device) in order to check if the process is faster on CPU given the bigger batch size
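# A minimal sketch of the device-agnostic pattern the TODO above refers to (illustrative only):
#   device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#   model = get_model(n_out).to(device)
#   for batch, target, lang_idx in train_dataloader:
#       logits = model(batch.to(device))[0]
#       loss = criterion(logits, target.to(device))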

View File

@ -1,110 +0,0 @@
from experiment_scripts.main_mbert import *
import pickle
class ExtractorDataset(Dataset):
"""
data: dict of lang specific tokenized data
labels: dict of lang specific targets
"""
def __init__(self, data):
self.langs = data.keys()
self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)}
for i, lang in enumerate(self.langs):
_data = data[lang]['input_ids']
_data = np.array(_data)
_lang_value = np.full(len(_data), self.lang_ids[lang])
if i == 0:
self.data = _data
self.lang_index = _lang_value
else:
self.data = np.vstack((self.data, _data))
self.lang_index = np.concatenate((self.lang_index, _lang_value))
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
x = self.data[idx]
lang = self.lang_index[idx]
return x, lang
def get_lang_ids(self):
return self.lang_ids
def feature_extractor(data, lang_ids, model_path='/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run0/'):
print('# Feature Extractor Mode...')
from transformers import BertConfig
config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True, num_labels=300)
model = BertForSequenceClassification.from_pretrained(model_path,
config=config).cuda()
"""
Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for
the output of each layer) of shape (batch_size, sequence_length, hidden_size)
"""
all_batch_embeddings = {}
id2lang = {v:k for k,v in lang_ids.items()}
with torch.no_grad():
for batch, target, lang_idx in data:
out = model(batch.cuda())
last_hidden_state = out[1][-1]
batch_embeddings = last_hidden_state[:, 0, :]
for i, l_idx in enumerate(lang_idx.numpy()):
if id2lang[l_idx] not in all_batch_embeddings.keys():
all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy()
else:
all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]],
batch_embeddings[i].detach().cpu().numpy()))
return all_batch_embeddings, id2lang
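# Note: with 'bert-base-multilingual-cased' the extracted [CLS] vectors have 768 dimensions,
# so all_batch_embeddings maps each language to an array of shape (n_documents, 768).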
def main():
print('Running main ...')
print(f'Model path: {opt.modelpath}\nDataset path: {opt.dataset}')
DATAPATH = opt.dataset
MAX_LEN = 512
l_devel_raw, l_devel_target, l_test_raw, l_test_target = load_datasets(DATAPATH)
l_tokenized_tr = do_tokenization(l_devel_raw, max_len=MAX_LEN)
l_tokenized_te = do_tokenization(l_test_raw, max_len=MAX_LEN)
tr_dataset = TrainingDataset(l_tokenized_tr, l_devel_target)
tr_lang_ids = tr_dataset.lang_ids
te_dataset = TrainingDataset(l_tokenized_te, l_test_target)
te_lang_ids = te_dataset.lang_ids
tr_dataloader = DataLoader(tr_dataset, batch_size=64, shuffle=False) # Shuffle False to extract doc embeddings
te_dataloader = DataLoader(te_dataset, batch_size=64, shuffle=False) # Shuffle False to extract doc
tr_all_batch_embeddings, id2lang_tr = feature_extractor(tr_dataloader, tr_lang_ids, opt.modelpath) # Extracting doc embed for devel
with open(f'{opt.modelpath}/TR_embed_{get_dataset_name(opt.dataset)}.pkl', 'wb') as outfile:
pickle.dump((tr_all_batch_embeddings, id2lang_tr), outfile)
te_all_batch_embeddings, id2lang_te = feature_extractor(te_dataloader, te_lang_ids, opt.modelpath) # Extracting doc embed for test
with open(f'{opt.modelpath}/TE_embed_{get_dataset_name(opt.dataset)}.pkl', 'wb') as outfile:
pickle.dump((te_all_batch_embeddings, id2lang_te), outfile)
exit('Extraction completed!')
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='mBert model document embedding extractor')
parser.add_argument('--dataset', type=str,
default='/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle',
metavar='datasetpath', help=f'path to the pickled dataset')
parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)')
parser.add_argument('--modelpath', type=str, default='/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run0',
metavar='modelpath', help=f'path to pre-trained mBert model')
opt = parser.parse_args()
main()

View File

@ -1,49 +0,0 @@
import os
from dataset_builder import MultilingualDataset
from optparse import OptionParser
from util.file import exists
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
parser = OptionParser(usage="usage: %prog datapath [options]")
(op, args) = parser.parse_args()
assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)'
dataset = args[0]
assert exists(dataset), 'Unable to find file '+str(dataset)
dataset_file = os.path.basename(dataset)
data = MultilingualDataset.load(dataset)
data.set_view(languages=['it'])
data.show_dimensions()
lXtr, lytr = data.training()
lXte, lyte = data.test()
vect_lXtr = dict()
vectorizer = CountVectorizer()
vect_lXtr['it'] = vectorizer.fit_transform(lXtr['it'])
# print(type(vect_lXtr['it']))
corr = vect_lXtr['it'].T.dot(lytr['it'])
# print(corr.shape)
sum_correlated_class = corr.sum(axis=0)
print(len(sum_correlated_class))
print(sum_correlated_class.max())
w2idx = vectorizer.vocabulary_
idx2w = {v:k for k,v in w2idx.items()}
word_tot_corr = corr.sum(axis=1)
print(word_tot_corr.shape)
dict_word_tot_corr = {v:k for k,v in enumerate(word_tot_corr)}
sorted_word_tot_corr = np.sort(word_tot_corr)
sorted_word_tot_corr = sorted_word_tot_corr[len(sorted_word_tot_corr)-200:]
top_idx = [dict_word_tot_corr[k] for k in sorted_word_tot_corr]
print([idx2w[idx] for idx in top_idx])
print([elem for elem in top_idx])
print(corr[8709])
print('Finished...')

View File

@ -1,34 +0,0 @@
#!/usr/bin/env bash
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
logfile=./results/final_combinations_jrc.csv
#A.2: ensembling feature sets (combinations of posteriors, wce, muse):
# - exploring different ways of putting different feature sets together: concatenation, FeatureSetToPosteriors, averaging, voting, etc...
# (none of these seems to improve over standard funnelling [the improved version after A.1] with posterior probabilities...)
# aggregation=concatenation
#python main_gFun.py $dataset -o $logfile -P -U -r -z --l2
#python main_gFun.py $dataset -o $logfile -P -S -r -z --l2
#python main_gFun.py $dataset -o $logfile -U -S -r -z --l2
#python main_gFun.py $dataset -o $logfile -P -U -S -r -z --l2
#
##FeatureSetToPosteriors (aggregation mean)
python main_multimodal_cls.py $dataset -o $logfile -P -U -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -P -S -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -U -S -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -a -z --l2 --allprob
##FeatureSetToPosteriors
#python main_gFun.py $dataset -o $logfile -P -U -r -z --l2 --allprob
#python main_gFun.py $dataset -o $logfile -P -S -r -z --l2 --allprob
#python main_gFun.py $dataset -o $logfile -U -S -r -z --l2 --allprob
#python main_gFun.py $dataset -o $logfile -P -U -S -r -z --l2 --allprob
#MajorityVoting
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -r
#python main_majorityvoting_cls.py $dataset -o $logfile -P -S -r
#python main_majorityvoting_cls.py $dataset -o $logfile -U -S -r
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -S -r

View File

@ -1,31 +0,0 @@
#!/usr/bin/env bash
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
logfile=./results/final_combinations_rcv.csv
#A.2: ensembling feature sets (combinations of posteriors, wce, muse):
# - exploring different ways of putting different feature sets together: concatenation, FeatureSetToPosteriors, averaging, voting, etc...
# (none of these seems to improve over standard funnelling [the improved version after A.1] with posterior probabilities...)
# aggregation=concatenation
#python main_gFun.py $dataset -o $logfile -P -U -r -z --l2
#python main_gFun.py $dataset -o $logfile -P -S -r -z --l2
#python main_gFun.py $dataset -o $logfile -U -S -r -z --l2
#python main_gFun.py $dataset -o $logfile -P -U -S -r -z --l2
#
##FeatureSetToPosteriors (aggregation mean)
python main_multimodal_cls.py $dataset -o $logfile -P -U -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -P -S -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -U -S -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -a -z --l2 --allprob
##FeatureSetToPosteriors
#python main_gFun.py $dataset -o $logfile -P -U -r -z --l2 --allprob
#python main_gFun.py $dataset -o $logfile -P -S -r -z --l2 --allprob
#python main_gFun.py $dataset -o $logfile -U -S -r -z --l2 --allprob
#python main_gFun.py $dataset -o $logfile -P -U -S -r -z --l2 --allprob
#MajorityVoting
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -r
#python main_majorityvoting_cls.py $dataset -o $logfile -P -S -r
#python main_majorityvoting_cls.py $dataset -o $logfile -U -S -r
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -S -r

View File

@ -1,31 +0,0 @@
#!/usr/bin/env bash
logfile=../log/log_pre_jrc.csv
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
python main_deep_learning.py $dataset --log-file $logfile --pretrained --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 256 --tunable --plotmode --test-each 20

View File

@ -1,30 +0,0 @@
#!/usr/bin/env bash
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
python main_deep_learning.py $dataset --pretrained --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 256 --tunable --plotmode --test-each 20

View File

@ -1,16 +0,0 @@
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle
seeds='5' #2 3 4 5 6 7 8 9 10'
for seed in $seeds
do
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce.csv --supervised --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_trainable.csv --supervised --tunable --seed $seed
python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_post_wce_muse_static.csv --posteriors --supervised --pretrained --seed $seed --force
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_muse.csv --pretrained --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_muse_trainable.csv --pretrained --tunable --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_muse.csv --supervised --pretrained --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_muse_trainable40000.csv --supervised --pretrained --tunable --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_post_wce_muse_trainable.csv --posteriors --supervised --pretrained --tunable --seed $seed --force
done

View File

@ -1,20 +0,0 @@
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
seeds='1 ' #2 3 4 5' # 6 7 8 9 10'
for seed in $seeds
do
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce.csv --supervised --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_trainable.csv --supervised --tunable --seed $seed
python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_static_plotmode.csv --posteriors --supervised --pretrained --seed $seed --plotmode --test-each 200
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_muse.csv --pretrained --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_muse_trainable.csv --pretrained --tunable --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_muse.csv --supervised --pretrained --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_muse_trainable.csv --supervised --pretrained --tunable --seed $seed
# python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_static.csv --posteriors --supervised --pretrained --seed $seed
# python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_trainable_plotmode.csv --posteriors --supervised --pretrained --tunable --seed $seed --plotmode --test-each 200
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_trainable.csv --posteriors --supervised --pretrained --tunable --seed $seed
done

View File

@ -1,16 +0,0 @@
#!/usr/bin/env bash
#dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
#logfile=../log/log_FunBert_jrc.csv
#
#runs='0 1 2 3 4'
#for run in $runs
#do
# dataset=$dataset_path$run.pickle
# python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile #--tunable
#done
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle
logfile=../log/log_FunBert_fulljrc_static.csv
python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile

View File

@ -1,16 +0,0 @@
#!/usr/bin/env bash
#dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
#logfile=../log/log_FunBert_rcv_static.csv
#
#runs='0 1 2 3 4'
#for run in $runs
#do
# dataset=$dataset_path$run.pickle
# python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile
#done
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
logfile=../log/log_FunBert_fullrcv_static.csv
python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile

View File

@ -1,15 +0,0 @@
#!/usr/bin/env bash
#dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
#logfile=../log/log_mBert_jrc_NEW.csv
#
#runs='0 1 2 3 4'
#for run in $runs
#do
# dataset=$dataset_path$run.pickle
# python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50
#done
logfile=../log/log_mBert_fulljrc.csv
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle
python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50

View File

@ -1,15 +0,0 @@
#!/usr/bin/env bash
#dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
#logfile=../log/log_mBert_rcv_NEW.csv
#
#runs='0 1 2 3 4'
#for run in $runs
#do
# dataset=$dataset_path$run.pickle
# python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50
#done
logfile=../log/log_mBert_fullrcv.csv
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=30 --patience 3

View File

@ -1,45 +0,0 @@
#!/usr/bin/env bash
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
######################################## POSTERIORS
# Posteriors
python main_multimodal_cls.py $dataset -P # + zscore
python main_multimodal_cls.py $dataset -P -z # +l2norm
python main_multimodal_cls.py $dataset -P -z --l2 # +feature weight
######################################### WCE
#WCE supervised
python main_multimodal_cls.py $dataset -S # + zscore
python main_multimodal_cls.py $dataset -S -z # +l2norm
python main_multimodal_cls.py $dataset -S -z --l2 # +feature weight
python main_multimodal_cls.py $dataset -S -z -r --l2 # + SIF - PCA
python main_multimodal_cls.py $dataset -S -z -p 250 --l2 # +feature weight + pca
python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 # + SIF
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight ig # -feature weight
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -S -z -p 250 --l2 --feat-weight ig # + pca
python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -p 250 --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 --feat-weight pmi
################################# MUSE
# MUSE unsupervised
python main_multimodal_cls.py $dataset -U # + zscore
python main_multimodal_cls.py $dataset -U -z # +l2norm
python main_multimodal_cls.py $dataset -U -z --l2 # +feature weight
python main_multimodal_cls.py $dataset -U -z -r --l2 # + SIF - PCA
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight ig # -feature weight + pca
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight pmi

View File

@ -1,45 +0,0 @@
#!/usr/bin/env bash
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
######################################## POSTERIORS
# Posteriors
python main_multimodal_cls.py $dataset -P # + zscore
python main_multimodal_cls.py $dataset -P -z # +l2norm
python main_multimodal_cls.py $dataset -P -z --l2 # +feature weight
######################################### WCE
#WCE supervised
python main_multimodal_cls.py $dataset -S # + zscore
python main_multimodal_cls.py $dataset -S -z # +l2norm
python main_multimodal_cls.py $dataset -S -z --l2 # +feature weight
python main_multimodal_cls.py $dataset -S -z -r --l2 # + SIF - PCA
python main_multimodal_cls.py $dataset -S -z -p 50 --l2 # +feature weight + pca
python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 # + SIF
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight ig # -feature weight
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -S -z -p 50 --l2 --feat-weight ig # + pca
python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -p 50 --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 --feat-weight pmi
################################# MUSE
# MUSE unsupervised
python main_multimodal_cls.py $dataset -U # + zscore
python main_multimodal_cls.py $dataset -U -z # +l2norm
python main_multimodal_cls.py $dataset -U -z --l2 # +feature weight
python main_multimodal_cls.py $dataset -U -z -r --l2 # + SIF - PCA
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight ig # -feature weight + pca
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight pmi

View File

@ -1,6 +0,0 @@
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
seeds='1 2 3 4 5 6 7 8 9 10'
for seed in $seeds
do
python main_deep_learning.py $dataset --log-file ../log/time_GRU.csv --supervised --nepochs 50 --seed $seed
done

124
src/funnelling.py Normal file
View File

@ -0,0 +1,124 @@
from src.models.learners import *
from src.util.common import _normalize
from src.view_generators import VanillaFunGen
class DocEmbedderList:
"""
Class that takes care of calling the fit and transform functions of every initialized embedder. Every ViewGenerator
should be wrapped by this class in order to seamlessly train the overall architecture.
"""
def __init__(self, embedder_list, probabilistic=True):
"""
Init the DocEmbedderList.
:param embedder_list: list of embedders to be deployed
:param probabilistic: whether or not to recast the view generators' outputs to vectors of posterior probabilities
"""
assert len(embedder_list) != 0, 'Embedder list cannot be empty!'
self.embedders = embedder_list
self.probabilistic = probabilistic
if probabilistic:
_tmp = []
for embedder in self.embedders:
if isinstance(embedder, VanillaFunGen):
_tmp.append(embedder)
else:
_tmp.append(FeatureSet2Posteriors(embedder))
self.embedders = _tmp
def fit(self, lX, ly):
"""
Fit all the ViewGenerators contained by DocEmbedderList.
:param lX:
:param ly:
:return: self
"""
for embedder in self.embedders:
embedder.fit(lX, ly)
return self
def transform(self, lX):
"""
Project documents by means of every ViewGenerator. The projections are then averaged together and returned.
:param lX:
:return: common latent space (averaged).
"""
langs = sorted(lX.keys())
lZparts = {lang: None for lang in langs}
for embedder in self.embedders:
lZ = embedder.transform(lX)
for lang in langs:
Z = lZ[lang]
if lZparts[lang] is None:
lZparts[lang] = Z
else:
lZparts[lang] += Z
n_embedders = len(self.embedders)
return {lang: lZparts[lang]/n_embedders for lang in langs} # Averaging feature spaces
def fit_transform(self, lX, ly):
return self.fit(lX, ly).transform(lX)
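# Toy illustration of the averaging performed by transform() (not part of the class): with two
# embedders producing, for language 'en', Z1 = [[0.2, 0.8]] and Z2 = [[0.4, 0.6]], the returned
# common space is {'en': [[0.3, 0.7]]}.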
class FeatureSet2Posteriors:
"""
Takes care of recasting the features output by the embedders into vectors of posterior probabilities by means of
a multiclass SVM.
"""
def __init__(self, embedder, l2=True, n_jobs=-1):
"""
Init the class.
:param embedder: ViewGen, view generator which does not natively output posterior probabilities.
:param l2: bool, whether or not to apply L2 normalization to the projection
:param n_jobs: int, number of concurrent workers.
"""
self.embedder = embedder
self.l2 = l2
self.n_jobs = n_jobs
self.prob_classifier = MetaClassifier(
SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
def fit(self, lX, ly):
lZ = self.embedder.fit_transform(lX, ly)
self.prob_classifier.fit(lZ, ly)
return self
def transform(self, lX):
lP = self.predict_proba(lX)
lP = _normalize(lP, self.l2)
return lP
def fit_transform(self, lX, ly):
return self.fit(lX, ly).transform(lX)
def predict(self, lX):
lZ = self.embedder.transform(lX)
return self.prob_classifier.predict(lZ)
def predict_proba(self, lX):
lZ = self.embedder.transform(lX)
return self.prob_classifier.predict_proba(lZ)
class Funnelling:
"""
Funnelling Architecture. It is composed of two tiers. The first tier is a set of heterogeneous document embedders.
The second tier (i.e., the meta-classifier) performs the classification of the common latent space computed by
the first-tier learners.
"""
def __init__(self, first_tier: DocEmbedderList, meta_classifier: MetaClassifier, n_jobs=-1):
self.first_tier = first_tier
self.meta = meta_classifier
self.n_jobs = n_jobs
def fit(self, lX, ly):
print('## Fitting first-tier learners!')
lZ = self.first_tier.fit_transform(lX, ly)
print('## Fitting meta-learner!')
self.meta.fit(lZ, ly)
def predict(self, lX):
lZ = self.first_tier.transform(lX)
ly = self.meta.predict(lZ)
return ly

View File

@ -1,849 +0,0 @@
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader
from data.tsr_function__ import get_tsr_matrix, get_supervised_matrix, pointwise_mutual_information, information_gain
from embeddings.embeddings import FastTextMUSE
from embeddings.supervised import supervised_embeddings_tfidf, zscores
from learning.learners import NaivePolylingualClassifier, MonolingualClassifier, _joblib_transform_multiling
from sklearn.decomposition import PCA
from scipy.sparse import hstack
from util_transformers.StandardizeTransformer import StandardizeTransformer
from util.SIF_embed import remove_pc
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix
from models.mBert import *
from models.lstm_class import *
from util.csv_log import CSVLog
from util.file import get_file_name
from util.early_stop import EarlyStopping
from util.common import *
import time
# ------------------------------------------------------------------
# Data Processing
# ------------------------------------------------------------------
class FeatureWeight:
def __init__(self, weight='tfidf', agg='mean'):
assert weight in ['tfidf', 'pmi', 'ig'] or callable(
weight), 'weight should be one of "tfidf", "pmi", "ig", or a callable function'
assert agg in ['mean', 'max'], 'aggregation function should either be "mean" or "max"'
self.weight = weight
self.agg = agg
self.fitted = False
if weight == 'pmi':
self.weight = pointwise_mutual_information
elif weight == 'ig':
self.weight = information_gain
def fit(self, lX, ly):
if not self.fitted:
if self.weight == 'tfidf':
self.lF = {l: np.ones(X.shape[1]) for l, X in lX.items()}
else:
self.lF = {}
for l in lX.keys():
X, y = lX[l], ly[l]
print(f'getting supervised cell-matrix lang {l}')
tsr_matrix = get_tsr_matrix(get_supervised_matrix(X, y), tsr_score_funtion=self.weight)
if self.agg == 'max':
F = tsr_matrix.max(axis=0)
elif self.agg == 'mean':
F = tsr_matrix.mean(axis=0)
self.lF[l] = F
self.fitted = True
return self
def transform(self, lX):
return {lang: csr_matrix.multiply(lX[lang], self.lF[lang]) for lang in lX.keys()}
def fit_transform(self, lX, ly):
return self.fit(lX, ly).transform(lX)
# ------------------------------------------------------------------
# View Generators (aka first-tier learners)
# ------------------------------------------------------------------
class PosteriorProbabilitiesEmbedder:
def __init__(self, first_tier_learner, first_tier_parameters=None, l2=True, n_jobs=-1):
self.first_tier_learner = first_tier_learner
self.first_tier_parameters = first_tier_parameters
self.l2 = l2
self.n_jobs = n_jobs
self.doc_projector = NaivePolylingualClassifier(
self.first_tier_learner, self.first_tier_parameters, n_jobs=n_jobs
)
self.requires_tfidf = True
def fit(self, lX, lY, lV=None, called_by_viewgen=False):
if not called_by_viewgen:
# Avoid printing if method is called by another View Gen (e.g., GRU ViewGen)
print('### Posterior Probabilities View Generator (X)')
print('fitting the projectors... {}'.format(lX.keys()))
self.doc_projector.fit(lX, lY)
return self
def transform(self, lX):
lZ = self.predict_proba(lX)
lZ = _normalize(lZ, self.l2)
return lZ
def fit_transform(self, lX, ly=None, lV=None):
return self.fit(lX, ly).transform(lX)
def best_params(self):
return self.doc_projector.best_params()
def predict(self, lX, ly=None):
return self.doc_projector.predict(lX)
def predict_proba(self, lX, ly=None):
print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} documents')
return self.doc_projector.predict_proba(lX)
def _get_output_dim(self):
return len(self.doc_projector.model['da'].model.classes_)
class MuseEmbedder:
def __init__(self, path, lV=None, l2=True, n_jobs=-1, featureweight=FeatureWeight(), sif=False):
self.path = path
self.lV = lV
self.l2 = l2
self.n_jobs = n_jobs
self.featureweight = featureweight
self.sif = sif
self.requires_tfidf = True
def fit(self, lX, ly, lV=None):
assert lV is not None or self.lV is not None, 'lV not specified'
print('### MUSE View Generator (M)')
print(f'Loading fastText pretrained vectors for languages {list(lX.keys())}...')
self.langs = sorted(lX.keys())
self.MUSE = load_muse_embeddings(self.path, self.langs, self.n_jobs)
lWordList = {l: self._get_wordlist_from_word2index(lV[l]) for l in self.langs}
self.MUSE = {l: Muse.extract(lWordList[l]).numpy() for l, Muse in self.MUSE.items()}
self.featureweight.fit(lX, ly)
return self
def transform(self, lX):
MUSE = self.MUSE
lX = self.featureweight.transform(lX)
XdotMUSE = Parallel(n_jobs=self.n_jobs)(
delayed(XdotM)(lX[lang], MUSE[lang], self.sif) for lang in self.langs
)
lMuse = {l: XdotMUSE[i] for i, l in enumerate(self.langs)}
lMuse = _normalize(lMuse, self.l2)
return lMuse
def fit_transform(self, lX, ly, lV):
return self.fit(lX, ly, lV).transform(lX)
def _get_wordlist_from_word2index(self, word2index):
return list(zip(*sorted(word2index.items(), key=lambda x: x[1])))[0]
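# e.g., {'casa': 1, 'albero': 0} -> ('albero', 'casa'): words sorted by their vocabulary index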
def _get_output_dim(self):
return self.MUSE['da'].shape[1]
class WordClassEmbedder:
def __init__(self, l2=True, n_jobs=-1, max_label_space=300, featureweight=FeatureWeight(), sif=False):
self.n_jobs = n_jobs
self.l2 = l2
self.max_label_space = max_label_space
self.featureweight = featureweight
self.sif = sif
self.requires_tfidf = True
def fit(self, lX, ly, lV=None):
print('### WCE View Generator (W)')
print('Computing supervised embeddings...')
self.langs = sorted(lX.keys())
WCE = Parallel(n_jobs=self.n_jobs)(
delayed(word_class_embedding_matrix)(lX[lang], ly[lang], self.max_label_space) for lang in self.langs
)
self.lWCE = {l: WCE[i] for i, l in enumerate(self.langs)}
self.featureweight.fit(lX, ly)
return self
def transform(self, lX):
lWCE = self.lWCE
lX = self.featureweight.transform(lX)
XdotWCE = Parallel(n_jobs=self.n_jobs)(
delayed(XdotM)(lX[lang], lWCE[lang], self.sif) for lang in self.langs
)
lwce = {l: XdotWCE[i] for i, l in enumerate(self.langs)}
lwce = _normalize(lwce, self.l2)
return lwce
def fit_transform(self, lX, ly, lV=None):
return self.fit(lX, ly).transform(lX)
def _get_output_dim(self):
return 73 # TODO !
class MBertEmbedder:
def __init__(self, doc_embed_path=None, patience=10, checkpoint_dir='../hug_checkpoint/', path_to_model=None,
nC=None):
self.doc_embed_path = doc_embed_path
self.patience = patience
self.checkpoint_dir = checkpoint_dir
self.fitted = False
self.requires_tfidf = False
if path_to_model is None and nC is not None:
self.model = None
else:
config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True,
num_labels=nC)
self.model = BertForSequenceClassification.from_pretrained(path_to_model, config=config).cuda()
self.fitted = True
def fit(self, lX, ly, lV=None, seed=0, nepochs=200, lr=1e-5, val_epochs=1):
print('### mBERT View Generator (B)')
if self.fitted is True:
print('Bert model already fitted!')
return self
print('Fine-tune mBert on the given dataset.')
l_tokenized_tr = do_tokenization(lX, max_len=512)
l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = get_tr_val_split(l_tokenized_tr, ly,
val_prop=0.2, max_val=2000,
seed=seed) # TODO: seed
tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target)
va_dataset = TrainingDataset(l_split_va, l_split_val_target)
tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True)
va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=True)
nC = tr_dataset.get_nclasses()
model = get_model(nC)
model = model.cuda()
criterion = torch.nn.BCEWithLogitsLoss().cuda()
optim = init_optimizer(model, lr=lr, weight_decay=0.01)
lr_scheduler = StepLR(optim, step_size=25, gamma=0.1)
early_stop = EarlyStopping(model, optimizer=optim, patience=self.patience,
checkpoint=self.checkpoint_dir,
is_bert=True)
# Training loop
logfile = '../log/log_mBert_extractor.csv'
method_name = 'mBert_feature_extractor'
tinit = time()
lang_ids = va_dataset.lang_ids
for epoch in range(1, nepochs + 1):
print('# Start Training ...')
train(model, tr_dataloader, epoch, criterion, optim, method_name, tinit, logfile)
lr_scheduler.step() # reduces the learning rate # TODO arg epoch?
# Validation
macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va')
early_stop(macrof1, epoch)
if early_stop.STOP:
print('[early-stop] STOP')
break
model = early_stop.restore_checkpoint()
self.model = model.cuda()
if val_epochs > 0:
print(f'running last {val_epochs} training epochs on the validation set')
for val_epoch in range(1, val_epochs + 1):
train(self.model, va_dataloader, epoch + val_epoch, criterion, optim, method_name, tinit, logfile)
self.fitted = True
return self
def transform(self, lX):
assert self.fitted is True, 'Calling transform without any initialized model! Call fit first or, at init, ' \
'pass the "path_to_model" arg.'
print('Obtaining document embeddings from pretrained mBert ')
l_tokenized_X = do_tokenization(lX, max_len=512, verbose=True)
feat_dataset = ExtractorDataset(l_tokenized_X)
feat_lang_ids = feat_dataset.lang_ids
dataloader = DataLoader(feat_dataset, batch_size=64)
all_batch_embeddings, id2lang = feature_extractor(dataloader, feat_lang_ids, self.model)
return all_batch_embeddings
def fit_transform(self, lX, ly, lV=None):
return self.fit(lX, ly).transform(lX)
class RecurrentEmbedder:
def __init__(self, pretrained, supervised, multilingual_dataset, options, concat=False, lr=1e-3,
we_path='../embeddings', hidden_size=512, sup_drop=0.5, posteriors=False, patience=10,
test_each=0, checkpoint_dir='../checkpoint', model_path=None):
self.pretrained = pretrained
self.supervised = supervised
self.concat = concat
self.requires_tfidf = False
self.multilingual_dataset = multilingual_dataset
self.model = None
self.we_path = we_path
self.langs = multilingual_dataset.langs()
self.hidden_size = hidden_size
self.sup_drop = sup_drop
self.posteriors = posteriors
self.patience = patience
self.checkpoint_dir = checkpoint_dir
self.test_each = test_each
self.options = options
self.seed = options.seed
self.is_trained = False
## INIT MODEL for training
self.lXtr, self.lytr = self.multilingual_dataset.training(target_as_csr=True)
self.lXte, self.lyte = self.multilingual_dataset.test(target_as_csr=True)
self.nC = self.lyte[self.langs[0]].shape[1]
lpretrained, lpretrained_vocabulary = self._load_pretrained_embeddings(self.we_path, self.langs)
self.multilingual_index = MultilingualIndex()
self.multilingual_index.index(self.lXtr, self.lytr, self.lXte, lpretrained_vocabulary)
self.multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=self.seed)
self.multilingual_index.embedding_matrices(lpretrained, self.supervised)
if model_path is not None:
self.is_trained = True
self.model = torch.load(model_path)
else:
self.model = self._init_Net()
self.optim = init_optimizer(self.model, lr=lr)
self.criterion = torch.nn.BCEWithLogitsLoss().cuda()
self.lr_scheduler = StepLR(self.optim, step_size=25, gamma=0.5)
self.early_stop = EarlyStopping(self.model, optimizer=self.optim, patience=self.patience,
checkpoint=f'{self.checkpoint_dir}/gru_viewgen_-{get_file_name(self.options.dataset)}')
# Init SVM in order to recast (vstacked) document embeddings to vectors of Posterior Probabilities
self.posteriorEmbedder = MetaClassifier(
SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=options.n_jobs)
def fit(self, lX, ly, lV=None, batch_size=64, nepochs=200, val_epochs=1):
print('### Gated Recurrent Unit View Generator (G)')
# could be better to init model here at first .fit() call!
if self.model is None:
print('TODO: Init model!')
if not self.is_trained:
# Batchify input
self.multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=self.seed)
l_train_index, l_train_target = self.multilingual_index.l_train()
l_val_index, l_val_target = self.multilingual_index.l_val()
l_test_index = self.multilingual_index.l_test_index()
batcher_train = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs,
lpad=self.multilingual_index.l_pad())
batcher_eval = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs,
lpad=self.multilingual_index.l_pad())
# Train loop
print('Start training')
method_name = 'gru_view_generator'
logfile = init_logfile_nn(method_name, self.options)
tinit = time.time()
for epoch in range(1, nepochs + 1):
train_gru(model=self.model, batcher=batcher_train, ltrain_index=l_train_index, lytr=l_train_target,
tinit=tinit, logfile=logfile, criterion=self.criterion, optim=self.optim,
epoch=epoch, method_name=method_name, opt=self.options, ltrain_posteriors=None,
ltrain_bert=None)
self.lr_scheduler.step() # reduces the learning rate # TODO arg epoch?
# validation step
macrof1 = test_gru(self.model, batcher_eval, l_val_index, None, None, l_val_target, tinit, epoch,
logfile, self.criterion, 'va')
self.early_stop(macrof1, epoch)
if self.test_each > 0:
test_gru(self.model, batcher_eval, l_test_index, None, None, self.lyte, tinit, epoch,
logfile, self.criterion, 'te')
if self.early_stop.STOP:
print('[early-stop] STOP')
print('Restoring best model...')
break
self.model = self.early_stop.restore_checkpoint()
print(f'running last {val_epochs} training epochs on the validation set')
for val_epoch in range(1, val_epochs+1):
batcher_train.init_offset()
train_gru(model=self.model, batcher=batcher_train, ltrain_index=l_train_index, lytr=l_train_target,
tinit=tinit, logfile=logfile, criterion=self.criterion, optim=self.optim,
epoch=epoch, method_name=method_name, opt=self.options, ltrain_posteriors=None,
ltrain_bert=None)
self.is_trained = True
# Generate document embeddings in order to fit an SVM that recasts them as vectors of posterior probabilities
lX = self._get_doc_embeddings(lX)
# Fit a 'multilingual' SVM on the generated doc embeddings
self.posteriorEmbedder.fit(lX, ly)
return self
def transform(self, lX, batch_size=64):
lX = self._get_doc_embeddings(lX)
return self.posteriorEmbedder.predict_proba(lX)
def fit_transform(self, lX, ly, lV=None):
# TODO
return 0
def _get_doc_embeddings(self, lX, batch_size=64):
assert self.is_trained, 'Model is not trained, cannot call transform before fitting the model!'
print('Generating document embeddings via GRU')
data = {}
for lang in lX.keys():
indexed = index(data=lX[lang],
vocab=self.multilingual_index.l_index[lang].word2index,
known_words=set(self.multilingual_index.l_index[lang].word2index.keys()),
analyzer=self.multilingual_index.l_vectorizer.get_analyzer(lang),
unk_index=self.multilingual_index.l_index[lang].unk_index,
out_of_vocabulary=self.multilingual_index.l_index[lang].out_of_vocabulary)
data[lang] = indexed
lX = {}
ly = {}
batcher_transform = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs,
lpad=self.multilingual_index.l_pad())
# l_devel_index = self.multilingual_index.l_devel_index()
l_devel_target = self.multilingual_index.l_devel_target()
l_devel_target = {k: v[:len(data[k])] for k, v in l_devel_target.items()} # todo -> debug
for batch, _, target, lang, in batchify(l_index=data,
l_post=None,
llabels=l_devel_target,
batchsize=batch_size,
lpad=self.multilingual_index.l_pad()):
# for idx, (batch, post, bert_emb, target, lang) in enumerate(
# batcher_transform.batchify(l_devel_index, None, None, l_devel_target)):
# for idx, (batch, post, bert_emb, target, lang) in enumerate(
# batcher_transform.batchify(data, None, None, l_devel_target)):
if lang not in lX.keys():
lX[lang] = self.model.get_embeddings(batch, lang)
ly[lang] = target.cpu().detach().numpy()
else:
lX[lang] = np.concatenate((lX[lang], self.model.get_embeddings(batch, lang)), axis=0)
ly[lang] = np.concatenate((ly[lang], target.cpu().detach().numpy()), axis=0)
return lX
    # loads the MUSE embeddings for the requested languages, together with their vocabularies
    def _load_pretrained_embeddings(self, we_path, langs):
        lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1)
        lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs}
        return lpretrained, lpretrained_vocabulary
def _none_dict(self, langs):
return {l:None for l in langs}
# instantiates the net, initializes the model parameters, and sets embeddings trainable if requested
def _init_Net(self, xavier_uniform=True):
model = RNNMultilingualClassifier(
output_size=self.nC,
hidden_size=self.hidden_size,
lvocab_size=self.multilingual_index.l_vocabsize(),
learnable_length=0,
lpretrained=self.multilingual_index.l_embeddings(),
drop_embedding_range=self.multilingual_index.sup_range,
drop_embedding_prop=self.sup_drop,
post_probabilities=self.posteriors
)
return model.cuda()
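
# --- Illustrative sketch (not part of the original code) ---------------------
# After training, the GRU view generator recasts its document embeddings into
# vectors of posterior probabilities by fitting a calibrated SVM per class
# (cf. fit() above). The actual posteriorEmbedder implementation is defined
# elsewhere; the following is only a minimal, stand-alone toy analogue:
def _example_embeddings_to_posteriors():
    import numpy as np
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.svm import SVC

    X_doc = np.random.randn(200, 512)                    # stand-in for GRU document embeddings
    Y = (np.random.rand(200, 10) > 0.7).astype(int)      # toy multilabel targets
    clf = OneVsRestClassifier(SVC(kernel='linear', probability=True, random_state=1))
    clf.fit(X_doc, Y)
    P = clf.predict_proba(X_doc)                         # posterior probabilities, shape (200, 10)
    return P
# ------------------------------------------------------------------------------
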
class DocEmbedderList:
    def __init__(self, *embedder_list, aggregation='concat'):
        assert aggregation in {'concat', 'mean'}, 'unknown aggregation mode, valid modes are "concat" and "mean"'
        # store the embedders as a list, so that further view generators can be appended later on
        # (see the illustrative sketch of the two aggregation modes after this class)
        self.embedders = list(embedder_list)
        self.aggregation = aggregation
        print(f'Aggregation mode: {self.aggregation}')
def fit(self, lX, ly, lV=None, tfidf=None):
for transformer in self.embedders:
_lX = lX
if transformer.requires_tfidf:
_lX = tfidf
transformer.fit(_lX, ly, lV)
return self
def transform(self, lX, tfidf=None):
if self.aggregation == 'concat':
return self.transform_concat(lX, tfidf)
elif self.aggregation == 'mean':
return self.transform_mean(lX, tfidf)
def transform_concat(self, lX, tfidf):
if len(self.embedders) == 1:
if self.embedders[0].requires_tfidf:
lX = tfidf
return self.embedders[0].transform(lX)
some_sparse = False
langs = sorted(lX.keys())
lZparts = {l: [] for l in langs}
for transformer in self.embedders:
_lX = lX
if transformer.requires_tfidf:
_lX = tfidf
lZ = transformer.transform(_lX)
for l in langs:
Z = lZ[l]
some_sparse = some_sparse or issparse(Z)
lZparts[l].append(Z)
hstacker = hstack if some_sparse else np.hstack
return {l: hstacker(lZparts[l]) for l in langs}
    def transform_mean(self, lX, tfidf):
        if len(self.embedders) == 1:
            if self.embedders[0].requires_tfidf:
                lX = tfidf
            return self.embedders[0].transform(lX)
        langs = sorted(lX.keys())
        lZparts = {l: None for l in langs}
        # first compute every view-specific representation, so that the common (smallest)
        # dimensionality to which all views are reduced can be determined
        lZ_all = []
        for transformer in self.embedders:
            _lX = lX
            if transformer.requires_tfidf:
                _lX = tfidf
            lZ_all.append(transformer.transform(_lX))
        min_dim = min(lZ[lang].shape[1] for lZ in lZ_all for lang in langs)
        for lZ in lZ_all:
            for l in langs:
                Z = lZ[l]
                if Z.shape[1] > min_dim:
                    print(f'Space Z matrix has more dimensions ({Z.shape[1]}) than the smallest '
                          f'representation ({min_dim}). Applying PCA(n_components={min_dim})')
                    Z = PCA(n_components=min_dim).fit_transform(Z)
                if lZparts[l] is None:
                    lZparts[l] = Z
                else:
                    lZparts[l] += Z
        n_transformers = len(self.embedders)
        return {l: lZparts[l] / n_transformers for l in langs}
def fit_transform(self, lX, ly, lV=None, tfidf=None):
return self.fit(lX, ly, lV, tfidf).transform(lX, tfidf)
def best_params(self):
return {'todo'}
def append(self, embedder):
self.embedders.append(embedder)
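
# --- Illustrative sketch (not part of the original code) ---------------------
# The two aggregation modes of DocEmbedderList: with 'concat' the view-specific
# representations are horizontally stacked, while with 'mean' they are first
# reduced to a common (smallest) dimensionality and then averaged. Toy example:
def _example_aggregation_modes():
    import numpy as np
    from sklearn.decomposition import PCA

    Z_viewA = np.random.randn(100, 73)    # e.g., posterior probabilities (one column per class)
    Z_viewB = np.random.randn(100, 300)   # e.g., MUSE-based document embeddings

    Z_concat = np.hstack([Z_viewA, Z_viewB])             # 'concat': shape (100, 373)

    min_dim = min(Z_viewA.shape[1], Z_viewB.shape[1])    # 73
    Z_viewB_red = PCA(n_components=min_dim).fit_transform(Z_viewB)
    Z_mean = (Z_viewA + Z_viewB_red) / 2                 # 'mean': shape (100, 73)
    return Z_concat, Z_mean
# ------------------------------------------------------------------------------
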
class FeatureSet2Posteriors:
def __init__(self, transformer, requires_tfidf=False, l2=True, n_jobs=-1):
self.transformer = transformer
self.l2 = l2
self.n_jobs = n_jobs
self.prob_classifier = MetaClassifier(
SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
self.requires_tfidf = requires_tfidf
def fit(self, lX, ly, lV=None):
if lV is None and hasattr(self.transformer, 'lV'):
lV = self.transformer.lV
lZ = self.transformer.fit_transform(lX, ly, lV)
self.prob_classifier.fit(lZ, ly)
return self
def transform(self, lX):
lP = self.predict_proba(lX)
lP = _normalize(lP, self.l2)
return lP
def fit_transform(self, lX, ly, lV):
return self.fit(lX, ly, lV).transform(lX)
def predict(self, lX, ly=None):
lZ = self.transformer.transform(lX)
return self.prob_classifier.predict(lZ)
def predict_proba(self, lX, ly=None):
lZ = self.transformer.transform(lX)
return self.prob_classifier.predict_proba(lZ)
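
# --- Illustrative sketch (not part of the original code) ---------------------
# FeatureSet2Posteriors.transform() returns per-language posterior-probability
# matrices whose rows are then L2-normalized (see the _normalize helper further
# down). A toy example of that normalization step alone:
def _example_l2_normalized_posteriors():
    import numpy as np
    from sklearn.preprocessing import normalize

    lP = {'en': np.array([[0.9, 0.1, 0.4],
                          [0.2, 0.2, 0.1]])}
    lP_l2 = {lang: normalize(P) for lang, P in lP.items()}
    print(np.linalg.norm(lP_l2['en'], axis=1))   # -> [1. 1.], every row now has unit L2 norm
    return lP_l2
# ------------------------------------------------------------------------------
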
# ------------------------------------------------------------------
# Meta-Classifier (aka second-tier learner)
# ------------------------------------------------------------------
class MetaClassifier:
def __init__(self, meta_learner, meta_parameters=None, n_jobs=-1, standardize_range=None):
self.n_jobs = n_jobs
self.model = MonolingualClassifier(base_learner=meta_learner, parameters=meta_parameters, n_jobs=n_jobs)
self.standardize_range = standardize_range
def fit(self, lZ, ly):
tinit = time.time()
Z, y = self.stack(lZ, ly)
self.standardizer = StandardizeTransformer(range=self.standardize_range)
Z = self.standardizer.fit_transform(Z)
print('fitting the Z-space of shape={}'.format(Z.shape))
self.model.fit(Z, y)
self.time = time.time() - tinit
def stack(self, lZ, ly=None):
langs = list(lZ.keys())
Z = np.vstack([lZ[lang] for lang in langs]) # Z is the language independent space
if ly is not None:
y = np.vstack([ly[lang] for lang in langs])
return Z, y
else:
return Z
def predict(self, lZ, ly=None):
lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs)
return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
def predict_proba(self, lZ, ly=None):
lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs)
return _joblib_transform_multiling(self.model.predict_proba, lZ, n_jobs=self.n_jobs)
def best_params(self):
return self.model.best_params()
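
# --- Illustrative sketch (not part of the original code) ---------------------
# MetaClassifier.stack() vertically stacks the language-specific Z spaces (and
# label matrices) into a single, language-independent training set for the
# second-tier learner. Toy example:
def _example_metaclassifier_stack():
    import numpy as np

    lZ = {'en': np.random.randn(10, 5), 'it': np.random.randn(8, 5)}
    ly = {'en': np.random.randint(0, 2, (10, 3)), 'it': np.random.randint(0, 2, (8, 3))}
    langs = list(lZ.keys())
    Z = np.vstack([lZ[lang] for lang in langs])   # shape (18, 5)
    y = np.vstack([ly[lang] for lang in langs])   # shape (18, 3)
    return Z, y
# ------------------------------------------------------------------------------
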
# ------------------------------------------------------------------
# Ensembling (aka Funnelling)
# ------------------------------------------------------------------
class Funnelling:
def __init__(self,
vectorizer: TfidfVectorizerMultilingual,
first_tier: DocEmbedderList,
meta: MetaClassifier):
self.vectorizer = vectorizer
self.first_tier = first_tier
self.meta = meta
self.n_jobs = meta.n_jobs
def fit(self, lX, ly):
tfidf_lX = self.vectorizer.fit_transform(lX, ly)
lV = self.vectorizer.vocabulary()
print('## Fitting first-tier learners!')
lZ = self.first_tier.fit_transform(lX, ly, lV, tfidf=tfidf_lX)
print('## Fitting meta-learner!')
self.meta.fit(lZ, ly)
def predict(self, lX, ly=None):
tfidf_lX = self.vectorizer.transform(lX)
lZ = self.first_tier.transform(lX, tfidf=tfidf_lX)
ly_ = self.meta.predict(lZ)
return ly_
def best_params(self):
return {'1st-tier': self.first_tier.best_params(),
'meta': self.meta.best_params()}
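
# --- Illustrative sketch (not part of the original code) ---------------------
# How the pieces are typically wired together (cf. the main scripts in this
# repository). The data arguments are hypothetical dicts {lang: raw documents}
# and {lang: binary label matrix}; PosteriorProbabilitiesEmbedder and
# get_learner are assumed to be importable from this codebase.
def _example_funnelling_usage(lXtr, lytr, lXte):
    vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
    doc_embedder = DocEmbedderList(aggregation='concat')
    doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True)))
    meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'))
    gfun = Funnelling(vectorizer=vectorizer, first_tier=doc_embedder, meta=meta)
    gfun.fit(lXtr, lytr)            # fit the first-tier view generators and the meta-classifier
    return gfun.predict(lXte)       # dict {lang: predicted label matrix}
# ------------------------------------------------------------------------------
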
class Voting:
def __init__(self, *prob_classifiers):
assert all([hasattr(p, 'predict_proba') for p in prob_classifiers]), 'not all classifiers are probabilistic'
self.prob_classifiers = prob_classifiers
def fit(self, lX, ly, lV=None):
for classifier in self.prob_classifiers:
classifier.fit(lX, ly, lV)
def predict(self, lX, ly=None):
lP = {l: [] for l in lX.keys()}
for classifier in self.prob_classifiers:
lPi = classifier.predict_proba(lX)
for l in lX.keys():
lP[l].append(lPi[l])
lP = {l: np.stack(Plist).mean(axis=0) for l, Plist in lP.items()}
ly = {l: P > 0.5 for l, P in lP.items()}
return ly
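
# --- Illustrative sketch (not part of the original code) ---------------------
# Voting.predict() averages the per-language posterior probabilities returned by
# each probabilistic classifier and thresholds the average at 0.5:
def _example_voting_average():
    import numpy as np

    p1 = {'en': np.array([[0.9, 0.2], [0.4, 0.6]])}
    p2 = {'en': np.array([[0.7, 0.4], [0.2, 0.8]])}
    avg = {l: np.stack([p1[l], p2[l]]).mean(axis=0) for l in p1}   # {'en': [[0.8, 0.3], [0.3, 0.7]]}
    pred = {l: P > 0.5 for l, P in avg.items()}                    # {'en': [[True, False], [False, True]]}
    return pred
# ------------------------------------------------------------------------------
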
# ------------------------------------------------------------------------------
# HELPERS
# ------------------------------------------------------------------------------
def load_muse_embeddings(we_path, langs, n_jobs=-1):
MUSE = Parallel(n_jobs=n_jobs)(
delayed(FastTextMUSE)(we_path, lang) for lang in langs
)
return {l: MUSE[i] for i, l in enumerate(langs)}
def word_class_embedding_matrix(X, Y, max_label_space=300):
WCE = supervised_embeddings_tfidf(X, Y)
WCE = zscores(WCE, axis=0)
nC = Y.shape[1]
if nC > max_label_space:
print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
f'Applying PCA(n_components={max_label_space})')
pca = PCA(n_components=max_label_space)
WCE = pca.fit(WCE).transform(WCE)
return WCE
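
# --- Illustrative sketch (not part of the original code) ---------------------
# A word-class embedding (WCE) matrix associates each term with a vector of its
# (tfidf-weighted) correlations to the target classes. The exact weighting used
# by supervised_embeddings_tfidf is defined elsewhere in this codebase; the
# formulation below is only an assumed, common variant shown for illustration:
def _example_word_class_embedding():
    import numpy as np

    X = np.random.rand(50, 200)                        # tfidf document-term matrix (50 docs, 200 terms)
    Y = (np.random.rand(50, 10) > 0.7).astype(float)   # binary label matrix (10 classes)
    WCE = X.T.dot(Y) / X.sum(axis=0)[:, None]          # term-by-class association, shape (200, 10)
    WCE = (WCE - WCE.mean(axis=0)) / WCE.std(axis=0)   # column-wise z-scoring, as in zscores(WCE, axis=0)
    return WCE
# ------------------------------------------------------------------------------
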
def XdotM(X, M, sif):
E = X.dot(M)
if sif:
print("removing pc...")
E = remove_pc(E, npc=1)
return E
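
# --- Illustrative sketch (not part of the original code) ---------------------
# XdotM() builds document embeddings as the product of a (weighted) document-term
# matrix X and a word-embedding matrix M; with sif=True the projection on the
# first principal component is then removed (cf. remove_pc), SIF-style:
def _example_xdotm_sif():
    import numpy as np
    from sklearn.decomposition import TruncatedSVD

    X = np.random.rand(20, 100)      # e.g., tfidf-weighted document-term matrix
    M = np.random.randn(100, 300)    # e.g., MUSE or WCE word embeddings
    E = X.dot(M)                     # document embeddings, shape (20, 300)
    svd = TruncatedSVD(n_components=1, n_iter=7, random_state=0).fit(E)
    pc = svd.components_             # first principal component, shape (1, 300)
    E_sif = E - E.dot(pc.T) * pc     # projection on the first principal component removed
    return E_sif
# ------------------------------------------------------------------------------
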
def _normalize(lX, l2=True):
return {l: normalize(X) for l, X in lX.items()} if l2 else lX
class BatchGRU:
def __init__(self, batchsize, batches_per_epoch, languages, lpad, max_pad_length=500):
self.batchsize = batchsize
self.batches_per_epoch = batches_per_epoch
self.languages = languages
self.lpad=lpad
self.max_pad_length=max_pad_length
self.init_offset()
def init_offset(self):
self.offset = {lang: 0 for lang in self.languages}
def batchify(self, l_index, l_post, l_bert, llabels):
langs = self.languages
l_num_samples = {l:len(l_index[l]) for l in langs}
max_samples = max(l_num_samples.values())
n_batches = max_samples // self.batchsize + 1 * (max_samples % self.batchsize > 0)
if self.batches_per_epoch != -1 and self.batches_per_epoch < n_batches:
n_batches = self.batches_per_epoch
for b in range(n_batches):
for lang in langs:
index, labels = l_index[lang], llabels[lang]
offset = self.offset[lang]
if offset >= l_num_samples[lang]:
offset = 0
limit = offset+self.batchsize
batch_slice = slice(offset, limit)
batch = index[batch_slice]
batch_labels = labels[batch_slice].toarray()
post = None
bert_emb = None
batch = pad(batch, pad_index=self.lpad[lang], max_pad_length=self.max_pad_length)
batch = torch.LongTensor(batch).cuda()
target = torch.FloatTensor(batch_labels).cuda()
self.offset[lang] = limit
yield batch, post, bert_emb, target, lang
def pad(index_list, pad_index, max_pad_length=None):
pad_length = np.max([len(index) for index in index_list])
if max_pad_length is not None:
pad_length = min(pad_length, max_pad_length)
for i,indexes in enumerate(index_list):
index_list[i] = [pad_index]*(pad_length-len(indexes)) + indexes[:pad_length]
return index_list
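
# --- Illustrative sketch (not part of the original code) ---------------------
# pad() (defined above) left-pads every index list with pad_index up to the
# longest sequence in the batch, truncating at max_pad_length:
def _example_pad():
    batch = [[5, 6], [1, 2, 3, 4]]
    padded = pad(batch, pad_index=0, max_pad_length=3)
    # -> [[0, 5, 6], [1, 2, 3]]: the short document is left-padded, the long one truncated
    return padded
# ------------------------------------------------------------------------------
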
def train_gru(model, batcher, ltrain_index, lytr, tinit, logfile, criterion, optim, epoch, method_name, opt,
ltrain_posteriors=None, ltrain_bert=None, log_interval=10):
_dataset_path = opt.dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1]
loss_history = []
model.train()
for idx, (batch, post, bert_emb, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, ltrain_bert, lytr)):
optim.zero_grad()
loss = criterion(model(batch, post, bert_emb, lang), target)
loss.backward()
clip_gradient(model)
optim.step()
loss_history.append(loss.item())
        if idx % log_interval == 0:
            interval_loss = np.mean(loss_history[-log_interval:])
            print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, '
                  f'Training Loss: {interval_loss:.6f}')
    mean_loss = np.mean(loss_history)
    logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time.time() - tinit)
    return mean_loss
def test_gru(model, batcher, ltest_index, ltest_posteriors, lte_bert, lyte, tinit, epoch, logfile, criterion, measure_prefix):
loss_history = []
model.eval()
langs = sorted(ltest_index.keys())
predictions = {l: [] for l in langs}
yte_stacked = {l: [] for l in langs}
batcher.init_offset()
for batch, post, bert_emb, target, lang in tqdm(batcher.batchify(ltest_index, ltest_posteriors, lte_bert, lyte),
desc='evaluation: '):
logits = model(batch, post, bert_emb, lang)
loss = criterion(logits, target).item()
prediction = predict(logits)
predictions[lang].append(prediction)
yte_stacked[lang].append(target.detach().cpu().numpy())
loss_history.append(loss)
ly = {l:np.vstack(yte_stacked[l]) for l in langs}
ly_ = {l:np.vstack(predictions[l]) for l in langs}
l_eval = evaluate(ly, ly_)
metrics = []
for lang in langs:
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
if measure_prefix == 'te':
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
mean_loss = np.mean(loss_history)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time.time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time.time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time.time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time.time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time.time() - tinit)
return Mf1
def clip_gradient(model, clip_value=1e-1):
params = list(filter(lambda p: p.grad is not None, model.parameters()))
for p in params:
p.grad.data.clamp_(-clip_value, clip_value)
def init_logfile_nn(method_name, opt):
logfile = CSVLog(opt.logfile_gru, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse'])
logfile.set_default('dataset', opt.dataset)
logfile.set_default('run', opt.seed)
logfile.set_default('method', method_name)
assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} ' \
f'and run {opt.seed} already calculated'
return logfile

View File

@ -1,144 +0,0 @@
import os
from dataset_builder import MultilingualDataset
from learning.transformers import *
from util.evaluation import *
from util.file import exists
from util.results import PolylingualClassificationResults
from util.common import *
from util.parser_options import *
if __name__ == '__main__':
(op, args) = parser.parse_args()
dataset = op.dataset
assert exists(dataset), 'Unable to find file '+str(dataset)
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
assert op.posteriors or op.supervised or op.pretrained or op.mbert or op.gruViewGenerator, \
'empty set of document embeddings is not allowed'
    assert not (op.gruWCE or op.gruMUSE) or op.gruViewGenerator, 'Initializing the GRU embedding layer (MUSE/WCE) ' \
                                                                 'requires the GRU View Generator to be enabled explicitly'
l2 = op.l2
dataset_file = os.path.basename(dataset)
results = PolylingualClassificationResults('../log/' + op.output)
allprob = 'Prob' if op.allprob else ''
# renaming arguments to be printed on log
method_name, dataset_name = get_method_name(dataset, op.posteriors, op.supervised, op.pretrained, op.mbert,
op.gruViewGenerator, op.gruMUSE, op.gruWCE, op.agg, op.allprob)
print(f'Method: gFun{method_name}\nDataset: {dataset_name}')
print('-'*50)
    # set the z-score range - with slice(0, 0) the mean is set to 0 and the std to 1, so that standardization has no effect
standardize_range = slice(0, 0)
if op.zscore:
standardize_range = None
# load dataset
data = MultilingualDataset.load(dataset)
data.set_view(languages=['nl', 'it']) # TODO: DEBUG SETTING
data.show_dimensions()
lXtr, lytr = data.training()
lXte, lyte = data.test()
# text preprocessing
tfidfvectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
# feature weighting (for word embeddings average)
feat_weighting = FeatureWeight(op.feat_weight, agg='mean')
# document embedding modules aka View Generators
doc_embedder = DocEmbedderList(aggregation='mean' if op.agg else 'concat')
# init View Generators
if op.posteriors:
"""
View Generator (-X): cast document representations encoded via TFIDF into posterior probabilities by means
of a set of SVM.
"""
doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True,
kernel='linear',
C=op.set_c), l2=l2))
if op.supervised:
"""
View Generator (-W): generates document representation via Word-Class-Embeddings.
Document embeddings are obtained via weighted sum of document's constituent embeddings.
"""
wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, sif=op.sif)
if op.allprob:
wce = FeatureSet2Posteriors(wce, requires_tfidf=True, l2=l2)
doc_embedder.append(wce)
if op.pretrained:
"""
View Generator (-M): generates document representation via MUSE embeddings (Fasttext multilingual word
embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings.
"""
muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif)
if op.allprob:
muse = FeatureSet2Posteriors(muse, requires_tfidf=True, l2=l2)
doc_embedder.append(muse)
if op.gruViewGenerator:
"""
View Generator (-G): generates document embedding by means of a Gated Recurrent Units. The model can be
initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,). Such
document embeddings are then casted into vectors of posterior probabilities via a set of SVM.
    NB: --allprob won't have any effect on this View Generator, since its output is already encoded as posterior probabilities
"""
op.gru_path = '/home/andreapdr/funneling_pdr/checkpoint/gru_viewgen_-rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle' # TODO DEBUG
op.gru_path = None
rnn_embedder = RecurrentEmbedder(pretrained=op.gruMUSE, supervised=op.gruWCE, multilingual_dataset=data,
options=op, model_path=op.gru_path, we_path=op.we_path)
doc_embedder.append(rnn_embedder)
if op.mbert:
"""
View generator (-B): generates document embedding via mBERT model.
"""
op.bert_path = '/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-rcv1-2_run0' # TODO DEBUG
mbert = MBertEmbedder(path_to_model=op.bert_path,
nC=data.num_categories())
if op.allprob:
mbert = FeatureSet2Posteriors(mbert, l2=l2)
doc_embedder.append(mbert)
# metaclassifier
meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf', C=op.set_c),
meta_parameters=get_params(op.optimc), standardize_range=standardize_range)
# ensembling the modules
classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)
print('\n# Fitting Funnelling Architecture...')
tinit = time.time()
classifier.fit(lXtr, lytr)
time = time.time()-tinit
print('\n# Evaluating ...')
l_eval = evaluate_method(classifier, lXte, lyte)
metrics = []
for lang in lXte.keys():
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
results.add_row(method='MultiModal',
learner='SVM',
optimp=op.optimc,
sif=op.sif,
zscore=op.zscore,
l2=op.l2,
wescaler=op.feat_weight,
pca=op.max_labels_S,
id=method_name,
dataset=dataset_name,
time=time,
lang=lang,
macrof1=macrof1,
microf1=microf1,
macrok=macrok,
microk=microk,
notes='')
print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3))

View File

@ -1,42 +0,0 @@
import torch.nn as nn
from torch.nn import functional as F
import torch
class CNN_pdr(nn.Module):
def __init__(self, output_size, out_channels, compositional_dim, vocab_size, emb_dim, embeddings=None, drop_embedding_range=None,
drop_embedding_prop=0, drop_prob=0.5):
super(CNN_pdr, self).__init__()
self.vocab_size = vocab_size
self.emb_dim = emb_dim
self.embeddings = torch.FloatTensor(embeddings)
self.embedding_layer = nn.Embedding(vocab_size, emb_dim, _weight=self.embeddings)
self.kernel_heights = kernel_heights=[3,5,7]
self.stride = 1
self.padding = 0
self.drop_embedding_range = drop_embedding_range
self.drop_embedding_prop = drop_embedding_prop
assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: wrong range'
self.nC = 73
self.conv1 = nn.Conv2d(1, compositional_dim, (self.kernel_heights[0], self.emb_dim), self.stride, self.padding)
self.dropout = nn.Dropout(drop_prob)
self.label = nn.Linear(len(kernel_heights) * out_channels, output_size)
self.fC = nn.Linear(compositional_dim + self.nC, self.nC)
def forward(self, x, svm_output):
x = torch.LongTensor(x)
svm_output = torch.FloatTensor(svm_output)
x = self.embedding_layer(x)
x = self.conv1(x.unsqueeze(1))
x = F.relu(x.squeeze(3))
x = F.max_pool1d(x, x.size()[2]).squeeze(2)
x = torch.cat((x, svm_output), 1)
x = F.sigmoid(self.fC(x))
return x #.detach().numpy()
# logits = self.label(x)
# return logits

View File

@ -3,25 +3,29 @@ import torch.nn as nn
from torch.nn import functional as F
def init_embeddings(pretrained, vocab_size, learnable_length, device='cuda'):
def init_embeddings(pretrained, vocab_size, learnable_length):
"""
Compute the embedding matrix
:param pretrained:
:param vocab_size:
:param learnable_length:
:return:
"""
pretrained_embeddings = None
pretrained_length = 0
if pretrained is not None:
pretrained_length = pretrained.shape[1]
assert pretrained.shape[0] == vocab_size, 'pre-trained matrix does not match with the vocabulary size'
pretrained_embeddings = nn.Embedding(vocab_size, pretrained_length)
# requires_grad=False sets the embedding layer as NOT trainable
pretrained_embeddings.weight = nn.Parameter(pretrained, requires_grad=False)
# pretrained_embeddings.to(device)
learnable_embeddings = None
if learnable_length > 0:
learnable_embeddings = nn.Embedding(vocab_size, learnable_length)
# learnable_embeddings.to(device)
embedding_length = learnable_length + pretrained_length
assert embedding_length > 0, '0-size embeddings'
return pretrained_embeddings, learnable_embeddings, embedding_length
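
# --- Illustrative sketch (not part of the original file) ----------------------
# init_embeddings() (defined above, assumed in scope) returns a frozen embedding
# layer built from the pretrained matrix plus an optional trainable one; their
# outputs are concatenated along the last dimension (cf. RecurrentModel.embed):
def _example_init_embeddings():
    import torch
    vocab_size, learnable_dim = 100, 50
    pretrained = torch.randn(vocab_size, 300)            # stand-in for MUSE (+WCE) vectors
    pre_emb, learn_emb, emb_len = init_embeddings(pretrained, vocab_size, learnable_dim)
    ids = torch.randint(0, vocab_size, (2, 7))           # a toy batch of token ids
    out = torch.cat([pre_emb(ids), learn_emb(ids)], dim=2)
    assert out.shape == (2, 7, emb_len)                  # 300 frozen + 50 trainable dimensions
    return out
# ------------------------------------------------------------------------------
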

View File

@ -1,9 +1,24 @@
import numpy as np
import time
from scipy.sparse import issparse
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np
from joblib import Parallel, delayed
from scipy.sparse import issparse
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from src.util.standardizer import StandardizeTransformer
def get_learner(calibrate=False, kernel='linear', C=1):
"""
instantiate scikit Support Vector Classifier
:param calibrate: boolean, whether to return posterior probabilities or not
:param kernel: string,kernel to be applied to the SVC
:param C: int or dict {'C': list of integer}, Regularization parameter
:return: Support Vector Classifier
"""
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=C, random_state=1, gamma='auto', verbose=False)
def _sort_if_sparse(X):
@ -13,7 +28,7 @@ def _sort_if_sparse(X):
def _joblib_transform_multiling(transformer, lX, n_jobs=-1):
if n_jobs == 1:
return {lang:transformer(lX[lang]) for lang in lX.keys()}
return {lang: transformer(lX[lang]) for lang in lX.keys()}
else:
langs = list(lX.keys())
transformations = Parallel(n_jobs=n_jobs)(delayed(transformer)(lX[lang]) for lang in langs)
@ -25,11 +40,11 @@ class TrivialRejector:
self.cats = y.shape[1]
return self
def decision_function(self, X): return np.zeros((X.shape[0],self.cats))
def decision_function(self, X): return np.zeros((X.shape[0], self.cats))
def predict(self, X): return np.zeros((X.shape[0],self.cats))
def predict(self, X): return np.zeros((X.shape[0], self.cats))
def predict_proba(self, X): return np.zeros((X.shape[0],self.cats))
def predict_proba(self, X): return np.zeros((X.shape[0], self.cats))
def best_params(self): return {}
@ -38,6 +53,7 @@ class NaivePolylingualClassifier:
"""
    A mere set of independent MonolingualClassifiers
"""
def __init__(self, base_learner, parameters=None, n_jobs=-1):
self.base_learner = base_learner
self.parameters = parameters
@ -58,10 +74,11 @@ class NaivePolylingualClassifier:
_sort_if_sparse(lX[lang])
models = Parallel(n_jobs=self.n_jobs)\
(delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]),ly[lang]) for lang in langs)
(delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]), ly[lang]) for
lang in langs)
self.model = {lang: models[i] for i, lang in enumerate(langs)}
self.empty_categories = {lang:self.model[lang].empty_categories for lang in langs}
self.empty_categories = {lang: self.model[lang].empty_categories for lang in langs}
self.time = time.time() - tinit
return self
@ -72,9 +89,9 @@ class NaivePolylingualClassifier:
"""
assert self.model is not None, 'predict called before fit'
assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function'
langs=list(lX.keys())
langs = list(lX.keys())
scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].decision_function)(lX[lang]) for lang in langs)
return {lang:scores[i] for i,lang in enumerate(langs)}
return {lang: scores[i] for i, lang in enumerate(langs)}
def predict_proba(self, lX):
"""
@ -83,9 +100,10 @@ class NaivePolylingualClassifier:
"""
assert self.model is not None, 'predict called before fit'
assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function'
langs=list(lX.keys())
scores = Parallel(n_jobs=self.n_jobs, max_nbytes=None)(delayed(self.model[lang].predict_proba)(lX[lang]) for lang in langs)
return {lang:scores[i] for i,lang in enumerate(langs)}
langs = list(lX.keys())
scores = Parallel(n_jobs=self.n_jobs, max_nbytes=None)(
delayed(self.model[lang].predict_proba)(lX[lang]) for lang in langs)
return {lang: scores[i] for i, lang in enumerate(langs)}
def predict(self, lX):
"""
@ -95,14 +113,14 @@ class NaivePolylingualClassifier:
assert self.model is not None, 'predict called before fit'
assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in predict'
if self.n_jobs == 1:
return {lang:self.model[lang].transform(lX[lang]) for lang in lX.keys()}
return {lang: self.model[lang].transform(lX[lang]) for lang in lX.keys()}
else:
langs = list(lX.keys())
scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].predict)(lX[lang]) for lang in langs)
return {lang: scores[i] for i, lang in enumerate(langs)}
def best_params(self):
return {l:model.best_params() for l,model in self.model.items()}
return {lang: model.best_params() for lang, model in self.model.items()}
class MonolingualClassifier:
@ -117,14 +135,13 @@ class MonolingualClassifier:
def fit(self, X, y):
if X.shape[0] == 0:
print('Warning: X has 0 elements, a trivial rejector will be created')
self.model = TrivialRejector().fit(X,y)
self.model = TrivialRejector().fit(X, y)
self.empty_categories = np.arange(y.shape[1])
return self
tinit = time.time()
_sort_if_sparse(X)
self.empty_categories = np.argwhere(np.sum(y, axis=0)==0).flatten()
self.empty_categories = np.argwhere(np.sum(y, axis=0) == 0).flatten()
# multi-class format
if len(y.shape) == 2:
if self.parameters is not None:
@ -142,13 +159,12 @@ class MonolingualClassifier:
self.model = GridSearchCV(self.model, param_grid=self.parameters, refit=True, cv=5, n_jobs=self.n_jobs,
error_score=0, verbose=10)
# print(f'fitting: {self.model} on matrices of shape X={X.shape} Y={y.shape}')
print(f'fitting: Mono-lingual Classifier on matrices of shape X={X.shape} Y={y.shape}')
self.model.fit(X, y)
if isinstance(self.model, GridSearchCV):
self.best_params_ = self.model.best_params_
print('best parameters: ', self.best_params_)
self.time=time.time()-tinit
self.time = time.time() - tinit
return self
def decision_function(self, X):
@ -168,4 +184,41 @@ class MonolingualClassifier:
return self.model.predict(X)
def best_params(self):
return self.best_params_
return self.best_params_
class MetaClassifier:
def __init__(self, meta_learner, meta_parameters=None, n_jobs=-1, standardize_range=None):
self.n_jobs = n_jobs
self.model = MonolingualClassifier(base_learner=meta_learner, parameters=meta_parameters, n_jobs=n_jobs)
self.standardize_range = standardize_range
def fit(self, lZ, ly):
tinit = time.time()
Z, y = self.stack(lZ, ly)
self.standardizer = StandardizeTransformer(range=self.standardize_range)
Z = self.standardizer.fit_transform(Z)
print('fitting the Z-space of shape={}'.format(Z.shape))
self.model.fit(Z, y)
self.time = time.time() - tinit
def stack(self, lZ, ly=None):
langs = list(lZ.keys())
Z = np.vstack([lZ[lang] for lang in langs])
if ly is not None:
y = np.vstack([ly[lang] for lang in langs])
return Z, y
else:
return Z
def predict(self, lZ):
lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs)
return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
def predict_proba(self, lZ):
lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs)
return _joblib_transform_multiling(self.model.predict_proba, lZ, n_jobs=self.n_jobs)

View File

@ -1,8 +1,6 @@
#taken from https://github.com/prakashpandey9/Text-Classification-Pytorch/blob/master/models/LSTM.py
import torch
import torch.nn as nn
from torch.autograd import Variable
from models.helpers import *
from torch.autograd import Variable
class RNNMultilingualClassifier(nn.Module):

View File

@ -1,249 +0,0 @@
from copy import deepcopy
import torch
from torch.utils.data import Dataset
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, BertConfig
from sklearn.model_selection import train_test_split
from util.evaluation import *
from time import time
def predict(logits, classification_type='multilabel'):
if classification_type == 'multilabel':
prediction = torch.sigmoid(logits) > 0.5
elif classification_type == 'singlelabel':
prediction = torch.argmax(logits, dim=1).view(-1, 1)
    else:
        raise ValueError(f'unknown classification type {classification_type}')
return prediction.detach().cpu().numpy()
class TrainingDataset(Dataset):
"""
data: dict of lang specific tokenized data
labels: dict of lang specific targets
"""
def __init__(self, data, labels):
self.langs = data.keys()
self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)}
for i, lang in enumerate(self.langs):
_data = data[lang]['input_ids']
_data = np.array(_data)
_labels = labels[lang]
_lang_value = np.full(len(_data), self.lang_ids[lang])
if i == 0:
self.data = _data
self.labels = _labels
self.lang_index = _lang_value
else:
self.data = np.vstack((self.data, _data))
self.labels = np.vstack((self.labels, _labels))
self.lang_index = np.concatenate((self.lang_index, _lang_value))
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
x = self.data[idx]
y = self.labels[idx]
lang = self.lang_index[idx]
return x, torch.tensor(y, dtype=torch.float), lang
def get_lang_ids(self):
return self.lang_ids
def get_nclasses(self):
if hasattr(self, 'labels'):
return len(self.labels[0])
else:
print('Method called before init!')
class ExtractorDataset(Dataset):
"""
data: dict of lang specific tokenized data
labels: dict of lang specific targets
"""
def __init__(self, data):
self.langs = data.keys()
self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)}
for i, lang in enumerate(self.langs):
_data = data[lang]['input_ids']
_data = np.array(_data)
_lang_value = np.full(len(_data), self.lang_ids[lang])
if i == 0:
self.data = _data
self.lang_index = _lang_value
else:
self.data = np.vstack((self.data, _data))
self.lang_index = np.concatenate((self.lang_index, _lang_value))
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
x = self.data[idx]
lang = self.lang_index[idx]
return x, lang
def get_lang_ids(self):
return self.lang_ids
def get_model(n_out):
print('# Initializing model ...')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=n_out)
return model
def init_optimizer(model, lr, weight_decay=0):
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in model.named_parameters()
if not any(nd in n for nd in no_decay)],
'weight_decay': weight_decay},
{'params': [p for n, p in model.named_parameters()
if any(nd in n for nd in no_decay)],
'weight_decay': weight_decay}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
return optimizer
def get_lr(optimizer):
for param_group in optimizer.param_groups:
return param_group['lr']
def get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop, max_val, seed):
l_split_va = deepcopy(l_tokenized_tr)
l_split_val_target = {l: [] for l in l_tokenized_tr.keys()}
l_split_tr = deepcopy(l_tokenized_tr)
l_split_tr_target = {l: [] for l in l_tokenized_tr.keys()}
for lang in l_tokenized_tr.keys():
val_size = int(min(len(l_tokenized_tr[lang]['input_ids']) * val_prop, max_val))
l_split_tr[lang]['input_ids'], l_split_va[lang]['input_ids'], l_split_tr_target[lang], l_split_val_target[
lang] = \
train_test_split(l_tokenized_tr[lang]['input_ids'], l_devel_target[lang], test_size=val_size,
random_state=seed, shuffle=True)
return l_split_tr, l_split_tr_target, l_split_va, l_split_val_target
def do_tokenization(l_dataset, max_len=512, verbose=True):
if verbose:
print('# Starting Tokenization ...')
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
langs = l_dataset.keys()
l_tokenized = {}
for lang in langs:
l_tokenized[lang] = tokenizer(l_dataset[lang],
truncation=True,
max_length=max_len,
padding='max_length')
return l_tokenized
def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile, log_interval=10):
# _dataset_path = opt.dataset.split('/')[-1].split('_')
# dataset_id = _dataset_path[0] + _dataset_path[-1]
dataset_id = 'TODO fix this!'
loss_history = []
model.train()
for idx, (batch, target, lang_idx) in enumerate(train_dataloader):
optim.zero_grad()
out = model(batch.cuda())
logits = out[0]
loss = criterion(logits, target.cuda())
loss.backward()
# clip_gradient(model)
optim.step()
loss_history.append(loss.item())
        if idx % log_interval == 0:
            interval_loss = np.mean(loss_history[-log_interval:])
            print(
                f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
    mean_loss = np.mean(loss_history)
    logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
    return mean_loss
def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, measure_prefix):
print('# Validating model ...')
loss_history = []
model.eval()
langs = lang_ids.keys()
id_2_lang = {v: k for k, v in lang_ids.items()}
predictions = {l: [] for l in langs}
yte_stacked = {l: [] for l in langs}
for batch, target, lang_idx in test_dataloader:
out = model(batch.cuda())
logits = out[0]
loss = criterion(logits, target.cuda()).item()
prediction = predict(logits)
loss_history.append(loss)
# Assigning prediction to dict in predictions and yte_stacked according to lang_idx
for i, pred in enumerate(prediction):
lang_pred = id_2_lang[lang_idx.numpy()[i]]
predictions[lang_pred].append(pred)
yte_stacked[lang_pred].append(target[i].detach().cpu().numpy())
ly = {l: np.vstack(yte_stacked[l]) for l in langs}
ly_ = {l: np.vstack(predictions[l]) for l in langs}
l_eval = evaluate(ly, ly_)
metrics = []
for lang in langs:
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
if measure_prefix == 'te':
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
mean_loss = np.mean(loss_history)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit)
return Mf1
def feature_extractor(data, lang_ids, model):
    """
    Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for
    the output of each layer) of shape (batch_size, sequence_length, hidden_size)
    """
    print('# Feature Extractor Mode...')
all_batch_embeddings = {}
id2lang = {v: k for k, v in lang_ids.items()}
with torch.no_grad():
for batch, lang_idx in data:
# for batch, target, lang_idx in data:
out = model(batch.cuda())
last_hidden_state = out[1][-1]
batch_embeddings = last_hidden_state[:, 0, :]
for i, l_idx in enumerate(lang_idx.numpy()):
if id2lang[l_idx] not in all_batch_embeddings.keys():
all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy()
else:
all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]],
batch_embeddings[i].detach().cpu().numpy()))
return all_batch_embeddings, id2lang

188
src/models/pl_bert.py Normal file
View File

@ -0,0 +1,188 @@
import pytorch_lightning as pl
import torch
from torch.optim.lr_scheduler import StepLR
from transformers import BertForSequenceClassification, AdamW
from src.util.common import define_pad_length, pad
from src.util.pl_metrics import CustomF1, CustomK
class BertModel(pl.LightningModule):
def __init__(self, output_size, stored_path, gpus=None):
"""
Init Bert model.
:param output_size:
:param stored_path:
:param gpus:
"""
super().__init__()
self.loss = torch.nn.BCEWithLogitsLoss()
self.gpus = gpus
self.microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus)
self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)
# Language specific metrics to compute metrics at epoch level
self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
self.lang_macroK = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
self.lang_microK = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
if stored_path:
self.bert = BertForSequenceClassification.from_pretrained(stored_path,
num_labels=output_size,
output_hidden_states=True)
else:
self.bert = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased',
num_labels=output_size,
output_hidden_states=True)
self.save_hyperparameters()
def forward(self, X):
logits = self.bert(X)
return logits
def training_step(self, train_batch, batch_idx):
X, y, _, batch_langs = train_batch
X = torch.cat(X).view([X[0].shape[0], len(X)])
y = y.type(torch.FloatTensor)
y = y.to('cuda' if self.gpus else 'cpu')
logits, _ = self.forward(X)
loss = self.loss(logits, y)
# Squashing logits through Sigmoid in order to get confidence score
predictions = torch.sigmoid(logits) > 0.5
microF1 = self.microF1(predictions, y)
macroF1 = self.macroF1(predictions, y)
microK = self.microK(predictions, y)
macroK = self.macroK(predictions, y)
self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True)
self.log('train-macroF1', macroF1, on_step=True, on_epoch=True, prog_bar=False, logger=True)
self.log('train-microF1', microF1, on_step=True, on_epoch=True, prog_bar=False, logger=True)
self.log('train-macroK', macroK, on_step=True, on_epoch=True, prog_bar=False, logger=True)
self.log('train-microK', microK, on_step=True, on_epoch=True, prog_bar=False, logger=True)
lX, ly = self._reconstruct_dict(predictions, y, batch_langs)
return {'loss': loss, 'pred': lX, 'target': ly}
def training_epoch_end(self, outputs):
langs = []
for output in outputs:
langs.extend(list(output['pred'].keys()))
langs = set(langs)
        # outputs is a list of n dicts of m elements, where n equals the number of epoch steps and m is the batch size.
        # here we save epoch-level metric values and compute them specifically for each language
res_macroF1 = {lang: [] for lang in langs}
res_microF1 = {lang: [] for lang in langs}
res_macroK = {lang: [] for lang in langs}
res_microK = {lang: [] for lang in langs}
for output in outputs:
lX, ly = output['pred'], output['target']
for lang in lX.keys():
X, y = lX[lang], ly[lang]
lang_macroF1 = self.lang_macroF1(X, y)
lang_microF1 = self.lang_microF1(X, y)
lang_macroK = self.lang_macroK(X, y)
lang_microK = self.lang_microK(X, y)
res_macroF1[lang].append(lang_macroF1)
res_microF1[lang].append(lang_microF1)
res_macroK[lang].append(lang_macroK)
res_microK[lang].append(lang_microK)
for lang in langs:
avg_macroF1 = torch.mean(torch.Tensor(res_macroF1[lang]))
avg_microF1 = torch.mean(torch.Tensor(res_microF1[lang]))
avg_macroK = torch.mean(torch.Tensor(res_macroK[lang]))
avg_microK = torch.mean(torch.Tensor(res_microK[lang]))
self.logger.experiment.add_scalars('train-langs-macroF1', {f'{lang}': avg_macroF1}, self.current_epoch)
self.logger.experiment.add_scalars('train-langs-microF1', {f'{lang}': avg_microF1}, self.current_epoch)
self.logger.experiment.add_scalars('train-langs-macroK', {f'{lang}': avg_macroK}, self.current_epoch)
self.logger.experiment.add_scalars('train-langs-microK', {f'{lang}': avg_microK}, self.current_epoch)
def validation_step(self, val_batch, batch_idx):
X, y, _, batch_langs = val_batch
X = torch.cat(X).view([X[0].shape[0], len(X)])
y = y.type(torch.FloatTensor)
y = y.to('cuda' if self.gpus else 'cpu')
logits, _ = self.forward(X)
loss = self.loss(logits, y)
predictions = torch.sigmoid(logits) > 0.5
microF1 = self.microF1(predictions, y)
macroF1 = self.macroF1(predictions, y)
microK = self.microK(predictions, y)
macroK = self.macroK(predictions, y)
self.log('val-loss', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
self.log('val-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log('val-microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log('val-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log('val-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
return {'loss': loss}
def test_step(self, test_batch, batch_idx):
X, y, _, batch_langs = test_batch
X = torch.cat(X).view([X[0].shape[0], len(X)])
y = y.type(torch.FloatTensor)
y = y.to('cuda' if self.gpus else 'cpu')
logits, _ = self.forward(X)
loss = self.loss(logits, y)
# Squashing logits through Sigmoid in order to get confidence score
predictions = torch.sigmoid(logits) > 0.5
microF1 = self.microF1(predictions, y)
macroF1 = self.macroF1(predictions, y)
microK = self.microK(predictions, y)
macroK = self.macroK(predictions, y)
self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True)
self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True)
self.log('test-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log('test-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
return
def configure_optimizers(self, lr=3e-5, weight_decay=0.01):
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in self.bert.named_parameters()
if not any(nd in n for nd in no_decay)],
'weight_decay': weight_decay},
{'params': [p for n, p in self.bert.named_parameters()
if any(nd in n for nd in no_decay)],
'weight_decay': weight_decay}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
scheduler = StepLR(optimizer, step_size=25, gamma=0.1)
return [optimizer], [scheduler]
def encode(self, lX, batch_size=64):
with torch.no_grad():
l_embed = {lang: [] for lang in lX.keys()}
for lang in sorted(lX.keys()):
for i in range(0, len(lX[lang]), batch_size):
if i + batch_size > len(lX[lang]):
batch = lX[lang][i:len(lX[lang])]
else:
batch = lX[lang][i:i + batch_size]
max_pad_len = define_pad_length(batch)
batch = pad(batch, pad_index=self.bert.config.pad_token_id, max_pad_length=max_pad_len)
batch = torch.LongTensor(batch).to('cuda' if self.gpus else 'cpu')
_, output = self.forward(batch)
# deleting batch from gpu to avoid cuda OOM
del batch
torch.cuda.empty_cache()
doc_embeds = output[-1][:, 0, :]
l_embed[lang].append(doc_embeds.cpu())
for k, v in l_embed.items():
l_embed[k] = torch.cat(v, dim=0).numpy()
return l_embed
@staticmethod
def _reconstruct_dict(predictions, y, batch_langs):
reconstructed_x = {lang: [] for lang in set(batch_langs)}
reconstructed_y = {lang: [] for lang in set(batch_langs)}
for i, pred in enumerate(predictions):
reconstructed_x[batch_langs[i]].append(pred)
reconstructed_y[batch_langs[i]].append(y[i])
for k, v in reconstructed_x.items():
reconstructed_x[k] = torch.cat(v).view(-1, predictions.shape[1])
for k, v in reconstructed_y.items():
reconstructed_y[k] = torch.cat(v).view(-1, predictions.shape[1])
return reconstructed_x, reconstructed_y

266
src/models/pl_gru.py Normal file
View File

@ -0,0 +1,266 @@
# Lightning modules, see https://pytorch-lightning.readthedocs.io/en/latest/lightning_module.html
import pytorch_lightning as pl
import torch
import torch.nn.functional as F
from torch import nn
from torch.autograd import Variable
from torch.optim.lr_scheduler import StepLR
from transformers import AdamW
from src.models.helpers import init_embeddings
from src.util.common import define_pad_length, pad
from src.util.pl_metrics import CustomF1, CustomK
class RecurrentModel(pl.LightningModule):
def __init__(self, lPretrained, langs, output_size, hidden_size, lVocab_size, learnable_length,
drop_embedding_range, drop_embedding_prop, gpus=None):
"""
Init RNN model.
:param lPretrained:
:param langs:
:param output_size:
:param hidden_size:
:param lVocab_size:
:param learnable_length:
:param drop_embedding_range:
:param drop_embedding_prop:
:param gpus:
"""
super().__init__()
self.gpus = gpus
self.langs = langs
self.lVocab_size = lVocab_size
self.learnable_length = learnable_length
self.output_size = output_size
self.hidden_size = hidden_size
self.drop_embedding_range = drop_embedding_range
self.drop_embedding_prop = drop_embedding_prop
self.loss = torch.nn.BCEWithLogitsLoss()
self.microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus)
self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)
# Language specific metrics to compute metrics at epoch level
self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
self.lang_macroK = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
self.lang_microK = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
self.lPretrained_embeddings = nn.ModuleDict()
self.lLearnable_embeddings = nn.ModuleDict()
self.n_layers = 1
self.n_directions = 1
self.dropout = nn.Dropout(0.6)
lstm_out = 256
ff1 = 512
ff2 = 256
lpretrained_embeddings = {}
llearnable_embeddings = {}
for lang in self.langs:
pretrained = lPretrained[lang] if lPretrained else None
pretrained_embeddings, learnable_embeddings, embedding_length = init_embeddings(
pretrained, self.lVocab_size[lang], self.learnable_length)
lpretrained_embeddings[lang] = pretrained_embeddings
llearnable_embeddings[lang] = learnable_embeddings
self.embedding_length = embedding_length
self.lPretrained_embeddings.update(lpretrained_embeddings)
self.lLearnable_embeddings.update(llearnable_embeddings)
self.rnn = nn.GRU(self.embedding_length, hidden_size)
self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out)
self.linear1 = nn.Linear(lstm_out, ff1)
self.linear2 = nn.Linear(ff1, ff2)
self.label = nn.Linear(ff2, self.output_size)
        # TODO: lPretrained is set to None before save_hyperparameters(); keeping its original value seems to
        #  "bug" the first validation step (the checkpoint would also store the large pretrained matrices,
        #  making the saving process too slow)
lPretrained = None
self.save_hyperparameters()
def forward(self, lX):
l_embed = []
for lang in sorted(lX.keys()):
doc_embedding = self.transform(lX[lang], lang)
l_embed.append(doc_embedding)
embed = torch.cat(l_embed, dim=0)
logits = self.label(embed)
return logits
def transform(self, X, lang):
batch_size = X.shape[0]
X = self.embed(X, lang)
X = self.embedding_dropout(X, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop,
training=self.training)
X = X.permute(1, 0, 2)
h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size).to(self.device))
output, _ = self.rnn(X, h_0)
output = output[-1, :, :]
output = F.relu(self.linear0(output))
output = self.dropout(F.relu(self.linear1(output)))
output = self.dropout(F.relu(self.linear2(output)))
return output
def encode(self, lX, l_pad, batch_size=128):
"""
        Returns the encoded data (i.e., the RNN hidden state projected through the second feed-forward layer, linear1). Dimensionality is 512.
:param lX:
:param l_pad:
:param batch_size:
:return:
"""
with torch.no_grad():
l_embed = {lang: [] for lang in lX.keys()}
for lang in sorted(lX.keys()):
for i in range(0, len(lX[lang]), batch_size):
if i+batch_size > len(lX[lang]):
batch = lX[lang][i:len(lX[lang])]
else:
batch = lX[lang][i:i+batch_size]
max_pad_len = define_pad_length(batch)
batch = pad(batch, pad_index=l_pad[lang], max_pad_length=max_pad_len)
X = torch.LongTensor(batch).to('cuda' if self.gpus else 'cpu')
_batch_size = X.shape[0]
X = self.embed(X, lang)
X = self.embedding_dropout(X, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop,
training=self.training)
X = X.permute(1, 0, 2)
h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, _batch_size, self.hidden_size).to(self.device))
output, _ = self.rnn(X, h_0)
output = output[-1, :, :]
output = F.relu(self.linear0(output))
output = self.dropout(F.relu(self.linear1(output)))
l_embed[lang].append(output.cpu())
for k, v in l_embed.items():
l_embed[k] = torch.cat(v, dim=0).numpy()
return l_embed
def training_step(self, train_batch, batch_idx):
lX, ly = train_batch
logits = self.forward(lX)
_ly = []
for lang in sorted(lX.keys()):
_ly.append(ly[lang])
y = torch.cat(_ly, dim=0)
loss = self.loss(logits, y)
# Squashing logits through Sigmoid in order to get confidence score
predictions = torch.sigmoid(logits) > 0.5
microF1 = self.microF1(predictions, y)
macroF1 = self.macroF1(predictions, y)
microK = self.microK(predictions, y)
macroK = self.macroK(predictions, y)
self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True)
self.log('train-macroF1', macroF1, on_step=True, on_epoch=True, prog_bar=False, logger=True)
self.log('train-microF1', microF1, on_step=True, on_epoch=True, prog_bar=False, logger=True)
self.log('train-macroK', macroK, on_step=True, on_epoch=True, prog_bar=False, logger=True)
self.log('train-microK', microK, on_step=True, on_epoch=True, prog_bar=False, logger=True)
re_lX = self._reconstruct_dict(predictions, ly)
return {'loss': loss, 'pred': re_lX, 'target': ly}
def training_epoch_end(self, outputs):
        # outputs is a list of n dicts of m elements, where n equals the number of epoch steps and m is the batch size.
        # here we save epoch-level metric values and compute them specifically for each language
res_macroF1 = {lang: [] for lang in self.langs}
res_microF1 = {lang: [] for lang in self.langs}
res_macroK = {lang: [] for lang in self.langs}
res_microK = {lang: [] for lang in self.langs}
for output in outputs:
lX, ly = output['pred'], output['target']
for lang in lX.keys():
X, y = lX[lang], ly[lang]
lang_macroF1 = self.lang_macroF1(X, y)
lang_microF1 = self.lang_microF1(X, y)
lang_macroK = self.lang_macroK(X, y)
lang_microK = self.lang_microK(X, y)
res_macroF1[lang].append(lang_macroF1)
res_microF1[lang].append(lang_microF1)
res_macroK[lang].append(lang_macroK)
res_microK[lang].append(lang_microK)
for lang in self.langs:
avg_macroF1 = torch.mean(torch.Tensor(res_macroF1[lang]))
avg_microF1 = torch.mean(torch.Tensor(res_microF1[lang]))
avg_macroK = torch.mean(torch.Tensor(res_macroK[lang]))
avg_microK = torch.mean(torch.Tensor(res_microK[lang]))
self.logger.experiment.add_scalars('train-langs-macroF1', {f'{lang}': avg_macroF1}, self.current_epoch)
self.logger.experiment.add_scalars('train-langs-microF1', {f'{lang}': avg_microF1}, self.current_epoch)
self.logger.experiment.add_scalars('train-langs-macroK', {f'{lang}': avg_macroK}, self.current_epoch)
self.logger.experiment.add_scalars('train-langs-microK', {f'{lang}': avg_microK}, self.current_epoch)
def validation_step(self, val_batch, batch_idx):
lX, ly = val_batch
logits = self.forward(lX)
_ly = []
for lang in sorted(lX.keys()):
_ly.append(ly[lang])
ly = torch.cat(_ly, dim=0)
loss = self.loss(logits, ly)
predictions = torch.sigmoid(logits) > 0.5
microF1 = self.microF1(predictions, ly)
macroF1 = self.macroF1(predictions, ly)
microK = self.microK(predictions, ly)
macroK = self.macroK(predictions, ly)
self.log('val-loss', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
self.log('val-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log('val-microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log('val-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log('val-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
return {'loss': loss}
def test_step(self, test_batch, batch_idx):
lX, ly = test_batch
logits = self.forward(lX)
_ly = []
for lang in sorted(lX.keys()):
_ly.append(ly[lang])
ly = torch.cat(_ly, dim=0)
predictions = torch.sigmoid(logits) > 0.5
microF1 = self.microF1(predictions, ly)
macroF1 = self.macroF1(predictions, ly)
microK = self.microK(predictions, ly)
macroK = self.macroK(predictions, ly)
self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True)
self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True)
self.log('test-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log('test-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
return
def embed(self, X, lang):
input_list = []
if self.lPretrained_embeddings[lang]:
input_list.append(self.lPretrained_embeddings[lang](X))
if self.lLearnable_embeddings[lang]:
input_list.append(self.lLearnable_embeddings[lang](X))
return torch.cat(tensors=input_list, dim=2)
def embedding_dropout(self, X, drop_range, p_drop=0.5, training=True):
if p_drop > 0 and training and drop_range is not None:
p = p_drop
drop_from, drop_to = drop_range
m = drop_to - drop_from # length of the supervised embedding
l = X.shape[2] # total embedding length
corr = (1 - p)
X[:, :, drop_from:drop_to] = corr * F.dropout(X[:, :, drop_from:drop_to], p=p)
X /= (1 - (p * m / l))
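            # Note: F.dropout already rescales the surviving components by 1/(1-p); multiplying by
            # corr=(1-p) undoes that, so the supervised range is only zeroed (not rescaled). The final
            # division by (1 - p*m/l) then compensates for the expected fraction of zeroed components
            # over the whole embedding (e.g., with p=0.5, m=73, l=373 the divisor is ~0.902).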
return X
def configure_optimizers(self):
optimizer = AdamW(self.parameters(), lr=1e-3)
scheduler = StepLR(optimizer, step_size=25, gamma=0.5)
return [optimizer], [scheduler]
@staticmethod
def _reconstruct_dict(X, ly):
reconstructed = {}
_start = 0
for lang in sorted(ly.keys()):
lang_batchsize = len(ly[lang])
reconstructed[lang] = X[_start:_start+lang_batchsize]
_start += lang_batchsize
return reconstructed

View File

@ -1,11 +0,0 @@
import pandas as pd
import numpy as np
# df = pd.read_csv("/home/andreapdr/funneling_pdr/src/results/final_results.csv", delimiter='\t')
df = pd.read_csv("10run_rcv_final_results.csv", delimiter='\t')
pivot = pd.pivot_table(df, values=['macrof1', 'microf1', 'macrok', 'microk'], index=['method', 'id', 'optimp', 'zscore', 'l2', 'wescaler', 'pca', 'sif'], aggfunc=[np.mean, np.std])
with pd.option_context('display.max_rows', None):
print(pivot.round(3))
print('Finished ...')

View File

@ -1,6 +1,7 @@
import numpy as np
from sklearn.decomposition import TruncatedSVD
def get_weighted_average(We, x, w):
"""
Compute the weighted average vectors
@ -15,6 +16,7 @@ def get_weighted_average(We, x, w):
emb[i,:] = w[i,:].dot(We[x[i,:],:]) / np.count_nonzero(w[i,:])
return emb
def compute_pc(X,npc=1):
"""
Compute the principal components.
@ -26,6 +28,7 @@ def compute_pc(X,npc=1):
svd.fit(X)
return svd.components_
def remove_pc(X, npc=1):
"""
Remove the projection on the principal components
@ -34,7 +37,7 @@ def remove_pc(X, npc=1):
:return: XX[i, :] is the data point after removing its projection
"""
pc = compute_pc(X, npc)
if npc==1:
if npc == 1:
XX = X - X.dot(pc.transpose()) * pc
else:
XX = X - X.dot(pc.transpose()).dot(pc)

562
src/util/common.py Executable file → Normal file
View File

@ -1,12 +1,4 @@
import subprocess
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from embeddings.supervised import get_supervised_embeddings
# from learning.transformers import PosteriorProbabilitiesEmbedder, TfidfVectorizerMultilingual
import numpy as np
from tqdm import tqdm
import torch
warnings.filterwarnings("ignore", category=DeprecationWarning)
@ -107,201 +99,101 @@ class Index:
devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True
)
print(f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}')
from src.util.embeddings_manager import supervised_embeddings_tfidf
def get_word_list(self):
def extract_word_list(word2index):
return [w for w,i in sorted(word2index.items(), key=lambda x: x[1])]
word_list = extract_word_list(self.word2index)
word_list += extract_word_list(self.out_of_vocabulary)
return word_list
class TfidfVectorizerMultilingual:
def compose_embedding_matrix(self, pretrained, supervised, Xtr=None, Ytr=None):
print(f'[generating embedding matrix for lang {self.lang}]')
def __init__(self, **kwargs):
self.kwargs = kwargs
self.wce_range = None
embedding_parts = []
def fit(self, lX, ly=None):
self.langs = sorted(lX.keys())
self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs}
return self
if pretrained is not None:
print('\t[pretrained-matrix]')
word_list = self.get_word_list()
muse_embeddings = pretrained.extract(word_list)
embedding_parts.append(muse_embeddings)
del pretrained
def transform(self, lX):
return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs}
if supervised:
print('\t[supervised-matrix]')
F = get_supervised_embeddings(Xtr, Ytr, reduction=None, method='dotn')
num_missing_rows = self.vocabsize - F.shape[0]
F = np.vstack((F, np.zeros(shape=(num_missing_rows, F.shape[1]))))
F = torch.from_numpy(F).float()
def fit_transform(self, lX, ly=None):
return self.fit(lX, ly).transform(lX)
offset = 0
if embedding_parts:
offset = embedding_parts[0].shape[1]
self.wce_range = [offset, offset + F.shape[1]]
def vocabulary(self, l=None):
if l is None:
return {l: self.vectorizer[l].vocabulary_ for l in self.langs}
else:
return self.vectorizer[l].vocabulary_
embedding_parts.append(F)
def get_analyzer(self, l=None):
if l is None:
return {l: self.vectorizer[l].build_analyzer() for l in self.langs}
else:
return self.vectorizer[l].build_analyzer()
make_dumps = False
if make_dumps:
print(f'Dumping Embedding Matrices ...')
import pickle
with open(f'../dumps/dump_{self.lang}_rcv.pkl', 'wb') as outfile:
pickle.dump((self.lang, embedding_parts, self.word2index), outfile)
with open(f'../dumps/corpus_{self.lang}_rcv.pkl', 'wb') as outfile2:
pickle.dump((self.lang, self.devel_raw, self.devel_target), outfile2)
self.embedding_matrix = torch.cat(embedding_parts, dim=1)
print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]')
def _normalize(lX, l2=True):
return {lang: normalize(X) for lang, X in lX.items()} if l2 else lX
def none_dict(langs):
return {l:None for l in langs}
return {l: None for l in langs}
class MultilingualIndex:
def __init__(self): #, add_language_trace=False):
def __init__(self):
"""
Class that contains monolingual Indexes
"""
self.l_index = {}
self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
# self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True, max_features=25000)
# self.add_language_trace=add_language_trace}
def index(self, l_devel_raw, l_devel_target, l_test_raw, l_pretrained_vocabulary):
def index(self, l_devel_raw, l_devel_target, l_test_raw, l_test_target, l_pretrained_vocabulary=None):
self.langs = sorted(l_devel_raw.keys())
#build the vocabularies
self.l_vectorizer.fit(l_devel_raw)
l_vocabulary = self.l_vectorizer.vocabulary()
l_analyzer = self.l_vectorizer.get_analyzer()
if l_pretrained_vocabulary is None:
l_pretrained_vocabulary = none_dict(self.langs)
for l in self.langs:
self.l_index[l] = Index(l_devel_raw[l], l_devel_target[l], l_test_raw[l], l)
self.l_index[l].index(l_pretrained_vocabulary[l], l_analyzer[l], l_vocabulary[l])
for lang in self.langs:
# Init monolingual Index
self.l_index[lang] = Index(l_devel_raw[lang], l_devel_target[lang], l_test_raw[lang], l_test_target[lang],
lang)
# call to index() function of monolingual Index
self.l_index[lang].index(l_pretrained_vocabulary[lang], l_analyzer[lang], l_vocabulary[lang])
def train_val_split(self, val_prop=0.2, max_val=2000, seed=42):
for l,index in self.l_index.items():
for l, index in self.l_index.items():
index.train_val_split(val_prop, max_val, seed=seed)
def embedding_matrices(self, lpretrained, supervised):
"""
Extracts from the pretrained embeddings the words found in the training dataset; then, for each language,
calls the respective monolingual Index to build its embedding matrix (if supervised, the WCEs are
concatenated to the unsupervised vectors).
:param lpretrained: dict {lang : matrix of word-embeddings }
:param supervised: bool, whether to deploy Word-Class Embeddings or not
:return: self
"""
lXtr = self.get_lXtr() if supervised else none_dict(self.langs)
lYtr = self.l_train_target() if supervised else none_dict(self.langs)
for l,index in self.l_index.items():
index.compose_embedding_matrix(lpretrained[l], supervised, lXtr[l], lYtr[l])
lWordList = self.get_wordlist()
lExtracted = lpretrained.extract(lWordList)
for lang, index in self.l_index.items():
# if supervised, concatenate the pretrained (unsupervised) embedding matrix
# with the supervised word-class embeddings
index.compose_embedding_matrix(lExtracted[lang], supervised, lXtr[lang], lYtr[lang])
self.sup_range = index.wce_range
return self
# TODO circular import with transformers --> when generating posterior prob, we import PosteriorProbabilitiesEmbedder which is defined in transformers
# def posterior_probabilities(self, max_training_docs_by_lang=5000, store_posteriors=False, stored_post=False):
# # choose a maximum of "max_training_docs_by_lang" for training the calibrated SVMs
# timeit = time.time()
# lXtr = {l:Xtr for l,Xtr in self.get_lXtr().items()}
# lYtr = {l:Ytr for l,Ytr in self.l_train_target().items()}
# if not stored_post:
# for l in self.langs:
# n_elements = lXtr[l].shape[0]
# if n_elements > max_training_docs_by_lang:
# choice = np.random.permutation(n_elements)[:max_training_docs_by_lang]
# lXtr[l] = lXtr[l][choice]
# lYtr[l] = lYtr[l][choice]
#
# # train the posterior probabilities embedder
# print('[posteriors] training a calibrated SVM')
# learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto')
# prob_embedder = PosteriorProbabilitiesEmbedder(learner, l2=False)
# prob_embedder.fit(lXtr, lYtr)
#
# # transforms the training, validation, and test sets into posterior probabilities
# print('[posteriors] generating posterior probabilities')
# lPtr = prob_embedder.transform(self.get_lXtr())
# lPva = prob_embedder.transform(self.get_lXva())
# lPte = prob_embedder.transform(self.get_lXte())
# # NB: Check splits indices !
# if store_posteriors:
# import pickle
# with open('../dumps/posteriors_fulljrc.pkl', 'wb') as outfile:
# pickle.dump([lPtr, lPva, lPte], outfile)
# print(f'Successfully dumped posteriors!')
# else:
# import pickle
# with open('../dumps/posteriors_fulljrc.pkl', 'rb') as infile:
# lPtr, lPva, lPte = pickle.load(infile)
# print(f'Successfully loaded stored posteriors!')
# print(f'[posteriors] done in {time.time() - timeit}')
# return lPtr, lPva, lPte
def bert_embeddings(self, bert_path, max_len=512, batch_size=64, stored_embeddings=False):
show_gpu('GPU memory before initializing mBert model:')
# TODO: load dumped embeddings?
from experiment_scripts.main_mbert_extractor import do_tokenization, ExtractorDataset, DataLoader
from transformers import BertConfig, BertForSequenceClassification
print('[mBERT] generating mBERT doc embeddings')
lXtr_raw = self.get_raw_lXtr()
lXva_raw = self.get_raw_lXva()
lXte_raw = self.get_raw_lXte()
print('# Tokenizing datasets')
l_tokenized_tr = do_tokenization(lXtr_raw, max_len=max_len, verbose=False)
tr_dataset = ExtractorDataset(l_tokenized_tr)
tr_lang_ids = tr_dataset.lang_ids
tr_dataloader = DataLoader(tr_dataset, batch_size=batch_size, shuffle=False)
l_tokenized_va = do_tokenization(lXva_raw, max_len=max_len, verbose=False)
va_dataset = ExtractorDataset(l_tokenized_va)
va_lang_ids = va_dataset.lang_ids
va_dataloader = DataLoader(va_dataset, batch_size=batch_size, shuffle=False)
l_tokenized_te = do_tokenization(lXte_raw, max_len=max_len, verbose=False)
te_dataset = ExtractorDataset(l_tokenized_te)
te_lang_ids = te_dataset.lang_ids
te_dataloader = DataLoader(te_dataset, batch_size=batch_size, shuffle=False)
num_labels = self.l_index[self.langs[0]].val_target.shape[1]
config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True,
num_labels=num_labels)
model = BertForSequenceClassification.from_pretrained(bert_path,
config=config).cuda()
print('# Extracting document embeddings')
tr_bert_embeddings, id2lang_tr = self.do_bert_embeddings(model, tr_dataloader, tr_lang_ids, verbose=False)
va_bert_embeddings, id2lang_va = self.do_bert_embeddings(model, va_dataloader, va_lang_ids, verbose=False)
te_bert_embeddings, id2lang_te = self.do_bert_embeddings(model, te_dataloader, te_lang_ids, verbose=False)
show_gpu('GPU memory after mBert model:')
# Freeing GPU's memory
import gc
del model, tr_dataloader, va_dataloader, te_dataloader
gc.collect()
torch.cuda.empty_cache()
show_gpu('GPU memory after clearing cache:')
return tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings
@staticmethod
def do_bert_embeddings(model, data, lang_ids, verbose=True):
if verbose:
print('# Feature Extractor Mode...')
all_batch_embeddings = {}
id2lang = {v: k for k, v in lang_ids.items()}
with torch.no_grad():
for batch, lang_idx in data:
out = model(batch.cuda())
last_hidden_state = out[1][-1]
batch_embeddings = last_hidden_state[:, 0, :]
for i, l_idx in enumerate(lang_idx.numpy()):
if id2lang[l_idx] not in all_batch_embeddings.keys():
all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy()
else:
all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]],
batch_embeddings[i].detach().cpu().numpy()))
return all_batch_embeddings, id2lang
def get_wordlist(self):
wordlist = {}
for lang, index in self.l_index.items():
wordlist[lang] = index.get_word_list()
return wordlist
def get_raw_lXtr(self):
lXtr_raw = {k:[] for k in self.langs}
lXtr_raw = {k: [] for k in self.langs}
lYtr_raw = {k: [] for k in self.langs}
for lang in self.langs:
lXtr_raw[lang] = self.l_index[lang].train_raw
@ -337,11 +229,14 @@ class MultilingualIndex:
self.lXte = self.l_vectorizer.transform({l: index.test_raw for l, index in self.l_index.items()})
return self.lXte
def get_target_dim(self):
return self.l_index[self.langs[0]].devel_target.shape[1]
def l_vocabsize(self):
return {l:index.vocabsize for l,index in self.l_index.items()}
return {l: index.vocabsize for l, index in self.l_index.items()}
def l_embeddings(self):
return {l:index.embedding_matrix for l,index in self.l_index.items()}
return {l: index.embedding_matrix for l, index in self.l_index.items()}
def l_pad(self):
return {l: index.pad_index for l, index in self.l_index.items()}
@ -349,15 +244,30 @@ class MultilingualIndex:
def l_train_index(self):
return {l: index.train_index for l, index in self.l_index.items()}
def l_train_raw_index(self):
return {l: index.train_raw for l, index in self.l_index.items()}
def l_train_target(self):
return {l: index.train_target for l, index in self.l_index.items()}
def l_val_index(self):
return {l: index.val_index for l, index in self.l_index.items()}
def l_val_raw_index(self):
return {l: index.val_raw for l, index in self.l_index.items()}
def l_test_raw_index(self):
return {l: index.test_raw for l, index in self.l_index.items()}
def l_devel_raw_index(self):
return {l: index.devel_raw for l, index in self.l_index.items()}
def l_val_target(self):
return {l: index.val_target for l, index in self.l_index.items()}
def l_test_target(self):
return {l: index.test_target for l, index in self.l_index.items()}
def l_test_index(self):
return {l: index.test_index for l, index in self.l_index.items()}
@ -373,161 +283,179 @@ class MultilingualIndex:
def l_val(self):
return self.l_val_index(), self.l_val_target()
def l_test(self):
return self.l_test_index(), self.l_test_target()
class Batch:
def __init__(self, batchsize, batches_per_epoch, languages, lpad, max_pad_length=500):
self.batchsize = batchsize
self.batches_per_epoch = batches_per_epoch
self.languages = languages
self.lpad=lpad
self.max_pad_length=max_pad_length
self.init_offset()
def l_train_raw(self):
return self.l_train_raw_index(), self.l_train_target()
def init_offset(self):
self.offset = {lang: 0 for lang in self.languages}
def l_val_raw(self):
return self.l_val_raw_index(), self.l_val_target()
def batchify(self, l_index, l_post, l_bert, llabels): # TODO: add bert embedding here...
langs = self.languages
l_num_samples = {l:len(l_index[l]) for l in langs}
def l_test_raw(self):
return self.l_test_raw_index(), self.l_test_target()
max_samples = max(l_num_samples.values())
n_batches = max_samples // self.batchsize + 1 * (max_samples % self.batchsize > 0)
if self.batches_per_epoch != -1 and self.batches_per_epoch < n_batches:
n_batches = self.batches_per_epoch
def l_devel_raw(self):
return self.l_devel_raw_index(), self.l_devel_target()
for b in range(n_batches):
for lang in langs:
index, labels = l_index[lang], llabels[lang]
offset = self.offset[lang]
if offset >= l_num_samples[lang]:
offset = 0
limit = offset+self.batchsize
batch_slice = slice(offset, limit)
batch = index[batch_slice]
batch_labels = labels[batch_slice].toarray()
post = None
if l_post is not None:
post = torch.FloatTensor(l_post[lang][batch_slice]).cuda()
bert_emb = None
if l_bert is not None:
bert_emb = torch.FloatTensor(l_bert[lang][batch_slice]).cuda()
batch = pad(batch, pad_index=self.lpad[lang], max_pad_length=self.max_pad_length)
batch = torch.LongTensor(batch).cuda()
target = torch.FloatTensor(batch_labels).cuda()
self.offset[lang] = limit
yield batch, post, bert_emb, target, lang
def get_l_pad_index(self):
return {l: index.get_pad_index() for l, index in self.l_index.items()}
def batchify(l_index, l_post, llabels, batchsize, lpad, max_pad_length=500):
langs = sorted(l_index.keys())
nsamples = max([len(l_index[l]) for l in langs])
nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0)
for b in range(nbatches):
for lang in langs:
index, labels = l_index[lang], llabels[lang]
class Index:
def __init__(self, devel_raw, devel_target, test_raw, test_target, lang):
"""
Monolingual Index, takes care of tokenizing raw data, converting strings to ids, splitting the data into
training and validation.
:param devel_raw: list of strings, list of raw training texts
:param devel_target:
:param test_raw: list of strings, list of raw test texts
:param test_target: target labels of the test documents
:param lang: str, the language of this monolingual index
"""
self.lang = lang
self.devel_raw = devel_raw
self.devel_target = devel_target
self.test_raw = test_raw
self.test_target = test_target
if b * batchsize >= len(index):
continue
batch = index[b*batchsize:(b+1)*batchsize]
batch_labels = labels[b*batchsize:(b+1)*batchsize].toarray()
post = None
if l_post is not None:
post = torch.FloatTensor(l_post[lang][b*batchsize:(b+1)*batchsize]).cuda()
batch = pad(batch, pad_index=lpad[lang], max_pad_length=max_pad_length)
batch = torch.LongTensor(batch)
target = torch.FloatTensor(batch_labels)
yield batch.cuda(), post, target.cuda(), lang
def index(self, pretrained_vocabulary, analyzer, vocabulary):
self.word2index = dict(vocabulary)
known_words = set(self.word2index.keys())
if pretrained_vocabulary is not None:
known_words.update(pretrained_vocabulary)
self.word2index['UNKTOKEN'] = len(self.word2index)
self.word2index['PADTOKEN'] = len(self.word2index)
self.unk_index = self.word2index['UNKTOKEN']
self.pad_index = self.word2index['PADTOKEN']
# index documents and keep track of test terms outside the development vocabulary that are in Muse (if available)
self.out_of_vocabulary = dict()
self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index,
self.out_of_vocabulary)
self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index,
self.out_of_vocabulary)
self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary)
print(f'[indexing complete for lang {self.lang}] vocabulary-size={self.vocabsize}')
def get_pad_index(self):
return self.pad_index
def train_val_split(self, val_prop, max_val, seed):
devel = self.devel_index
target = self.devel_target
devel_raw = self.devel_raw
val_size = int(min(len(devel) * val_prop, max_val))
self.train_index, self.val_index, self.train_target, self.val_target, self.train_raw, self.val_raw = \
train_test_split(
devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True)
print(
f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}')
def get_word_list(self):
def extract_word_list(word2index):
return [w for w, i in sorted(word2index.items(), key=lambda x: x[1])]
word_list = extract_word_list(self.word2index)
word_list += extract_word_list(self.out_of_vocabulary)
return word_list
def compose_embedding_matrix(self, pretrained, supervised, Xtr=None, Ytr=None):
print(f'[generating embedding matrix for lang {self.lang}]')
self.wce_range = None
embedding_parts = []
if pretrained is not None:
print('\t[pretrained-matrix]')
embedding_parts.append(pretrained)
del pretrained
if supervised:
print('\t[supervised-matrix]')
F = supervised_embeddings_tfidf(Xtr, Ytr)
num_missing_rows = self.vocabsize - F.shape[0]
F = np.vstack((F, np.zeros(shape=(num_missing_rows, F.shape[1]))))
F = torch.from_numpy(F).float()
offset = 0
if embedding_parts:
offset = embedding_parts[0].shape[1]
self.wce_range = [offset, offset + F.shape[1]]
embedding_parts.append(F)
self.embedding_matrix = torch.cat(embedding_parts, dim=1)
print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]')
def batchify_unlabelled(index_list, batchsize, pad_index, max_pad_length=500):
nsamples = len(index_list)
nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0)
for b in range(nbatches):
batch = index_list[b*batchsize:(b+1)*batchsize]
batch = pad(batch, pad_index=pad_index, max_pad_length=max_pad_length)
batch = torch.LongTensor(batch)
yield batch.cuda()
def clip_gradient(model, clip_value=1e-1):
params = list(filter(lambda p: p.grad is not None, model.parameters()))
for p in params:
p.grad.data.clamp_(-clip_value, clip_value)
def predict(logits, classification_type='multilabel'):
if classification_type == 'multilabel':
prediction = torch.sigmoid(logits) > 0.5
elif classification_type == 'singlelabel':
prediction = torch.argmax(logits, dim=1).view(-1, 1)
else:
raise ValueError(f'unknown classification type {classification_type}')
return prediction.detach().cpu().numpy()
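# Hedged example of the multilabel thresholding performed by predict() (synthetic logits;
# torch is already imported above): sigmoid(logit) > 0.5 is equivalent to logit > 0, so
# positive logits become positive decisions.
example_logits = torch.tensor([[2.0, -1.0, 0.3], [-0.5, 0.1, -3.0]])
example_hard = predict(example_logits, classification_type='multilabel')
# example_hard == [[ True, False,  True], [False,  True, False]]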
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
def show_gpu(msg):
def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
"""
ref: https://discuss.pytorch.org/t/access-gpu-memory-usage-in-pytorch/3192/4
Indexes (i.e., replaces word strings with numerical indexes) a list of string documents
:param data: list of string documents
:param vocab: a fixed mapping [str]->[int] of words to indexes
:param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be retained
because they are anyway contained in a pre-trained embedding set that we know in advance)
:param analyzer: the preprocessor in charge of transforming the document string into a chain of string words
:param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep
:param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those words that
are not in the original vocab but that are in the known_words
:return:
"""
def query(field):
return (subprocess.check_output(
['nvidia-smi', f'--query-gpu={field}',
'--format=csv,nounits,noheader'],
encoding='utf-8'))
def to_int(result):
return int(result.strip().split('\n')[0])
used = to_int(query('memory.used'))
total = to_int(query('memory.total'))
pct = used / total
print('\n' + msg, f'{100 * pct:2.1f}% ({used} out of {total})')
indexes = []
vocabsize = len(vocab)
unk_count = 0
knw_count = 0
out_count = 0
# pbar = tqdm(data, desc=f'indexing')
for text in data:
words = analyzer(text)
index = []
for word in words:
if word in vocab:
idx = vocab[word]
else:
if word in known_words:
if word not in out_of_vocabulary:
out_of_vocabulary[word] = vocabsize + len(out_of_vocabulary)
idx = out_of_vocabulary[word]
out_count += 1
else:
idx = unk_index
unk_count += 1
index.append(idx)
indexes.append(index)
knw_count += len(index)
# pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]'
# f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]')
return indexes
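# Hedged usage sketch of index() with toy data (vocabulary, analyzer and documents are
# made up; the real analyzer comes from the tf-idf vectorizer): in-vocabulary words keep
# their id, words known only to the pretrained embeddings get incremental
# out-of-vocabulary ids starting at len(vocab), and everything else maps to unk_index.
toy_vocab = {'rates': 0, 'rise': 1, 'UNKTOKEN': 2, 'PADTOKEN': 3}
toy_known = set(toy_vocab) | {'ecb'}            # 'ecb' exists only in the pretrained vectors
toy_oov = {}
toy_docs = ['rates rise', 'ecb cuts rates']
toy_idx = index(toy_docs, toy_vocab, toy_known, analyzer=str.split, unk_index=2,
                out_of_vocabulary=toy_oov)
# toy_idx == [[0, 1], [4, 2, 0]] and toy_oov == {'ecb': 4}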
class TfidfVectorizerMultilingual:
def __init__(self, **kwargs):
self.kwargs = kwargs
def fit(self, lX, ly=None):
self.langs = sorted(lX.keys())
self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs}
return self
def transform(self, lX):
return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs}
def fit_transform(self, lX, ly=None):
return self.fit(lX, ly).transform(lX)
def vocabulary(self, l=None):
if l is None:
return {l: self.vectorizer[l].vocabulary_ for l in self.langs}
else:
return self.vectorizer[l].vocabulary_
def get_analyzer(self, l=None):
if l is None:
return {l: self.vectorizer[l].build_analyzer() for l in self.langs}
else:
return self.vectorizer[l].build_analyzer()
def is_true(tensor, device):
return torch.where(tensor == 1, torch.Tensor([1]).to(device), torch.Tensor([0]).to(device))
def get_learner(calibrate=False, kernel='linear', C=1):
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=C, random_state=1, gamma='auto', verbose=False)
def is_false(tensor, device):
return torch.where(tensor == 0, torch.Tensor([1]).to(device), torch.Tensor([0]).to(device))
def define_pad_length(index_list):
lengths = [len(index) for index in index_list]
return int(np.mean(lengths) + np.std(lengths))
def pad(index_list, pad_index, max_pad_length=None):
pad_length = np.max([len(index) for index in index_list])
if max_pad_length is not None:
pad_length = min(pad_length, max_pad_length)
for i, indexes in enumerate(index_list):
index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length]
return index_list
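# Hedged example of pad() with a toy batch: sequences are left-padded with pad_index up
# to the longest length in the batch (capped at max_pad_length) and longer sequences are
# truncated; note that the input list is modified in place.
toy_batch = [[5, 6], [1, 2, 3, 4]]
toy_padded = pad(toy_batch, pad_index=0, max_pad_length=3)
# toy_padded == [[0, 5, 6], [1, 2, 3]]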
def get_params(optimc=False):
@ -538,20 +466,14 @@ def get_params(optimc=False):
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
def get_method_name(dataset, posteriors, supervised, pretrained, mbert, gru,
gruMUSE, gruWCE, agg, allprob):
_id = '-'
_id_conf = [posteriors, supervised, pretrained, mbert, gru]
def get_method_name(args):
_id = ''
_id_conf = [args.post_embedder, args.wce_embedder, args.muse_embedder, args.bert_embedder, args.gru_embedder]
_id_name = ['X', 'W', 'M', 'B', 'G']
for i, conf in enumerate(_id_conf):
if conf:
_id += _id_name[i]
_id = _id if not gruMUSE else _id + '_muse'
_id = _id if not gruWCE else _id + '_wce'
_id = _id if not agg else _id + '_mean'
_id = _id if not allprob else _id + '_allprob'
_dataset_path = dataset.split('/')[-1].split('_')
_id = _id if not args.rnn_wce else _id + '_wce'
_dataset_path = args.dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1]
return _id, dataset_id

View File

@ -1,60 +0,0 @@
import os
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
class CSVLog:
def __init__(self, file, columns=None, autoflush=True, verbose=False, overwrite=False):
self.file = file
self.autoflush = autoflush
self.verbose = verbose
if os.path.exists(file) and not overwrite:
self.tell('Loading existing file from {}'.format(file))
self.df = pd.read_csv(file, sep='\t')
self.columns = sorted(self.df.columns.values.tolist())
else:
self.tell('File {} does not exist or overwrite=True. Creating new frame.'.format(file))
assert columns is not None, 'columns cannot be None'
self.columns = sorted(columns)
dir = os.path.dirname(self.file)
if dir and not os.path.exists(dir): os.makedirs(dir)
self.df = pd.DataFrame(columns=self.columns)
self.defaults={}
def already_calculated(self, **kwargs):
df = self.df
if df.shape[0]==0:
return False
if len(kwargs)==0:
kwargs = self.defaults
for key,val in kwargs.items():
df = df.loc[df[key]==val]
if df.shape[0]==0: return False
return True
def set_default(self, param, value):
self.defaults[param]=value
def add_row(self, **kwargs):
for key in self.defaults.keys():
if key not in kwargs:
kwargs[key]=self.defaults[key]
colums = sorted(list(kwargs.keys()))
values = [kwargs[col_i] for col_i in colums]
s = pd.Series(values, index=self.columns)
self.df = self.df.append(s, ignore_index=True)
if self.autoflush: self.flush()
# self.tell(s.to_string())
self.tell(kwargs)
def flush(self):
self.df.to_csv(self.file, index=False, sep='\t')
def tell(self, msg):
if self.verbose: print(msg)

View File

@ -1,50 +0,0 @@
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
def run_pca(dim, X):
"""
:param dim: number of pca components to keep
:param X: dictionary str(lang): matrix
:return: dict lang: reduced matrix
"""
r = dict()
pca = PCA(n_components=dim)
for lang in X.keys():
r[lang] = pca.fit_transform(X[lang])
return r
def get_optimal_dim(X, embed_type):
"""
:param X: dict str(lang) : csr_matrix of embeddings unsupervised or supervised
:param embed_type: (str) embedding matrix type: S or U (WCE supervised or U unsupervised MUSE/FASTTEXT)
:return:
"""
_idx = []
plt.figure(figsize=(15, 10))
if embed_type == 'U':
plt.title(f'Unsupervised Embeddings {"TODO"} Explained Variance')
else:
plt.title(f'WCE Explained Variance')
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)')
for lang in X.keys():
pca = PCA(n_components=X[lang].shape[1])
pca.fit(X[lang])
_r = pca.explained_variance_ratio_
_r = np.cumsum(_r)
plt.plot(_r, label=lang)
for i in range(len(_r) - 1, 1, -1):
delta = _r[i] - _r[i - 1]
if delta > 0:
_idx.append(i)
break
best_n = max(_idx)
plt.axvline(best_n, color='r', label='optimal N')
plt.legend()
plt.show()
return best_n

View File

@ -1,71 +0,0 @@
#adapted from https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py
import torch
from transformers import BertForSequenceClassification
from time import time
from util.file import create_if_not_exist
import warnings
class EarlyStopping:
def __init__(self, model, optimizer, patience=20, verbose=True, checkpoint='./checkpoint.pt', is_bert=False):
# set patience to 0 or -1 to avoid stopping, but still keeping track of the best value and model parameters
self.patience_limit = patience
self.patience = patience
self.verbose = verbose
self.best_score = None
self.best_epoch = None
self.stop_time = None
self.checkpoint = checkpoint
self.model = model
self.optimizer = optimizer
self.STOP = False
self.is_bert = is_bert
def __call__(self, watch_score, epoch):
if self.STOP:
return
if self.best_score is None or watch_score >= self.best_score:
self.best_score = watch_score
self.best_epoch = epoch
self.stop_time = time()
if self.checkpoint:
self.print(f'[early-stop] improved, saving model in {self.checkpoint}')
if self.is_bert:
print(f'Serializing Huggingface model...')
create_if_not_exist(self.checkpoint)
self.model.save_pretrained(self.checkpoint)
else:
with warnings.catch_warnings():
warnings.simplefilter("ignore")
torch.save(self.model, self.checkpoint)
# with open(self.checkpoint)
# torch.save({'state_dict': self.model.state_dict(),
# 'optimizer_state_dict': self.optimizer.state_dict()}, self.checkpoint)
else:
self.print(f'[early-stop] improved')
self.patience = self.patience_limit
else:
self.patience -= 1
if self.patience == 0:
self.STOP = True
self.print(f'[early-stop] patience exhausted')
else:
if self.patience>0: # if negative, then early-stop is ignored
self.print(f'[early-stop] patience={self.patience}')
def reinit_counter(self):
self.STOP = False
self.patience=self.patience_limit
def restore_checkpoint(self):
print(f'restoring best model from epoch {self.best_epoch}...')
if self.is_bert:
return BertForSequenceClassification.from_pretrained(self.checkpoint)
else:
return torch.load(self.checkpoint)
def print(self, msg):
if self.verbose:
print(msg)

View File

@ -0,0 +1,104 @@
from abc import ABC, abstractmethod
import numpy as np
import torch
from torchtext.vocab import Vectors
from src.util.SIF_embed import remove_pc
class PretrainedEmbeddings(ABC):
def __init__(self):
super().__init__()
@abstractmethod
def vocabulary(self): pass
@abstractmethod
def dim(self): pass
@classmethod
def reindex(cls, words, word2index):
if isinstance(words, dict):
words = list(zip(*sorted(words.items(), key=lambda x: x[1])))[0]
source_idx, target_idx = [], []
for i, word in enumerate(words):
if word not in word2index:
continue
j = word2index[word]
source_idx.append(i)
target_idx.append(j)
source_idx = np.asarray(source_idx)
target_idx = np.asarray(target_idx)
return source_idx, target_idx
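# Hedged example of PretrainedEmbeddings.reindex with made-up vocabularies: it aligns
# the vectorizer's word ids (source) with the ids the same words have in the pretrained
# embedding matrix (target), silently skipping words without a pretrained vector.
toy_vectorizer_vocab = {'cat': 0, 'dog': 1, 'xylo': 2}
toy_pretrained_stoi = {'dog': 10, 'cat': 42}
toy_src, toy_tgt = PretrainedEmbeddings.reindex(toy_vectorizer_vocab, toy_pretrained_stoi)
# toy_src == array([0, 1]) and toy_tgt == array([42, 10]); 'xylo' is dropped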
class MuseLoader:
def __init__(self, langs, cache):
self.langs = langs
self.lEmbed = {}
self.lExtracted = {}
for lang in self.langs:
print(f'Loading vectors for {lang}...')
self.lEmbed[lang] = Vectors(f'wiki.multi.{lang}.vec', cache)
def dim(self):
return self.lEmbed[list(self.lEmbed.keys())[0]].dim
def vocabulary(self):
return {lang: set(self.lEmbed[lang].stoi.keys()) for lang in self.langs}
def extract(self, lVoc):
"""
Reindexes the loaded pretrained embeddings so that their indexes match those assigned by the scikit vectorizer. Such indexes
are consistent with those used by the Word-Class Embeddings (since we deploy the same vectorizer)
:param lVoc: dict {lang : {word : id}}
:return: torch embedding matrix of extracted embeddings i.e., words in lVoc
"""
for lang, words in lVoc.items():
print(f'Extracting words for lang {lang}...')
# words = list(zip(*sorted(lVoc[lang].items(), key=lambda x: x[1])))[0]
source_id, target_id = PretrainedEmbeddings.reindex(words, self.lEmbed[lang].stoi)
extraction = torch.zeros((len(words), self.dim()))
extraction[source_id] = self.lEmbed[lang].vectors[target_id]
self.lExtracted[lang] = extraction
return self.lExtracted
def get_lEmbeddings(self):
return {lang: self.lEmbed[lang].vectors for lang in self.langs}
def XdotM(X, M, sif):
E = X.dot(M)
if sif:
E = remove_pc(E, npc=1)
return E
def wce_matrix(X, Y):
wce = supervised_embeddings_tfidf(X, Y)
wce = zscores(wce, axis=0)
return wce
def supervised_embeddings_tfidf(X, Y):
tfidf_norm = X.sum(axis=0)
tfidf_norm[tfidf_norm == 0] = 1
F = (X.T).dot(Y) / tfidf_norm.T
return F
def zscores(X, axis=0):
"""
scipy.stats.zscore does not avoid division by 0, which can indeed occur
:param X:
:param axis:
:return:
"""
std = np.clip(np.std(X, ddof=1, axis=axis), 1e-5, None)
mean = np.mean(X, axis=axis)
return (X - mean) / std
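# Hedged end-to-end sketch with random dense data (shapes are illustrative): given a
# document-term matrix X and a (z-scored) word-embedding matrix M, XdotM averages the
# word embeddings weighted by the term weights and, with sif=True, removes the first
# principal component via the remove_pc helper imported above. Note that wce_matrix /
# supervised_embeddings_tfidf expect a scipy-sparse tf-idf matrix, so they are not
# called on dense arrays here.
import numpy as np

_rng = np.random.default_rng(0)
toy_X = _rng.random((100, 500))               # 100 documents x 500 terms (term weights)
toy_M = zscores(_rng.random((500, 50)))       # 500 terms x 50-d embeddings, z-scored per column
toy_doc_emb = XdotM(toy_X, toy_M, sif=True)   # 100 x 50 document embeddings, first PC removed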

View File

@ -1,102 +1,19 @@
# from sklearn.externals.joblib import Parallel, delayed
from joblib import Parallel, delayed
from util.metrics import *
from sklearn.metrics import f1_score
import numpy as np
import time
from src.util.metrics import *
def evaluation_metrics(y, y_):
if len(y.shape)==len(y_.shape)==1 and len(np.unique(y))>2: #single-label
raise NotImplementedError()#return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro')
else: #the metrics I implemented assume multiclass multilabel classification as binary classifiers
if len(y.shape) == len(y_.shape) == 1 and len(np.unique(y)) > 2: # single-label
raise NotImplementedError() # return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro')
else: # the metrics I implemented assume multiclass multilabel classification as binary classifiers
return macroF1(y, y_), microF1(y, y_), macroK(y, y_), microK(y, y_)
def soft_evaluation_metrics(y, y_):
if len(y.shape)==len(y_.shape)==1 and len(np.unique(y))>2: #single-label
raise NotImplementedError()#return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro')
else: #the metrics I implemented assume multiclass multilabel classification as binary classifiers
return smoothmacroF1(y, y_), smoothmicroF1(y, y_), smoothmacroK(y, y_), smoothmicroK(y, y_)
def evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1):
print('evaluation (n_jobs={})'.format(n_jobs))
if n_jobs == 1:
return {lang: metrics(ly_true[lang], ly_pred[lang]) for lang in ly_true.keys()}
else:
langs = list(ly_true.keys())
evals = Parallel(n_jobs=n_jobs)(delayed(metrics)(ly_true[lang], ly_pred[lang]) for lang in langs)
return {lang: evals[i] for i, lang in enumerate(langs)}
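# Hedged usage sketch of evaluate() with tiny synthetic predictions: both arguments are
# {lang: binary label matrix} dicts, and the result maps each language to the tuple
# (macro-F1, micro-F1, macro-K, micro-K).
import numpy as np

toy_true = {'en': np.array([[1, 0], [0, 1]]), 'it': np.array([[1, 1], [0, 0]])}
toy_pred = {'en': np.array([[1, 0], [0, 0]]), 'it': np.array([[1, 1], [0, 0]])}
toy_eval = evaluate(toy_true, toy_pred, n_jobs=1)
# toy_eval['it'] == (1.0, 1.0, 1.0, 1.0); toy_eval['en'] is penalized for the missed label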
def average_results(l_eval, show=True):
metrics = []
for lang in l_eval.keys():
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
if show:
print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
ave = np.mean(np.array(metrics), axis=0)
if show:
print('Averages: MF1, mF1, MK, mK', ave)
return ave
def evaluate_method(polylingual_method, lX, ly, predictor=None, soft=False, return_time=False):
tinit = time.time()
print('prediction for test')
assert set(lX.keys()) == set(ly.keys()), 'inconsistent dictionaries in evaluate'
n_jobs = polylingual_method.n_jobs if hasattr(polylingual_method, 'n_jobs') else -1
if predictor is None:
predictor = polylingual_method.predict
metrics = evaluation_metrics
if soft is True:
metrics = soft_evaluation_metrics
ly_ = predictor(lX, ly)
eval_ = evaluate(ly, ly_, metrics=metrics, n_jobs=n_jobs)
if return_time:
return eval_, time.time()-tinit
else:
return eval_
def evaluate_single_lang(polylingual_method, X, y, lang, predictor=None, soft=False):
print('prediction for test in a single language')
if predictor is None:
predictor = polylingual_method.predict
metrics = evaluation_metrics
if soft is True:
metrics = soft_evaluation_metrics
ly_ = predictor({lang:X})
return metrics(y, ly_[lang])
def get_binary_counters(polylingual_method, lX, ly, predictor=None):
print('prediction for test')
assert set(lX.keys()) == set(ly.keys()), 'inconsistent dictionaries in evaluate'
n_jobs = polylingual_method.n_jobs
if predictor is None:
predictor = polylingual_method.predict
ly_ = predictor(lX)
print('evaluation (n_jobs={})'.format(n_jobs))
if n_jobs == 1:
return {lang: binary_counters(ly[lang], ly_[lang]) for lang in ly.keys()}
else:
langs = list(ly.keys())
evals = Parallel(n_jobs=n_jobs)(delayed(binary_counters)(ly[lang], ly_[lang]) for lang in langs)
return {lang: evals[i] for i, lang in enumerate(langs)}
def binary_counters(y, y_):
y = np.reshape(y, (-1))
assert y.shape==y_.shape and len(y.shape)==1, 'error, binary vector expected'
counters = hard_single_metric_statistics(y, y_)
return counters.tp, counters.tn, counters.fp, counters.fn

View File

@ -1,7 +1,6 @@
import urllib
from os import listdir, makedirs
from os.path import isdir, isfile, join, exists, dirname
#from sklearn.externals.six.moves import urllib
import urllib
from pathlib import Path
@ -14,6 +13,7 @@ def download_file(url, archive_filename):
urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress)
print("")
def download_file_if_not_exists(url, archive_path):
if exists(archive_path): return
makedirs_if_not_exist(dirname(archive_path))
@ -25,20 +25,26 @@ def ls(dir, typecheck):
el.sort()
return el
def list_dirs(dir):
return ls(dir, typecheck=isdir)
def list_files(dir):
return ls(dir, typecheck=isfile)
def makedirs_if_not_exist(path):
if not exists(path): makedirs(path)
def create_if_not_exist(path):
if not exists(path): makedirs(path)
def get_parent_name(path):
return Path(path).parent
def get_file_name(path):
return Path(path).name

View File

@ -1,24 +1,12 @@
import numpy as np
import numpy as np
from scipy.sparse import lil_matrix, issparse
from sklearn.metrics import f1_score, accuracy_score
"""
Scikit learn provides a full set of evaluation metrics, but they treat special cases differently.
I.e., when the number of true positives, false positives, and false negatives amount to 0, all
affected metrics (precision, recall, and thus f1) output 0 in Scikit learn.
We adhere to the common practice of outputting 1 in this case since the classifier has correctly
classified all examples as negatives.
"""
class ContTable:
def __init__(self, tp=0, tn=0, fp=0, fn=0):
self.tp=tp
self.tn=tn
self.fp=fp
self.fn=fn
self.tp = tp
self.tn = tn
self.fp = fp
self.fn = fn
def get_d(self): return self.tp + self.tn + self.fp + self.fn
@ -57,16 +45,20 @@ class ContTable:
def __add__(self, other):
return ContTable(tp=self.tp + other.tp, tn=self.tn + other.tn, fp=self.fp + other.fp, fn=self.fn + other.fn)
def accuracy(cell):
return (cell.tp + cell.tn)*1.0 / (cell.tp + cell.fp + cell.fn + cell.tn)
def f1(cell):
num = 2.0 * cell.tp
den = 2.0 * cell.tp + cell.fp + cell.fn
if den>0: return num / den
#we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative
if den > 0:
return num / den
# we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative
return 1.0
def K(cell):
specificity, recall = 0., 0.
@ -85,45 +77,50 @@ def K(cell):
else:
return specificity + recall - 1.
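# Hedged worked example for the f1 and K helpers (assuming the standard definition of K
# used in this module): with tp=8, fp=2, fn=4, tn=86 we get
#   f1 = 2*8 / (2*8 + 2 + 4)         = 16/22           ~ 0.727
#   K  = tn/(tn+fp) + tp/(tp+fn) - 1 = 86/88 + 8/12 - 1 ~ 0.644
# and the degenerate case tp=fp=fn=0 yields f1 = 1 by the convention stated above.
toy_cell = ContTable(tp=8, tn=86, fp=2, fn=4)
# f1(toy_cell) ~ 0.727, K(toy_cell) ~ 0.644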
#computes the (hard) counters tp, fp, fn, and tn from true and predicted vectors of hard decisions
#true_labels and predicted_labels are two vectors of shape (number_documents,)
def hard_single_metric_statistics(true_labels, predicted_labels):
assert len(true_labels)==len(predicted_labels), "Format not consistent between true and predicted labels."
nd = len(true_labels)
tp = np.sum(predicted_labels[true_labels==1])
fp = np.sum(predicted_labels[true_labels == 0])
fn = np.sum(true_labels[predicted_labels == 0])
tn = nd - (tp+fp+fn)
return ContTable(tp=tp, tn=tn, fp=fp, fn=fn)
#computes the (soft) contingency table where tp, fp, fn, and tn are the cumulative masses for the posterior
# if the classifier is single class, then the prediction is a vector of shape=(nD,) which causes issues when compared
# to the true labels (of shape=(nD,1)). This method increases the dimensions of the predictions.
def __check_consistency_and_adapt(true_labels, predictions):
if predictions.ndim == 1:
return __check_consistency_and_adapt(true_labels, np.expand_dims(predictions, axis=1))
if true_labels.ndim == 1:
return __check_consistency_and_adapt(np.expand_dims(true_labels, axis=1), predictions)
if true_labels.shape != predictions.shape:
raise ValueError("True and predicted label matrices shapes are inconsistent %s %s."
% (true_labels.shape, predictions.shape))
_, nC = true_labels.shape
return true_labels, predictions, nC
# computes the (soft) contingency table where tp, fp, fn, and tn are the cumulative masses for the posterior
# probabilities with respect to the true binary labels
#true_labels and posterior_probabilities are two vectors of shape (number_documents,)
# true_labels and posterior_probabilities are two vectors of shape (number_documents,)
def soft_single_metric_statistics(true_labels, posterior_probabilities):
assert len(true_labels)==len(posterior_probabilities), "Format not consistent between true and predicted labels."
assert len(true_labels) == len(posterior_probabilities), "Format not consistent between true and predicted labels."
tp = np.sum(posterior_probabilities[true_labels == 1])
fn = np.sum(1. - posterior_probabilities[true_labels == 1])
fp = np.sum(posterior_probabilities[true_labels == 0])
tn = np.sum(1. - posterior_probabilities[true_labels == 0])
return ContTable(tp=tp, tn=tn, fp=fp, fn=fn)
#if the classifier is single class, then the prediction is a vector of shape=(nD,) which causes issues when compared
#to the true labels (of shape=(nD,1)). This method increases the dimensions of the predictions.
def __check_consistency_and_adapt(true_labels, predictions):
if predictions.ndim == 1:
return __check_consistency_and_adapt(true_labels, np.expand_dims(predictions, axis=1))
if true_labels.ndim == 1:
return __check_consistency_and_adapt(np.expand_dims(true_labels, axis=1),predictions)
if true_labels.shape != predictions.shape:
raise ValueError("True and predicted label matrices shapes are inconsistent %s %s."
% (true_labels.shape, predictions.shape))
_,nC = true_labels.shape
return true_labels, predictions, nC
# computes the (hard) counters tp, fp, fn, and tn from true and predicted vectors of hard decisions
# true_labels and predicted_labels are two vectors of shape (number_documents,)
def hard_single_metric_statistics(true_labels, predicted_labels):
assert len(true_labels) == len(predicted_labels), "Format not consistent between true and predicted labels."
nd = len(true_labels)
tp = np.sum(predicted_labels[true_labels == 1])
fp = np.sum(predicted_labels[true_labels == 0])
fn = np.sum(true_labels[predicted_labels == 0])
tn = nd - (tp+fp+fn)
return ContTable(tp=tp, tn=tn, fp=fp, fn=fn)
def macro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics):
true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels)
return np.mean([metric(metric_statistics(true_labels[:, c], predicted_labels[:, c])) for c in range(nC)])
def micro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics):
true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels)
@ -134,123 +131,22 @@ def micro_average(true_labels, predicted_labels, metric, metric_statistics=hard_
return metric(accum)
#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def macroF1(true_labels, predicted_labels):
return macro_average(true_labels,predicted_labels, f1)
#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def macroF1(true_labels, predicted_labels):
return macro_average(true_labels, predicted_labels, f1)
# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def microF1(true_labels, predicted_labels):
return micro_average(true_labels, predicted_labels, f1)
#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def macroK(true_labels, predicted_labels):
return macro_average(true_labels,predicted_labels, K)
#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def macroK(true_labels, predicted_labels):
return macro_average(true_labels, predicted_labels, K)
# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def microK(true_labels, predicted_labels):
return micro_average(true_labels, predicted_labels, K)
#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix
#of the same shape containing real values in [0,1]
def smoothmacroF1(true_labels, posterior_probabilities):
return macro_average(true_labels,posterior_probabilities, f1, metric_statistics=soft_single_metric_statistics)
#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix
#of the same shape containing real values in [0,1]
def smoothmicroF1(true_labels, posterior_probabilities):
return micro_average(true_labels, posterior_probabilities, f1, metric_statistics=soft_single_metric_statistics)
#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix
#of the same shape containing real values in [0,1]
def smoothmacroK(true_labels, posterior_probabilities):
return macro_average(true_labels,posterior_probabilities, K, metric_statistics=soft_single_metric_statistics)
#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix
#of the same shape containing real values in [0,1]
def smoothmicroK(true_labels, posterior_probabilities):
return micro_average(true_labels, posterior_probabilities, K, metric_statistics=soft_single_metric_statistics)
"""
Scikit learn provides a full set of evaluation metrics, but they treat special cases differently.
I.e., when the number of true positives, false positives, and false negatives amount to 0, all
affected metrics (precision, recall, and thus f1) output 0 in Scikit learn.
We adhere to the common practice of outputting 1 in this case since the classifier has correctly
classified all examples as negatives.
"""
def evaluation(y_true, y_pred, classification_type):
if classification_type == 'multilabel':
eval_function = multilabel_eval
elif classification_type == 'singlelabel':
eval_function = singlelabel_eval
Mf1, mf1, accuracy = eval_function(y_true, y_pred)
return Mf1, mf1, accuracy
def multilabel_eval(y, y_):
tp = y.multiply(y_)
fn = lil_matrix(y.shape)
true_ones = y==1
fn[true_ones]=1-tp[true_ones]
fp = lil_matrix(y.shape)
pred_ones = y_==1
if pred_ones.nnz>0:
fp[pred_ones]=1-tp[pred_ones]
#macro-f1
tp_macro = np.asarray(tp.sum(axis=0), dtype=int).flatten()
fn_macro = np.asarray(fn.sum(axis=0), dtype=int).flatten()
fp_macro = np.asarray(fp.sum(axis=0), dtype=int).flatten()
pos_pred = tp_macro+fp_macro
pos_true = tp_macro+fn_macro
prec=np.zeros(shape=tp_macro.shape,dtype=float)
rec=np.zeros(shape=tp_macro.shape,dtype=float)
np.divide(tp_macro, pos_pred, out=prec, where=pos_pred>0)
np.divide(tp_macro, pos_true, out=rec, where=pos_true>0)
den=prec+rec
macrof1=np.zeros(shape=tp_macro.shape,dtype=float)
np.divide(np.multiply(prec,rec),den,out=macrof1,where=den>0)
macrof1 *=2
macrof1[(pos_pred==0)*(pos_true==0)]=1
macrof1 = np.mean(macrof1)
#micro-f1
tp_micro = tp_macro.sum()
fn_micro = fn_macro.sum()
fp_micro = fp_macro.sum()
pos_pred = tp_micro + fp_micro
pos_true = tp_micro + fn_micro
prec = (tp_micro / pos_pred) if pos_pred>0 else 0
rec = (tp_micro / pos_true) if pos_true>0 else 0
den = prec+rec
microf1 = 2*prec*rec/den if den>0 else 0
if pos_pred==pos_true==0:
microf1=1
#accuracy
ndecisions = np.multiply(*y.shape)
tn = ndecisions - (tp_micro+fn_micro+fp_micro)
acc = (tp_micro+tn)/ndecisions
return macrof1,microf1,acc
def singlelabel_eval(y, y_):
if issparse(y_): y_ = y_.toarray().flatten()
macrof1 = f1_score(y, y_, average='macro')
microf1 = f1_score(y, y_, average='micro')
acc = accuracy_score(y, y_)
return macrof1,microf1,acc

View File

@ -1,91 +0,0 @@
from optparse import OptionParser
parser = OptionParser(usage="usage: %prog datapath [options]")
parser.add_option("-d", dest='dataset', type=str, metavar='datasetpath', help=f'path to the pickled dataset')
parser.add_option("-o", "--output", dest="output",
help="Result file", type=str, default='../log/multiModal_log.csv')
parser.add_option("-X", "--posteriors", dest="posteriors", action='store_true',
help="Add posterior probabilities to the document embedding representation", default=False)
parser.add_option("-W", "--supervised", dest="supervised", action='store_true',
help="Add supervised (Word-Class Embeddings) to the document embedding representation", default=False)
parser.add_option("-M", "--pretrained", dest="pretrained", action='store_true',
help="Add pretrained MUSE embeddings to the document embedding representation", default=False)
parser.add_option("-B", "--mbert", dest="mbert", action='store_true',
help="Add multilingual Bert (mBert) document embedding representation", default=False)
parser.add_option('-G', dest='gruViewGenerator', action='store_true',
help="Add document embedding generated via recurrent net (GRU)", default=False)
parser.add_option("--l2", dest="l2", action='store_true',
help="Activates l2 normalization as a post-processing for the document embedding views",
default=False)
parser.add_option("--allprob", dest="allprob", action='store_true',
help="All views are generated as posterior probabilities. This affects the supervised and pretrained"
"embeddings, for which a calibrated classifier is generated, which generates the posteriors",
default=False)
parser.add_option("--feat-weight", dest="feat_weight",
help="Term weighting function to weight the averaged embeddings", type=str, default='tfidf')
parser.add_option("-w", "--we-path", dest="we_path",
help="Path to the MUSE polylingual word embeddings", default='../embeddings')
parser.add_option("-s", "--set_c", dest="set_c", type=float,
help="Set the C parameter", default=1)
parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
help="Optimize hyperparameters", default=False)
parser.add_option("-j", "--n_jobs", dest="n_jobs", type=int,
help="Number of parallel jobs (default is -1, all)", default=-1)
parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
default=300)
parser.add_option("-r", "--remove-pc", dest="sif", action='store_true',
help="Remove common component when computing dot product of word embedding matrices", default=False)
parser.add_option("-z", "--zscore", dest="zscore", action='store_true',
help="Z-score normalize matrices (WCE and MUSE)", default=False)
parser.add_option("-a", "--agg", dest="agg", action='store_true',
help="Set aggregation function of the common Z-space to average (Default: concatenation)",
default=False)
# ------------------------------------------------------------------------------------
parser.add_option('--hidden', type=int, default=512, metavar='int',
help='hidden lstm size (default: 512)')
parser.add_option('--sup-drop', type=float, default=0.5, metavar='[0.0, 1.0]',
help='dropout probability for the supervised matrix (default: 0.5)')
parser.add_option('--tunable', action='store_true', default=False,
help='pretrained embeddings are tunable from the beginning (default False, i.e., static)')
parser.add_option('--logfile_gru', dest='logfile_gru', default='../log/log_gru_viewgenerator.csv')
parser.add_option('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)')
parser.add_option('--force', action='store_true', default=False,
help='do not check if this experiment has already been run')
parser.add_option('--gruMuse', dest='gruMUSE', action='store_true', default=False,
help='Deploy MUSE embedding as embedding layer of the GRU View Generator')
parser.add_option('--gruWce', dest='gruWCE', action='store_true', default=False,
help='Deploy WCE embedding as embedding layer of the GRU View Generator')
parser.add_option('--gru-path', dest='gru_path', default=None,
help='Set the path to a pretrained GRU model (aka, -G view generator)')
parser.add_option('--bert-path', dest='bert_path', default=None,
help='Set the path to a pretrained mBERT model (aka, -B view generator)')

141
src/util/pl_metrics.py Normal file
View File

@ -0,0 +1,141 @@
import torch
from pytorch_lightning.metrics import Metric
from src.util.common import is_false, is_true
def _update(pred, target, device):
assert pred.shape == target.shape
# preparing preds and targets for count
true_pred = is_true(pred, device)
false_pred = is_false(pred, device)
true_target = is_true(target, device)
false_target = is_false(target, device)
tp = torch.sum(true_pred * true_target, dim=0)
tn = torch.sum(false_pred * false_target, dim=0)
fp = torch.sum(true_pred * false_target, dim=0)
fn = torch.sum(false_pred * target, dim=0)
return tp, tn, fp, fn
class CustomF1(Metric):
def __init__(self, num_classes, device, average='micro'):
"""
Custom F1 metric.
Scikit learn provides a full set of evaluation metrics, but they treat special cases differently.
I.e., when the number of true positives, false positives, and false negatives amount to 0, all
affected metrics (precision, recall, and thus f1) output 0 in Scikit learn.
We adhere to the common practice of outputting 1 in this case since the classifier has correctly
classified all examples as negatives.
:param num_classes:
:param device:
:param average:
"""
super().__init__()
self.num_classes = num_classes
self.average = average
self.device = 'cuda' if device else 'cpu'
self.add_state('true_positive', default=torch.zeros(self.num_classes))
self.add_state('true_negative', default=torch.zeros(self.num_classes))
self.add_state('false_positive', default=torch.zeros(self.num_classes))
self.add_state('false_negative', default=torch.zeros(self.num_classes))
def update(self, preds, target):
true_positive, true_negative, false_positive, false_negative = _update(preds, target, self.device)
self.true_positive += true_positive
self.true_negative += true_negative
self.false_positive += false_positive
self.false_negative += false_negative
def compute(self):
if self.average == 'micro':
num = 2.0 * self.true_positive.sum()
den = 2.0 * self.true_positive.sum() + self.false_positive.sum() + self.false_negative.sum()
if den > 0:
return (num / den).to(self.device)
return torch.FloatTensor([1.]).to(self.device)
if self.average == 'macro':
class_specific = []
for i in range(self.num_classes):
class_tp = self.true_positive[i]
class_tn = self.true_negative[i]
class_fp = self.false_positive[i]
class_fn = self.false_negative[i]
num = 2.0 * class_tp
den = 2.0 * class_tp + class_fp + class_fn
if den > 0:
class_specific.append(num / den)
else:
class_specific.append(1.)
average = torch.sum(torch.Tensor(class_specific))/self.num_classes
return average.to(self.device)
class CustomK(Metric):
def __init__(self, num_classes, device, average='micro'):
"""
K metric. https://dl.acm.org/doi/10.1145/2808194.2809449
:param num_classes:
:param device:
:param average:
"""
super().__init__()
self.num_classes = num_classes
self.average = average
self.device = 'cuda' if device else 'cpu'
self.add_state('true_positive', default=torch.zeros(self.num_classes))
self.add_state('true_negative', default=torch.zeros(self.num_classes))
self.add_state('false_positive', default=torch.zeros(self.num_classes))
self.add_state('false_negative', default=torch.zeros(self.num_classes))
def update(self, preds, target):
true_positive, true_negative, false_positive, false_negative = _update(preds, target, self.device)
self.true_positive += true_positive
self.true_negative += true_negative
self.false_positive += false_positive
self.false_negative += false_negative
def compute(self):
if self.average == 'micro':
specificity, recall = 0., 0.
absolute_negatives = self.true_negative.sum() + self.false_positive.sum()
if absolute_negatives != 0:
specificity = self.true_negative.sum()/absolute_negatives
absolute_positives = self.true_positive.sum() + self.false_negative.sum()
if absolute_positives != 0:
recall = self.true_positive.sum()/absolute_positives
if absolute_positives == 0:
return 2. * specificity - 1
elif absolute_negatives == 0:
return 2. * recall - 1
else:
return specificity + recall - 1
if self.average == 'macro':
class_specific = []
for i in range(self.num_classes):
class_tp = self.true_positive[i]
class_tn = self.true_negative[i]
class_fp = self.false_positive[i]
class_fn = self.false_negative[i]
specificity, recall = 0., 0.
absolute_negatives = class_tn + class_fp
if absolute_negatives != 0:
specificity = class_tn / absolute_negatives
absolute_positives = class_tp + class_fn
if absolute_positives != 0:
recall = class_tp / absolute_positives
if absolute_positives == 0:
class_specific.append(2. * specificity - 1)
elif absolute_negatives == 0:
class_specific.append(2. * recall - 1)
else:
class_specific.append(specificity + recall - 1)
average = torch.sum(torch.Tensor(class_specific)) / self.num_classes
return average.to(self.device)
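# Hedged usage sketch (toy tensors, assuming the pytorch_lightning.metrics API imported
# above): predictions and targets are binary matrices of shape (n_docs, n_classes);
# update() accumulates per-class contingency counts and compute() returns the score.
toy_preds = torch.tensor([[1., 0., 1.], [0., 1., 0.]])
toy_target = torch.tensor([[1., 0., 0.], [0., 1., 0.]])
toy_microf1 = CustomF1(num_classes=3, device=False, average='micro')   # device=False -> cpu
toy_microf1.update(toy_preds, toy_target)
# toy_microf1.compute() == 2*2 / (2*2 + 1 + 0) = 0.8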

View File

@ -1,21 +1,21 @@
import os
import pandas as pd
import numpy as np
class PolylingualClassificationResults:
import numpy as np
import pandas as pd
class CSVlog:
def __init__(self, file, autoflush=True, verbose=False):
self.file = file
self.columns = ['method',
'learner',
'optimp',
'setting',
'optimc',
'sif',
'zscore',
'l2',
'wescaler',
'pca',
'id',
'dataset',
'time',
'time_tr',
'time_te',
'lang',
'macrof1',
'microf1',
@ -36,8 +36,11 @@ class PolylingualClassificationResults:
def already_calculated(self, id):
return (self.df['id'] == id).any()
def add_row(self, method, learner, optimp, sif, zscore, l2, wescaler, pca, id, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
s = pd.Series([method, learner, optimp,sif, zscore, l2, wescaler, pca, id, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
def add_row(self, method, setting, optimc, sif, zscore, l2, dataset, time_tr, time_te, lang,
macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
s = pd.Series([method, setting, optimc, sif, zscore, l2, dataset, time_tr, time_te, lang,
macrof1, microf1, macrok, microk, notes],
index=self.columns)
self.df = self.df.append(s, ignore_index=True)
if self.autoflush: self.flush()
self.tell(s.to_string())
@ -46,4 +49,5 @@ class PolylingualClassificationResults:
self.df.to_csv(self.file, index=False, sep='\t')
def tell(self, msg):
if self.verbose: print(msg)
if self.verbose:
print(msg)

View File

@ -1,15 +1,20 @@
import numpy as np
class StandardizeTransformer:
class StandardizeTransformer:
def __init__(self, axis=0, range=None):
"""
:param axis:
:param range:
"""
assert range is None or isinstance(range, slice), 'wrong format for range, should either be None or a slice'
self.axis = axis
self.yetfit = False
self.range = range
def fit(self, X):
print('fitting Standardizer...')
print('Applying z-score standardization...')
std=np.std(X, axis=self.axis, ddof=1)
self.std = np.clip(std, 1e-5, None)
self.mean = np.mean(X, axis=self.axis)
@ -28,4 +33,4 @@ class StandardizeTransformer:
return (X - self.mean) / self.std
def fit_transform(self, X):
return self.fit(X).transform(X)
return self.fit(X).transform(X)
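# Illustrative usage sketch: z-score a small random matrix column-wise. The data
# is made up, and it is assumed that fit() returns self, as fit_transform() implies.
import numpy as np

X = np.random.rand(10, 4)
zscorer = StandardizeTransformer(axis=0)
Xz = zscorer.fit_transform(X)
print(np.allclose(Xz.mean(axis=0), 0.), np.allclose(Xz.std(axis=0, ddof=1), 1.))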

View File

@ -1,29 +0,0 @@
from sklearn.svm import SVC
from tqdm import tqdm
import re
import sys
def mask_numbers(data, number_mask='numbermask'):
mask = re.compile(r'\b[0-9][0-9.,-]*\b')
masked = []
for text in tqdm(data, desc='masking numbers'):
masked.append(mask.sub(number_mask, text))
return masked
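# Illustrative usage sketch of mask_numbers on a couple of made-up strings;
# it should produce something like the commented output.
print(mask_numbers(['Revenue rose 12.5% to 3,400 in 2020', 'no numbers here']))
# -> ['Revenue rose numbermask% to numbermask in numbermask', 'no numbers here']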
def fill_missing_classes(lXtr, lytr):
pass
def get_learner(calibrate=False, kernel='linear'):
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto')
def get_params(dense=False):
if not op.optimc:
return None
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
kernel = 'rbf' if dense else 'linear'
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]

View File

@ -1,110 +0,0 @@
import numpy as np
import sklearn
# from sklearn.externals.joblib import Parallel, delayed
from joblib import Parallel, delayed
class ESA(object):
"""
Implementation of Explicit Semantic Analysis (ESA) in its mono-lingual version, as a transformer
"""
supported_similarity = ['dot', 'cosine']
def __init__(self, similarity='dot', centered=False, post=None):
"""
:param similarity: the similarity measure between documents to be used
:param centered: set to True to subtract the expected similarity due to randomness (experimental)
:param post: any valid sklearn normalization method to be applied to the resulting doc embeddings, or None (default)
"""
assert similarity in self.supported_similarity, ("Similarity method %s is not supported" % similarity)
self.similarity = similarity
self.centered = centered
self.post_processing = post
self.W = None
def fit(self, W):
"""
:param W: doc-by-term already processed matrix of wikipedia documents
:return: self
"""
self.W = W
return self
def transform(self, X):
"""
:param X: doc-by-term matrix that is to be transformed into the ESA space.
:return: the matrix X transformed into the ESA space in numpy format
"""
assert self.W is not None, 'transform method called before fit'
W = self.W
assert X.shape[1] == W.shape[1], ('the feature spaces for X=%s and W=%s do not agree' % (str(X.shape), str(W.shape)))
if self.similarity in ['dot', 'cosine']:
if self.similarity == 'cosine':
X = sklearn.preprocessing.normalize(X, norm='l2', axis=1, copy=True)
W = sklearn.preprocessing.normalize(W, norm='l2', axis=1, copy=True)
esa = (X.dot(W.T)).toarray()
if self.centered:
pX = (X > 0).sum(1) / float(X.shape[1])
pW = (W > 0).sum(1) / float(W.shape[1])
pXpW = np.sqrt(pX.dot(pW.transpose()))
esa = esa - pXpW
if self.post_processing:
esa = sklearn.preprocessing.normalize(esa, norm=self.post_processing, axis=1, copy=True)
return esa
def fit_transform(self, W, X, Y=None):
self.fit(W)
return self.transform(X)
def dimensionality(self):
return self.W.shape[0]
class CLESA(ESA):
"""
Implementation of Cross-Lingual Explicit Semantic Analysis (CL-ESA) as a transformer
"""
def __init__(self, similarity='dot', centered=False, post=False, n_jobs=-1):
super(CLESA, self).__init__(similarity, centered, post)
self.lESA = None
self.langs = None
self.n_jobs = n_jobs
def fit(self, lW):
"""
:param lW: a dictionary of {language: doc-by-term wiki matrix}
:return: self
"""
assert len(np.unique([W.shape[0] for W in lW.values()])) == 1, "inconsistent dimensions across languages"
self.dimensions = list(lW.values())[0].shape[0]
self.langs = list(lW.keys())
self.lESA = {lang:ESA(self.similarity, self.centered, self.post_processing).fit(lW[lang]) for lang in self.langs}
return self
def transform(self, lX):
"""
:param lX: dictionary of {language : doc-by-term matrix} that is to be transformed into the CL-ESA space
:return: a dictionary {language : doc-by-dim matrix} containing the matrix-transformed versions
"""
assert self.lESA is not None, 'transform method called before fit'
assert set(lX.keys()).issubset(set(self.langs)), 'languages in lX are not in scope'
langs = list(lX.keys())
trans = Parallel(n_jobs=self.n_jobs)(delayed(self.lESA[lang].transform)(lX[lang]) for lang in langs)
return {lang:trans[i] for i,lang in enumerate(langs)}
def fit_transform(self, lW, lX):
return self.fit(lW).transform(lX)
def languages(self):
return list(self.lESA.keys())

View File

@ -1,154 +0,0 @@
import numpy as np
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix, issparse
from scipy.spatial.distance import cosine
import operator
import functools
import math, sys
# from sklearn.externals.joblib import Parallel, delayed
from joblib import Parallel, delayed
class DistributionalCorrespondenceIndexing:
prob_dcf = ['linear', 'pmi']
vect_dcf = ['cosine']
valid_dcf = prob_dcf + vect_dcf
valid_post = ['normal', 'l2', None]
def __init__(self, dcf='cosine', post='normal', n_jobs=-1):
"""
:param dcf: a distributional correspondence function name (e.g., 'cosine') or a callable f(u,v) which measures
the distributional correspondence between vectors u and v
:param post: post-processing function to apply to document embeddings. Default is to standardize it into a
normal distribution; other functions allowed are 'l2' or None
"""
if post not in self.valid_post:
raise ValueError("unknown post processing function; valid ones are [%s]" % ', '.join(self.valid_post))
if isinstance(dcf, str):
if dcf not in self.valid_dcf:
raise ValueError("unknown dcf; use any in [%s]" % ', '.join(self.valid_dcf))
self.dcf = getattr(DistributionalCorrespondenceIndexing, dcf)
elif hasattr(dcf, '__call__'):
self.dcf = dcf
else:
raise ValueError('param dcf should either be a valid dcf name in [%s] or a callable comparing two vectors' % ', '.join(self.valid_dcf))
#self.dcf = lambda u,v:dcf(u,v)
self.post = post
self.domains = None
self.dFP = None
self.n_jobs = n_jobs
def fit(self, dU, dP):
"""
:param dU: a dictionary of {domain:dsm_matrix}, where dsm is a document-by-term matrix representing the
distributional semantic model for a specific domain
:param dP: a dictionary {domain:pivot_matrix} where domain is a string representing each domain,
and pivot_matrix has shape (d,p) with d the dimensionality of the distributional space, and p the
number of pivots
:return: self
"""
self.domains = list(dP.keys())
assert len(np.unique([P.shape[1] for P in dP.values()]))==1, "inconsistent number of pivots across domains"
assert set(dU.keys())==set(self.domains), "inconsistent domains in dU and dP"
assert not [1 for d in self.domains if dU[d].shape[0]!=dP[d].shape[0]], \
"inconsistent dimensions between distributional and pivot spaces"
self.dimensions = list(dP.values())[0].shape[1]
# embed the feature space from each domain using the pivots of that domain
#self.dFP = {d:self.dcf_dist(dU[d].transpose(), dP[d].transpose()) for d in self.domains}
transformations = Parallel(n_jobs=self.n_jobs)(delayed(self.dcf_dist)(dU[d].transpose(),dP[d].transpose()) for d in self.domains)
self.dFP = {d: transformations[i] for i, d in enumerate(self.domains)}
return self
def _dom_transform(self, X, FP):
_X = X.dot(FP)
if self.post == 'l2':
_X = normalize(_X, norm='l2', axis=1)
elif self.post == 'normal':
std = np.clip(np.std(_X, axis=0), 1e-5, None)
_X = (_X - np.mean(_X, axis=0)) / std
return _X
# dX is a dictionary of {domain:dsm}, where dsm (distributional semantic model) is, e.g., a document-by-term csr_matrix
def transform(self, dX):
assert self.dFP is not None, 'transform method called before fit'
assert set(dX.keys()).issubset(self.domains), 'domains in dX are not in scope'
domains = list(dX.keys())
transformations = Parallel(n_jobs=self.n_jobs)(delayed(self._dom_transform)(dX[d], self.dFP[d]) for d in domains)
return {d: transformations[i] for i, d in enumerate(domains)}
def fit_transform(self, dU, dP, dX):
return self.fit(dU, dP).transform(dX)
def _prevalence(self, v):
if issparse(v):
return float(v.nnz) / functools.reduce(operator.mul, v.shape, 1) #this works for arrays of any rank
elif isinstance(v, np.ndarray):
return float(v[v>0].size) / v.size
def linear(self, u, v, D):
tp, fp, fn, tn = self._get_4cellcounters(u, v, D)
den1=tp+fn
den2=tn+fp
tpr = (tp*1./den1) if den1!=0 else 0.
tnr = (tn*1./den2) if den2!=0 else 0.
return tpr + tnr - 1
def pmi(self, u, v, D):
tp, fp, fn, tn = self._get_4cellcounters(u, v, D)
Pxy = tp * 1. / D
Pxny = fp * 1. / D
Pnxy = fn * 1. / D
Px = Pxy + Pxny
Py = Pxy + Pnxy
if (Px == 0 or Py == 0 or Pxy == 0):
return 0.0
score = math.log2(Pxy / (Px * Py))
if np.isnan(score) or np.isinf(score):
print('NAN')
sys.exit()
return score
def cosine(self, u, v):
pu = self._prevalence(u)
pv = self._prevalence(v)
return cosine(u, v) - np.sqrt(pu * pv)
def _get_4cellcounters(self, u, v, D):
"""
:param u: a set of indexes with a non-zero value
:param v: a set of indexes with a non-zero value
:param D: the number of events (i.e., all possible indexes)
:return: the 4-cell contingency values (tp, fp, fn, tn)
"""
common=u.intersection(v)
tp = len(common)
fp = len(u) - len(common)
fn = len(v) - len(common)
tn = D - (tp + fp + fn)
return tp, fp, fn, tn
def dcf_dist(self, U, V):
nU,D = U.shape
nV = V.shape[0]
if issparse(U): U = U.toarray()
if issparse(V): V = V.toarray()
dists = np.zeros((nU, nV))
if self.dcf.__name__ in self.prob_dcf:
def hits_index(v):
return set(np.argwhere(v>0).reshape(-1).tolist())
Vhits = {i:hits_index(V[i]) for i in range(nV)}
for i in range(nU):
Ui_hits = hits_index(U[i])
for j in range(nV):
dists[i, j] = self.dcf(self, Ui_hits, Vhits[j], D)
else:
for i in range(nU):
for j in range(nV):
dists[i, j] = self.dcf(self, U[i], V[j])
return dists

View File

@ -1,53 +0,0 @@
import math
import numpy as np
from scipy.sparse import csr_matrix, issparse
class RandomIndexingBoC(object):
def __init__(self, latent_dimensions, non_zeros=2):
self.latent_dimensions = latent_dimensions
self.k = non_zeros
self.ri_dict = None
def fit_transform(self, X):
return self.fit(X).transform(X)
def fit(self, X):
nF = X.shape[1]
nL = self.latent_dimensions
format = 'csr' if issparse(X) else 'np'
self.ri_dict = _create_random_index_dictionary(shape=(nF, nL), k=self.k, normalized=True, format=format)
return self
def transform(self, X):
if self.ri_dict is None:
raise ValueError("Error: transform method called before fit.")
assert X.shape[1] == self.ri_dict.shape[0], 'feature space is inconsistent with the RI dictionary'
P = X.dot(self.ri_dict)
if issparse(P):
P.sort_indices()
return P
def _create_random_index_dictionary(shape, k, normalized=False, format='csr', positive=False):
assert format in ['csr', 'np'], 'Format should be in "[csr, np]"'
nF, latent_dimensions = shape
print("Creating the random index dictionary for |V|={} with {} dimensions".format(nF,latent_dimensions))
val = 1.0 if not normalized else 1.0/math.sqrt(k)
#ri_dict = csr_matrix((nF, latent_dimensions)) if format == 'csr' else np.zeros((nF, latent_dimensions))
ri_dict = np.zeros((nF, latent_dimensions))
#TODO: optimize
for t in range(nF):
dims = np.zeros(k, dtype=np.int32)
dims[0] = t % latent_dimensions  # the first dimension is chosen in a round-robin manner (prevents gaps)
dims[1:] = np.random.choice(latent_dimensions, size=k-1, replace=False)
values = (np.random.randint(0,2, size=k)*2.0-1.0) * val if not positive else np.array([+val]*k)
ri_dict[t,dims]=values
print("\rprogress [%.2f%% complete]" % (t * 100.0 / nF), end='')
print('\nDone')
if format=='csr':
ri_dict = csr_matrix(ri_dict)
return ri_dict

388
src/view_generators.py Normal file
View File

@ -0,0 +1,388 @@
"""
This module contains the view generators that take care of computing the view-specific document embeddings:
- VanillaFunGen (-x): casts document representations encoded via TF-IDF into posterior probabilities by means of SVMs.
- WordClassGen (-w): generates document representations via Word-Class Embeddings.
Document embeddings are obtained via a weighted sum of the document's constituent embeddings.
- MuseGen (-m): generates document representations via MUSE embeddings.
Document embeddings are obtained via a weighted sum of the document's constituent embeddings.
- RecurrentGen (-g): generates document embeddings by means of a Gated Recurrent Unit (GRU) network. The model can be
initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, etc.).
Output dimension is (n_docs, 512).
- BertGen (-b): generates document embeddings via the mBERT model.
"""
from abc import ABC, abstractmethod
# from time import time
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks.lr_monitor import LearningRateMonitor
from src.data.datamodule import RecurrentDataModule, BertDataModule, tokenize
from src.models.learners import *
from src.models.pl_bert import BertModel
from src.models.pl_gru import RecurrentModel
from src.util.common import TfidfVectorizerMultilingual, _normalize, index
from src.util.embeddings_manager import MuseLoader, XdotM, wce_matrix
from src.util.file import create_if_not_exist
# TODO: (1) add model checkpointing and loading from checkpoint + training on validation after convergence is reached
class ViewGen(ABC):
"""
Abstract class for ViewGenerators implementations. Every ViewGen should implement these three methods in order to
be seamlessly integrated in the overall architecture.
"""
@abstractmethod
def fit(self, lX, ly):
pass
@abstractmethod
def transform(self, lX):
pass
@abstractmethod
def fit_transform(self, lX, ly):
pass
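# Illustrative sketch: a minimal, hypothetical ViewGen showing the contract that
# the concrete generators below fulfil (fit/transform/fit_transform over dicts
# {lang: documents}). It performs no learning and returns its input unchanged.
class IdentityGen(ViewGen):
    """Pass-through view generator, useful only to illustrate the interface."""
    def fit(self, lX, ly):
        # nothing to learn
        return self

    def transform(self, lX):
        # return the language-indexed representations as-is
        return lX

    def fit_transform(self, lX, ly):
        return self.fit(lX, ly).transform(lX)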
class VanillaFunGen(ViewGen):
"""
View Generator (x): original funnelling architecture proposed by Moreo, Esuli and
Sebastiani in DOI: https://doi.org/10.1145/3326065
"""
def __init__(self, base_learner, first_tier_parameters=None, n_jobs=-1):
"""
Init Posterior Probabilities embedder (i.e., VanillaFunGen)
:param base_learner: naive monolingual learners to be deployed as first-tier learners. Should be able to
return posterior probabilities.
:param first_tier_parameters: optional hyperparameters for the first-tier learners (forwarded to the underlying
polylingual classifier); None to use the base learner as-is.
:param n_jobs: integer, number of concurrent workers
"""
super().__init__()
self.learners = base_learner
self.first_tier_parameters = first_tier_parameters
self.n_jobs = n_jobs
self.doc_projector = NaivePolylingualClassifier(base_learner=self.learners,
parameters=self.first_tier_parameters, n_jobs=self.n_jobs)
self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
def fit(self, lX, lY):
print('# Fitting VanillaFunGen (X)...')
lX = self.vectorizer.fit_transform(lX)
self.doc_projector.fit(lX, lY)
return self
def transform(self, lX):
"""
(1) Vectorize the documents; (2) project them into posterior probabilities via the first-tier learners; finally
(3) apply L2 normalization to the projection and return it.
:param lX: dict {lang: indexed documents}
:return: document projection to the common latent space.
"""
lX = self.vectorizer.transform(lX)
lZ = self.doc_projector.predict_proba(lX)
lZ = _normalize(lZ, l2=True)
return lZ
def fit_transform(self, lX, ly):
return self.fit(lX, ly).transform(lX)
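# Illustrative usage sketch: toy bilingual data with a calibrated linear SVM as base
# learner. Only the VanillaFunGen call pattern comes from the class above; documents
# and labels are made up, and it is assumed that enough examples per class are
# available for the SVM probability calibration to run. With real data, lX and ly are
# dicts {lang: raw documents} and {lang: binary label matrices}.
from sklearn.svm import SVC
import numpy as np

lX = {lang: [f'document {i} about topic {i % 2}' for i in range(20)]
      for lang in ('en', 'it')}
ly = {lang: np.array([[i % 2, 1 - (i % 2)] for i in range(20)])
      for lang in ('en', 'it')}

posterior_gen = VanillaFunGen(base_learner=SVC(kernel='linear', probability=True))
lZ = posterior_gen.fit_transform(lX, ly)  # {lang: (n_docs, n_classes) posterior matrix}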
class MuseGen(ViewGen):
"""
View Generator (m): generates document representation via MUSE embeddings (Fasttext multilingual word
embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings.
"""
def __init__(self, muse_dir='../embeddings', n_jobs=-1):
"""
Init the MuseGen.
:param muse_dir: string, path to folder containing muse embeddings
:param n_jobs: int, number of concurrent workers
"""
super().__init__()
self.muse_dir = muse_dir
self.n_jobs = n_jobs
self.langs = None
self.lMuse = None
self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
def fit(self, lX, ly):
"""
(1) Vectorize the documents; (2) load MUSE embeddings for the words encountered while vectorizing.
:param lX: dict {lang: indexed documents}
:param ly: dict {lang: target vectors}
:return: self.
"""
print('# Fitting MuseGen (M)...')
self.vectorizer.fit(lX)
self.langs = sorted(lX.keys())
self.lMuse = MuseLoader(langs=self.langs, cache=self.muse_dir)
lVoc = self.vectorizer.vocabulary()
self.lMuse = self.lMuse.extract(lVoc) # overwriting lMuse with dict {lang : embed_matrix} with only known words
# TODO: featureweight.fit
return self
def transform(self, lX):
"""
(1) Vectorize the documents; (2) compute the weighted sum of the MUSE embeddings of each document's terms;
finally (3) apply L2 normalization to the resulting embeddings and return them.
:param lX: dict {lang: indexed documents}
:return: document projection to the common latent space.
"""
lX = self.vectorizer.transform(lX)
XdotMUSE = Parallel(n_jobs=self.n_jobs)(
delayed(XdotM)(lX[lang], self.lMuse[lang], sif=True) for lang in self.langs)
lZ = {lang: XdotMUSE[i] for i, lang in enumerate(self.langs)}
lZ = _normalize(lZ, l2=True)
return lZ
def fit_transform(self, lX, ly):
return self.fit(lX, ly).transform(lX)
class WordClassGen(ViewGen):
"""
View Generator (w): generates document representation via Word-Class-Embeddings.
Document embeddings are obtained via weighted sum of document's constituent embeddings.
"""
def __init__(self, n_jobs=-1):
"""
Init WordClassGen.
:param n_jobs: int, number of concurrent workers
"""
super().__init__()
self.n_jobs = n_jobs
self.langs = None
self.lWce = None
self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
def fit(self, lX, ly):
"""
(1) Vectorize the documents; (2) compute Word-Class Embeddings for the words encountered while vectorizing.
:param lX: dict {lang: indexed documents}
:param ly: dict {lang: target vectors}
:return: self.
"""
print('# Fitting WordClassGen (W)...')
lX = self.vectorizer.fit_transform(lX)
self.langs = sorted(lX.keys())
wce = Parallel(n_jobs=self.n_jobs)(
delayed(wce_matrix)(lX[lang], ly[lang]) for lang in self.langs)
self.lWce = {l: wce[i] for i, l in enumerate(self.langs)}
# TODO: featureweight.fit()
return self
def transform(self, lX):
"""
(1) Vectorize the documents; (2) compute the weighted sum of the Word-Class Embeddings of each document's terms;
finally (3) apply L2 normalization to the resulting embeddings and return them.
:param lX: dict {lang: indexed documents}
:return: document projection to the common latent space.
"""
lX = self.vectorizer.transform(lX)
XdotWce = Parallel(n_jobs=self.n_jobs)(
delayed(XdotM)(lX[lang], self.lWce[lang], sif=True) for lang in self.langs)
lWce = {l: XdotWce[i] for i, l in enumerate(self.langs)}
lWce = _normalize(lWce, l2=True)
return lWce
def fit_transform(self, lX, ly):
return self.fit(lX, ly).transform(lX)
class RecurrentGen(ViewGen):
"""
View Generator (G): generates document embeddings by means of a Gated Recurrent Unit (GRU) network. The model can be
initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, etc.).
Output dimension is (n_docs, 512). The training will happen end-to-end. At inference time, the model returns
the network internal state at the second feed-forward layer level. Training metrics are logged via TensorBoard.
"""
def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, nepochs=50,
gpus=0, n_jobs=-1, patience=20, stored_path=None):
"""
Init RecurrentGen.
:param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents
indexed by language code.
:param pretrained_embeddings: dict {lang: tensor of embeddings}, it contains the pretrained embeddings to use
as embedding layer.
:param wce: Bool, whether to deploy Word-Class Embeddings (as proposed by A. Moreo). If True, the supervised
embeddings are concatenated to the deployed pretrained embeddings. WCE dimensionality is equal to
the number of target classes.
:param batch_size: int, number of samples in a batch.
:param nepochs: int, number of max epochs to train the model.
:param gpus: int, specifies how many GPUs to use per node. If 0, computation will take place on the CPU.
:param n_jobs: int, number of concurrent workers (i.e., parallelizing data loading).
:param patience: int, number of epochs with no improvements in val-macroF1 before early stopping.
:param stored_path: str, path to a pretrained model. If None the model will be trained from scratch.
"""
super().__init__()
self.multilingualIndex = multilingualIndex
self.langs = multilingualIndex.langs
self.batch_size = batch_size
self.gpus = gpus
self.n_jobs = n_jobs
self.stored_path = stored_path
self.nepochs = nepochs
self.patience = patience
# EMBEDDINGS to be deployed
self.pretrained = pretrained_embeddings
self.wce = wce
self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1)
self.multilingualIndex.embedding_matrices(self.pretrained, supervised=self.wce)
self.model = self._init_model()
self.logger = TensorBoardLogger(save_dir='../tb_logs', name='rnn', default_hp_metric=False)
self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00,
patience=self.patience, verbose=False, mode='max')
self.lr_monitor = LearningRateMonitor(logging_interval='epoch')
def _init_model(self):
if self.stored_path:
lpretrained = self.multilingualIndex.l_embeddings()
return RecurrentModel.load_from_checkpoint(self.stored_path, lPretrained=lpretrained)
else:
lpretrained = self.multilingualIndex.l_embeddings()
langs = self.multilingualIndex.langs
output_size = self.multilingualIndex.get_target_dim()
hidden_size = 512
lvocab_size = self.multilingualIndex.l_vocabsize()
learnable_length = 0
return RecurrentModel(
lPretrained=lpretrained,
langs=langs,
output_size=output_size,
hidden_size=hidden_size,
lVocab_size=lvocab_size,
learnable_length=learnable_length,
drop_embedding_range=self.multilingualIndex.sup_range,
drop_embedding_prop=0.5,
gpus=self.gpus
)
def fit(self, lX, ly):
"""
Train the Neural Network end-to-end.
lX and ly are not directly used. We rather get them from the multilingual index used in the instantiation
of the Dataset object (RecurrentDataset) in the RecurrentDataModule class.
:param lX: dict {lang: indexed documents}
:param ly: dict {lang: target vectors}
:return: self.
"""
print('# Fitting RecurrentGen (G)...')
create_if_not_exist(self.logger.save_dir)
recurrentDataModule = RecurrentDataModule(self.multilingualIndex, batchsize=self.batch_size, n_jobs=self.n_jobs)
trainer = Trainer(gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger, max_epochs=self.nepochs,
callbacks=[self.early_stop_callback, self.lr_monitor], checkpoint_callback=False)
# vanilla_torch_model = torch.load(
# '../_old_checkpoint/gru_viewgen_-rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle')
# self.model.linear0 = vanilla_torch_model.linear0
# self.model.linear1 = vanilla_torch_model.linear1
# self.model.linear2 = vanilla_torch_model.linear2
# self.model.rnn = vanilla_torch_model.rnn
trainer.fit(self.model, datamodule=recurrentDataModule)
trainer.test(self.model, datamodule=recurrentDataModule)
return self
def transform(self, lX):
"""
Project documents to the common latent space. Output dimensionality is 512.
:param lX: dict {lang: indexed documents}
:return: documents projected to the common latent space.
"""
data = {}
for lang in lX.keys():
indexed = index(data=lX[lang],
vocab=self.multilingualIndex.l_index[lang].word2index,
known_words=set(self.multilingualIndex.l_index[lang].word2index.keys()),
analyzer=self.multilingualIndex.l_vectorizer.get_analyzer(lang),
unk_index=self.multilingualIndex.l_index[lang].unk_index,
out_of_vocabulary=self.multilingualIndex.l_index[lang].out_of_vocabulary)
data[lang] = indexed
l_pad = self.multilingualIndex.l_pad()
self.model.to('cuda' if self.gpus else 'cpu')
self.model.eval()
l_embeds = self.model.encode(data, l_pad, batch_size=256)
return l_embeds
def fit_transform(self, lX, ly):
return self.fit(lX, ly).transform(lX)
class BertGen(ViewGen):
"""
View Generator (b): generates document embeddings via the mBERT model. The training happens end-to-end.
At inference time, the model returns the network internal state at the last original layer (i.e. 12th). Document
embeddings are the state associated with the "start" token. Training metrics are logged via TensorBoard.
"""
def __init__(self, multilingualIndex, batch_size=128, nepochs=50, gpus=0, n_jobs=-1, patience=5, stored_path=None):
"""
Init Bert model
:param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents
indexed by language code.
:param batch_size: int, number of samples per batch.
:param nepochs: int, number of max epochs to train the model.
:param gpus: int, specifies how many GPUs to use per node. If 0, computation will take place on the CPU.
:param patience: int, number of epochs with no improvements in val-macroF1 before early stopping.
:param n_jobs: int, number of concurrent workers.
:param stored_path: str, path to a pretrained model. If None the model will be trained from scratch.
"""
super().__init__()
self.multilingualIndex = multilingualIndex
self.nepochs = nepochs
self.gpus = gpus
self.batch_size = batch_size
self.n_jobs = n_jobs
self.stored_path = stored_path
self.model = self._init_model()
self.patience = patience
self.logger = TensorBoardLogger(save_dir='../tb_logs', name='bert', default_hp_metric=False)
self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00,
patience=self.patience, verbose=False, mode='max')
def _init_model(self):
output_size = self.multilingualIndex.get_target_dim()
return BertModel(output_size=output_size, stored_path=self.stored_path, gpus=self.gpus)
def fit(self, lX, ly):
"""
Train the Neural Network end-to-end.
lX and ly are not directly used. We rather get them from the multilingual index used in the instantiation
of the Dataset object in the BertDataModule class.
:param lX: dict {lang: indexed documents}
:param ly: dict {lang: target vectors}
:return: self.
"""
print('# Fitting BertGen (B)...')
create_if_not_exist(self.logger.save_dir)
self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1)
bertDataModule = BertDataModule(self.multilingualIndex, batchsize=self.batch_size, max_len=512)
trainer = Trainer(gradient_clip_val=1e-1, max_epochs=self.nepochs, gpus=self.gpus,
logger=self.logger, callbacks=[self.early_stop_callback], checkpoint_callback=False)
trainer.fit(self.model, datamodule=bertDataModule)
trainer.test(self.model, datamodule=bertDataModule)
return self
def transform(self, lX):
"""
Project documents to the common latent space. Output dimensionality is 768.
:param lX: dict {lang: indexed documents}
:return: documents projected to the common latent space.
"""
data = tokenize(lX, max_len=512)
self.model.to('cuda' if self.gpus else 'cpu')
self.model.eval()
l_embeds = self.model.encode(data, batch_size=64)
return l_embeds
def fit_transform(self, lX, ly):
# we can assume that we have already indexed data for transform() since we are first calling fit()
return self.fit(lX, ly).transform(lX)