diff --git a/main.py b/main.py new file mode 100644 index 0000000..49d450d --- /dev/null +++ b/main.py @@ -0,0 +1,190 @@ +from argparse import ArgumentParser + +from src.data.dataset_builder import MultilingualDataset +from src.funnelling import * +from src.util.common import MultilingualIndex, get_params, get_method_name +from src.util.evaluation import evaluate +from src.util.results_csv import CSVlog +from src.view_generators import * + + +def main(args): + assert args.post_embedder or args.muse_embedder or args.wce_embedder or args.gru_embedder or args.bert_embedder, \ + 'empty set of document embeddings is not allowed!' + + print('Running generalized funnelling...') + + data = MultilingualDataset.load(args.dataset) + # data.set_view(languages=['it', 'da']) + data.show_dimensions() + lX, ly = data.training() + lXte, lyte = data.test() + + # Init multilingualIndex - mandatory when deploying Neural View Generators... + if args.gru_embedder or args.bert_embedder: + multilingualIndex = MultilingualIndex() + lMuse = MuseLoader(langs=sorted(lX.keys()), cache=args.muse_dir) + multilingualIndex.index(lX, ly, lXte, lyte, l_pretrained_vocabulary=lMuse.vocabulary()) + + # Init ViewGenerators and append them to embedder_list + embedder_list = [] + if args.post_embedder: + posteriorEmbedder = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=args.n_jobs) + embedder_list.append(posteriorEmbedder) + + if args.muse_embedder: + museEmbedder = MuseGen(muse_dir=args.muse_dir, n_jobs=args.n_jobs) + embedder_list.append(museEmbedder) + + if args.wce_embedder: + wceEmbedder = WordClassGen(n_jobs=args.n_jobs) + embedder_list.append(wceEmbedder) + + if args.gru_embedder: + rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=args.rnn_wce, + batch_size=args.batch_rnn, nepochs=args.nepochs_rnn, patience=args.patience_rnn, + gpus=args.gpus, n_jobs=args.n_jobs) + embedder_list.append(rnnEmbedder) + + if args.bert_embedder: + bertEmbedder = BertGen(multilingualIndex, batch_size=args.batch_bert, nepochs=args.nepochs_bert, + patience=args.patience_bert, gpus=args.gpus, n_jobs=args.n_jobs) + bertEmbedder.transform(lX) + embedder_list.append(bertEmbedder) + + # Init DocEmbedderList (i.e., first-tier learners or view generators) and metaclassifier + docEmbedders = DocEmbedderList(embedder_list=embedder_list, probabilistic=True) + meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'), + meta_parameters=get_params(optimc=args.optimc)) + + # Init Funnelling Architecture + gfun = Funnelling(first_tier=docEmbedders, meta_classifier=meta) + + # Training --------------------------------------- + print('\n[Training Generalized Funnelling]') + time_init = time.time() + gfun.fit(lX, ly) + time_tr = round(time.time() - time_init, 3) + print(f'Training completed in {time_tr} seconds!') + + # Testing ---------------------------------------- + print('\n[Testing Generalized Funnelling]') + time_te = time.time() + ly_ = gfun.predict(lXte) + l_eval = evaluate(ly_true=lyte, ly_pred=ly_) + time_te = round(time.time() - time_te, 3) + print(f'Testing completed in {time_te} seconds!') + + # Logging --------------------------------------- + print('\n[Results]') + results = CSVlog(args.csv_dir) + metrics = [] + for lang in lXte.keys(): + macrof1, microf1, macrok, microk = l_eval[lang] + metrics.append([macrof1, microf1, macrok, microk]) + print(f'Lang {lang}: macro-F1 = {macrof1:.3f} micro-F1 = {microf1:.3f}') + if results is not None: + _id, _dataset = get_method_name(args) + 
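# one result row is logged per test language; 'setting' and 'dataset' come from get_method_name(args) +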
results.add_row(method='gfun', + setting=_id, + optimc=args.optimc, + sif='True', + zscore='True', + l2='True', + dataset=_dataset, + time_tr=time_tr, + time_te=time_te, + lang=lang, + macrof1=macrof1, + microf1=microf1, + macrok=macrok, + microk=microk, + notes='') + print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3)) + + overall_time = round(time.time() - time_init, 3) + exit(f'\nExecuted in: {overall_time} seconds!') + + +if __name__ == '__main__': + parser = ArgumentParser(description='Run generalized funnelling, A. Moreo, A. Pedrotti and F. Sebastiani') + + parser.add_argument('dataset', help='Path to the dataset') + + parser.add_argument('-o', '--output', dest='csv_dir', metavar='', + help='Result file (default csv_logs/gfun/gfun_results.csv)', type=str, + default='csv_logs/gfun/gfun_results.csv') + + parser.add_argument('-x', '--post_embedder', dest='post_embedder', action='store_true', + help='deploy posterior probabilities embedder to compute document embeddings', + default=False) + + parser.add_argument('-w', '--wce_embedder', dest='wce_embedder', action='store_true', + help='deploy (supervised) Word-Class embedder to compute document embeddings', + default=False) + + parser.add_argument('-m', '--muse_embedder', dest='muse_embedder', action='store_true', + help='deploy (pretrained) MUSE embedder to compute document embeddings', + default=False) + + parser.add_argument('-b', '--bert_embedder', dest='bert_embedder', action='store_true', + help='deploy multilingual BERT to compute document embeddings', + default=False) + + parser.add_argument('-g', '--gru_embedder', dest='gru_embedder', action='store_true', + help='deploy a GRU in order to compute document embeddings (a.k.a., RecurrentGen)', + default=False) + + parser.add_argument('-c', '--c_optimize', dest='optimc', action='store_true', + help='Optimize the SVM C hyperparameter at the meta-classifier level', + default=False) + + parser.add_argument('-j', '--n_jobs', dest='n_jobs', type=int, metavar='', + help='number of parallel jobs (default is -1, all)', + default=-1) + + parser.add_argument('--nepochs_rnn', dest='nepochs_rnn', type=int, metavar='', + help='number of max epochs to train Recurrent embedder (i.e., -g), default 150', + default=150) + + parser.add_argument('--nepochs_bert', dest='nepochs_bert', type=int, metavar='', + help='number of max epochs to train BERT model (i.e., -b), default 10', + default=10) + + parser.add_argument('--patience_rnn', dest='patience_rnn', type=int, metavar='', + help='set early stop patience for the RecurrentGen, default 25', + default=25) + + parser.add_argument('--patience_bert', dest='patience_bert', type=int, metavar='', + help='set early stop patience for the BertGen, default 5', + default=5) + + parser.add_argument('--batch_rnn', dest='batch_rnn', type=int, metavar='', + help='set batch size for the RecurrentGen, default 64', + default=64) + + parser.add_argument('--batch_bert', dest='batch_bert', type=int, metavar='', + help='set batch size for the BertGen, default 4', + default=4) + + parser.add_argument('--muse_dir', dest='muse_dir', type=str, metavar='', + help='Path to the MUSE polylingual word embeddings (default embeddings/)', + default='embeddings/') + + parser.add_argument('--rnn_wce', dest='rnn_wce', action='store_true', + help='Deploy WCE embedding as embedding layer of the RecurrentGen', + default=False) + + parser.add_argument('--rnn_dir', dest='rnn_dir', type=str, metavar='', + help='Set the path to a pretrained RNN model (i.e., -g view generator)', + default=None) + + parser.add_argument('--bert_dir', dest='bert_dir', type=str, metavar='', + help='Set the path to a pretrained mBERT model (i.e., -b view generator)', + default=None) + + parser.add_argument('--gpus', metavar='', help='specifies how many GPUs to use per node', + default=None) + + args = parser.parse_args() + main(args)
diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..d32fb61 --- /dev/null +++ b/readme.md @@ -0,0 +1,52 @@ +# Generalized Funnelling (gFun) + +## Requirements +```commandline +transformers==2.11.0 +pandas==0.25.3 +numpy==1.17.4 +joblib==0.14.0 +tqdm==4.50.2 +pytorch_lightning==1.1.2 +torch==1.3.1 +nltk==3.4.5 +scipy==1.3.3 +rdflib==4.2.2 +torchtext==0.4.0 +scikit_learn==0.24.1 +``` + +## Usage +```commandline +usage: main.py [-h] [-o CSV_DIR] [-x] [-w] [-m] [-b] [-g] [-c] [-j N_JOBS] + [--nepochs_rnn NEPOCHS_RNN] [--nepochs_bert NEPOCHS_BERT] [--patience_rnn PATIENCE_RNN] [--patience_bert PATIENCE_BERT] + [--batch_rnn BATCH_RNN] [--batch_bert BATCH_BERT] [--muse_dir MUSE_DIR] [--rnn_wce] [--rnn_dir RNN_DIR] [--bert_dir BERT_DIR] [--gpus GPUS] + dataset + +Run generalized funnelling, A. Moreo, A. Pedrotti and F. Sebastiani (2020). + +positional arguments: + dataset Path to the dataset + +optional arguments: + -h, --help show this help message and exit + -o, --output result file (default csv_logs/gfun/gfun_results.csv) + -x, --post_embedder deploy posterior probabilities embedder to compute document embeddings + -w, --wce_embedder deploy (supervised) Word-Class embedder to compute document embeddings + -m, --muse_embedder deploy (pretrained) MUSE embedder to compute document embeddings + -b, --bert_embedder deploy multilingual BERT to compute document embeddings + -g, --gru_embedder deploy a GRU in order to compute document embeddings + -c, --c_optimize optimize the SVM C hyperparameter at the meta-classifier level + -j, --n_jobs number of parallel jobs, default is -1 i.e., all + --nepochs_rnn number of max epochs to train Recurrent embedder (i.e., -g), default 150 + --nepochs_bert number of max epochs to train BERT model (i.e., -b), default 10 + --patience_rnn set early stop patience for the RecurrentGen, default 25 + --patience_bert set early stop patience for the BertGen, default 5 + --batch_rnn set batch size for the RecurrentGen, default 64 + --batch_bert set batch size for the BertGen, default 4 + --muse_dir path to the MUSE polylingual word embeddings (default embeddings/) + --rnn_wce deploy WCE embedding as embedding layer of the RecurrentGen + --rnn_dir set the path to a pretrained RNN model (i.e., -g view generator) + --bert_dir set the path to a pretrained mBERT model (i.e., -b view generator) + --gpus specifies how many GPUs to use per node +``` \ No newline at end of file
diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..4546a4a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,12 @@ +transformers==2.11.0 +pandas==0.25.3 +numpy==1.17.4 +joblib==0.14.0 +tqdm==4.50.2 +pytorch_lightning==1.1.2 +torch==1.3.1 +nltk==3.4.5 +scipy==1.3.3 +rdflib==4.2.2 +torchtext==0.4.0 +scikit_learn==0.24.1
diff --git a/run.sh b/run.sh new file mode 100644 index 0000000..fd7f4f0 --- /dev/null +++ b/run.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +python main.py /home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -g --gpus 0 + +#for i in {0..10..1} +#do +# python main.py --gpus 0 +#done \ No newline at end of file
diff --git a/src/data/datamodule.py b/src/data/datamodule.py new file mode 100644 index 0000000..66146b3 --- /dev/null +++ b/src/data/datamodule.py @@ -0,0 +1,222 @@ +import numpy as np +import pytorch_lightning as pl +import torch
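+# RecurrentDataset below flattens all languages into a single sample list and keeps per-language offsets; +# its collate_fn re-groups each batch by language, then left-pads/truncates every group with the +# language-specific pad index up to a dynamic length (min of the longest sequence and mean + std of lengths).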
+from torch.utils.data import Dataset, DataLoader +from transformers import BertTokenizer + +N_WORKERS = 8 + + +class RecurrentDataset(Dataset): + def __init__(self, lX, ly, lPad_index): + """ + :param lX: dict {lang_id : np.ndarray} + :param ly: + """ + self.lX = [] + self.ly = [] + self.lOffset = {} + self.lPad_index = lPad_index + + for lang, data in lX.items(): + offset = [len(self.lX)] + self.lX.extend(data) + offset.append(len(self.lX)) + self.lOffset[lang] = offset + + for lang, target in ly.items(): + self.ly.extend(target) + + def __len__(self): + return len(self.lX) + + def __getitem__(self, index): + X = self.lX[index] + y = self.ly[index] + return X, y, index, self._get_lang(index) + + def _get_lang(self, index): + for lang, l_range in self.lOffset.items(): + if index in range(l_range[0], l_range[1]): + return lang + + def collate_fn(self, data): + """ + Takes care of padding the batch and also check consistency of batch languages. Groups into dict {lang : lang_batch} + items sampled from the Dataset class. + :param data: + :return: + """ + lX_batch = {} + ly_batch = {} + current_lang = data[0][-1] + for d in data: + if d[-1] == current_lang: + if current_lang not in lX_batch.keys(): + lX_batch[current_lang] = [] + ly_batch[current_lang] = [] + lX_batch[current_lang].append(d[0]) + ly_batch[current_lang].append(d[1]) + else: + current_lang = d[-1] + lX_batch[current_lang] = [] + ly_batch[current_lang] = [] + lX_batch[current_lang].append(d[0]) + ly_batch[current_lang].append(d[1]) + + for lang in lX_batch.keys(): + lX_batch[lang] = self.pad(lX_batch[lang], pad_index=self.lPad_index[lang], + max_pad_length=self.define_pad_length(lX_batch[lang])) + lX_batch[lang] = torch.LongTensor(lX_batch[lang]) + ly_batch[lang] = torch.FloatTensor(ly_batch[lang]) + + return lX_batch, ly_batch + + @staticmethod + def define_pad_length(index_list): + lengths = [len(index) for index in index_list] + return int(np.mean(lengths) + np.std(lengths)) + + @staticmethod + def pad(index_list, pad_index, max_pad_length=None): + pad_length = np.max([len(index) for index in index_list]) + if max_pad_length is not None: + pad_length = min(pad_length, max_pad_length) + for i, indexes in enumerate(index_list): + index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length] + return index_list + + +class RecurrentDataModule(pl.LightningDataModule): + """ + Pytorch Lightning Datamodule to be deployed with RecurrentGen. + https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html + """ + def __init__(self, multilingualIndex, batchsize=64, n_jobs=-1): + """ + Init RecurrentDataModule. + :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents + indexed by language code. + :param batchsize: int, number of sample per batch. + :param n_jobs: int, number of concurrent workers to be deployed (i.e., parallelizing data loading). 
+ """ + self.multilingualIndex = multilingualIndex + self.batchsize = batchsize + self.n_jobs = n_jobs + super().__init__() + + def prepare_data(self, *args, **kwargs): + pass + + def setup(self, stage=None): + if stage == 'fit' or stage is None: + l_train_index, l_train_target = self.multilingualIndex.l_train() + # Debug settings: reducing number of samples + # l_train_index = {l: train[:5] for l, train in l_train_index.items()} + # l_train_target = {l: target[:5] for l, target in l_train_target.items()} + + self.training_dataset = RecurrentDataset(l_train_index, l_train_target, + lPad_index=self.multilingualIndex.l_pad()) + + l_val_index, l_val_target = self.multilingualIndex.l_val() + # Debug settings: reducing number of samples + # l_val_index = {l: train[:5] for l, train in l_val_index.items()} + # l_val_target = {l: target[:5] for l, target in l_val_target.items()} + + self.val_dataset = RecurrentDataset(l_val_index, l_val_target, + lPad_index=self.multilingualIndex.l_pad()) + if stage == 'test' or stage is None: + l_test_index, l_test_target = self.multilingualIndex.l_test() + # Debug settings: reducing number of samples + # l_test_index = {l: train[:5] for l, train in l_test_index.items()} + # l_test_target = {l: target[:5] for l, target in l_test_target.items()} + + self.test_dataset = RecurrentDataset(l_test_index, l_test_target, + lPad_index=self.multilingualIndex.l_pad()) + + def train_dataloader(self): + return DataLoader(self.training_dataset, batch_size=self.batchsize, num_workers=N_WORKERS, + collate_fn=self.training_dataset.collate_fn) + + def val_dataloader(self): + return DataLoader(self.val_dataset, batch_size=self.batchsize, num_workers=N_WORKERS, + collate_fn=self.val_dataset.collate_fn) + + def test_dataloader(self): + return DataLoader(self.test_dataset, batch_size=self.batchsize, num_workers=N_WORKERS, + collate_fn=self.test_dataset.collate_fn) + + +def tokenize(l_raw, max_len): + """ + run Bert tokenization on dict {lang: list of samples}. + :param l_raw: + :param max_len: + :return: + """ + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') + l_tokenized = {} + for lang in l_raw.keys(): + output_tokenizer = tokenizer(l_raw[lang], truncation=True, max_length=max_len, padding='max_length') + l_tokenized[lang] = output_tokenizer['input_ids'] + return l_tokenized + + +class BertDataModule(RecurrentDataModule): + """ + Pytorch Lightning Datamodule to be deployed with BertGen. + https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html + """ + def __init__(self, multilingualIndex, batchsize=64, max_len=512): + """ + Init BertDataModule. + :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents + indexed by language code. + :param batchsize: int, number of sample per batch. + :param max_len: int, max number of token per document. Absolute cap is 512. 
+ """ + super().__init__(multilingualIndex, batchsize) + self.max_len = max_len + + def setup(self, stage=None): + if stage == 'fit' or stage is None: + l_train_raw, l_train_target = self.multilingualIndex.l_train_raw() + # Debug settings: reducing number of samples + # l_train_raw = {l: train[:5] for l, train in l_train_raw.items()} + # l_train_target = {l: target[:5] for l, target in l_train_target.items()} + + l_train_index = tokenize(l_train_raw, max_len=self.max_len) + self.training_dataset = RecurrentDataset(l_train_index, l_train_target, + lPad_index=self.multilingualIndex.l_pad()) + + l_val_raw, l_val_target = self.multilingualIndex.l_val_raw() + # Debug settings: reducing number of samples + # l_val_raw = {l: train[:5] for l, train in l_val_raw.items()} + # l_val_target = {l: target[:5] for l, target in l_val_target.items()} + + l_val_index = tokenize(l_val_raw, max_len=self.max_len) + self.val_dataset = RecurrentDataset(l_val_index, l_val_target, + lPad_index=self.multilingualIndex.l_pad()) + + if stage == 'test' or stage is None: + l_test_raw, l_test_target = self.multilingualIndex.l_test_raw() + # Debug settings: reducing number of samples + # l_test_raw = {l: train[:5] for l, train in l_test_raw.items()} + # l_test_target = {l: target[:5] for l, target in l_test_target.items()} + + l_test_index = tokenize(l_test_raw, max_len=self.max_len) + self.test_dataset = RecurrentDataset(l_test_index, l_test_target, + lPad_index=self.multilingualIndex.l_pad()) + + def train_dataloader(self): + """ + NB: Setting n_workers to > 0 will cause "OSError: [Errno 24] Too many open files" + :return: + """ + return DataLoader(self.training_dataset, batch_size=self.batchsize) + + def val_dataloader(self): + return DataLoader(self.val_dataset, batch_size=self.batchsize) + + def test_dataloader(self): + return DataLoader(self.test_dataset, batch_size=self.batchsize) diff --git a/src/dataset_builder.py b/src/data/dataset_builder.py similarity index 99% rename from src/dataset_builder.py rename to src/data/dataset_builder.py index b9650c7..90760cb 100644 --- a/src/dataset_builder.py +++ b/src/data/dataset_builder.py @@ -1,19 +1,20 @@ -from os.path import join, exists -from nltk.corpus import stopwords -from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer -from sklearn.preprocessing import MultiLabelBinarizer -from data.reader.jrcacquis_reader import * -from data.languages import lang_set, NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING -from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2, fetch_topic_hierarchy -from data.text_preprocessor import NLTKStemTokenizer, preprocess_documents -import pickle -import numpy as np -from sklearn.model_selection import train_test_split -from scipy.sparse import issparse import itertools -from tqdm import tqdm import re +from os.path import exists + +import numpy as np +from nltk.corpus import stopwords from scipy.sparse import csr_matrix +from scipy.sparse import issparse +from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import MultiLabelBinarizer +from tqdm import tqdm + +from src.data.languages import NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING +from src.data.reader.jrcacquis_reader import * +from src.data.reader.rcv_reader import fetch_RCV1, fetch_RCV2 +from src.data.text_preprocessor import NLTKStemTokenizer, preprocess_documents class MultilingualDataset: diff --git a/src/data/reader/jrcacquis_reader.py 
b/src/data/reader/jrcacquis_reader.py index c0441ed..e1e3bc2 100644 --- a/src/data/reader/jrcacquis_reader.py +++ b/src/data/reader/jrcacquis_reader.py @@ -1,19 +1,22 @@ from __future__ import print_function -import os, sys -from os.path import join + +import os +import pickle +import sys import tarfile import xml.etree.ElementTree as ET -from sklearn.datasets import get_data_home -import pickle -from util.file import download_file, list_dirs, list_files +import zipfile +from collections import Counter +from os.path import join +from random import shuffle + import rdflib from rdflib.namespace import RDF, SKOS -from rdflib import URIRef -import zipfile -from data.languages import JRC_LANGS -from collections import Counter -from random import shuffle -from data.languages import lang_set +from sklearn.datasets import get_data_home + +from src.data.languages import JRC_LANGS +from src.data.languages import lang_set +from src.util.file import download_file, list_dirs, list_files """ JRC Acquis' Nomenclature: diff --git a/src/data/reader/rcv_reader.py b/src/data/reader/rcv_reader.py index cd4b416..dc2462e 100644 --- a/src/data/reader/rcv_reader.py +++ b/src/data/reader/rcv_reader.py @@ -1,15 +1,12 @@ -from zipfile import ZipFile -import xml.etree.ElementTree as ET -from data.languages import RCV2_LANGS_WITH_NLTK_STEMMING, RCV2_LANGS -from util.file import list_files -from sklearn.datasets import get_data_home -import gzip -from os.path import join, exists -from util.file import download_file_if_not_exists import re -from collections import Counter +import xml.etree.ElementTree as ET +from os.path import join, exists +from zipfile import ZipFile + import numpy as np -import sys + +from src.util.file import download_file_if_not_exists +from src.util.file import list_files """ RCV2's Nomenclature: diff --git a/src/data/reader/wikipedia_tools.py b/src/data/reader/wikipedia_tools.py index 83e11e3..6ae89ff 100644 --- a/src/data/reader/wikipedia_tools.py +++ b/src/data/reader/wikipedia_tools.py @@ -1,15 +1,17 @@ from __future__ import print_function + # import ijson # from ijson.common import ObjectBuilder -import os, sys -from os.path import join -from bz2 import BZ2File +import os import pickle -from util.file import list_dirs, list_files, makedirs_if_not_exist -from itertools import islice import re +from bz2 import BZ2File +from itertools import islice +from os.path import join from xml.sax.saxutils import escape + import numpy as np +from util.file import list_dirs, list_files policies = ["IN_ALL_LANGS", "IN_ANY_LANG"] diff --git a/src/data/text_preprocessor.py b/src/data/text_preprocessor.py index 1a6e3ae..183df56 100644 --- a/src/data/text_preprocessor.py +++ b/src/data/text_preprocessor.py @@ -1,8 +1,9 @@ -from nltk.corpus import stopwords -from data.languages import NLTK_LANGMAP from nltk import word_tokenize +from nltk.corpus import stopwords from nltk.stem import SnowballStemmer +from src.data.languages import NLTK_LANGMAP + def preprocess_documents(documents, lang): tokens = NLTKStemTokenizer(lang, verbose=True) diff --git a/src/data/tsr_function__.py b/src/data/tsr_function__.py index 0af8690..c458029 100755 --- a/src/data/tsr_function__.py +++ b/src/data/tsr_function__.py @@ -1,8 +1,9 @@ import math + import numpy as np -from scipy.stats import t from joblib import Parallel, delayed from scipy.sparse import csr_matrix, csc_matrix +from scipy.stats import t def get_probs(tpr, fpr, pc): diff --git a/src/embeddings/__init__.py b/src/embeddings/__init__.py deleted file mode 100644 index 
e69de29..0000000 diff --git a/src/embeddings/embeddings.py b/src/embeddings/embeddings.py deleted file mode 100644 index 27367e9..0000000 --- a/src/embeddings/embeddings.py +++ /dev/null @@ -1,66 +0,0 @@ -import os -from torchtext.vocab import Vectors -import torch -from abc import ABC, abstractmethod -from util.SIF_embed import * - - -class PretrainedEmbeddings(ABC): - - def __init__(self): - super().__init__() - - @abstractmethod - def vocabulary(self): pass - - @abstractmethod - def dim(self): pass - - @classmethod - def reindex(cls, words, word2index): - if isinstance(words, dict): - words = list(zip(*sorted(words.items(), key=lambda x: x[1])))[0] - - source_idx, target_idx = [], [] - for i, word in enumerate(words): - if word not in word2index: continue - j = word2index[word] - source_idx.append(i) - target_idx.append(j) - source_idx = np.asarray(source_idx) - target_idx = np.asarray(target_idx) - return source_idx, target_idx - - -class FastTextWikiNews(Vectors): - - url_base = 'Cant auto-download MUSE embeddings' - path = '../embeddings/wiki.multi.{}.vec' - _name = '/wiki.multi.{}.vec' - - def __init__(self, cache, language="en", **kwargs): - url = self.url_base.format(language) - name = cache + self._name.format(language) - super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs) - - -class FastTextMUSE(PretrainedEmbeddings): - def __init__(self, path, lang, limit=None): - super().__init__() - assert os.path.exists(path), print(f'pre-trained vectors not found in {path}') - self.embed = FastTextWikiNews(path, lang, max_vectors=limit) - - def vocabulary(self): - return set(self.embed.stoi.keys()) - - def dim(self): - return self.embed.dim - - def extract(self, words): - source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi) - extraction = torch.zeros((len(words), self.dim())) - extraction[source_idx] = self.embed.vectors[target_idx] - return extraction - - - diff --git a/src/embeddings/pretrained.py b/src/embeddings/pretrained.py deleted file mode 100644 index 026823e..0000000 --- a/src/embeddings/pretrained.py +++ /dev/null @@ -1,102 +0,0 @@ -from abc import ABC, abstractmethod -import torch, torchtext -# import gensim -# import os -import numpy as np - - -# class KeyedVectors: -# -# def __init__(self, word2index, weights): -# assert len(word2index)==weights.shape[0], 'wrong number of dimensions' -# index2word = {i:w for w,i in word2index.items()} -# assert len([i for i in range(len(index2word)) if i not in index2word])==0, 'gaps in indexing not allowed' -# self.word2index = word2index -# self.index2word = index2word -# self.weights = weights -# -# def extract(self, words): -# dim = self.weights.shape[1] -# v_size = len(words) -# -# source_idx, target_idx = [], [] -# for i,word in enumerate(words): -# if word not in self.word2index: continue -# j = self.word2index[word] -# source_idx.append(i) -# target_idx.append(j) -# -# extraction = np.zeros((v_size, dim)) -# extraction[np.asarray(source_idx)] = self.weights[np.asarray(target_idx)] -# -# return extraction - - -# class PretrainedEmbeddings(ABC): -# -# def __init__(self): -# super().__init__() -# -# @abstractmethod -# def vocabulary(self): pass -# -# @abstractmethod -# def dim(self): pass -# -# @classmethod -# def reindex(cls, words, word2index): -# source_idx, target_idx = [], [] -# for i, word in enumerate(words): -# if word not in word2index: continue -# j = word2index[word] -# source_idx.append(i) -# target_idx.append(j) -# source_idx = np.asarray(source_idx) -# target_idx = 
np.asarray(target_idx) -# return source_idx, target_idx - - -# class GloVe(PretrainedEmbeddings): -# -# def __init__(self, setname='840B', path='./vectors_cache', max_vectors=None): -# super().__init__() -# print(f'Loading GloVe pretrained vectors from torchtext') -# self.embed = torchtext.vocab.GloVe(setname, cache=path, max_vectors=max_vectors) -# print('Done') -# -# def vocabulary(self): -# return set(self.embed.stoi.keys()) -# -# def dim(self): -# return self.embed.dim -# -# def extract(self, words): -# source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi) -# extraction = torch.zeros((len(words), self.dim())) -# extraction[source_idx] = self.embed.vectors[target_idx] -# return extraction - - -# class Word2Vec(PretrainedEmbeddings): -# -# def __init__(self, path, limit=None): -# super().__init__() -# print(f'Loading word2vec pretrained vectors from {path}') -# assert os.path.exists(path), print(f'pre-trained keyed vectors not found in {path}') -# self.embed = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True, limit=limit) -# self.word2index={w:i for i,w in enumerate(self.embed.index2word)} -# print('Done') -# -# def vocabulary(self): -# return set(self.word2index.keys()) -# -# def dim(self): -# return self.embed.vector_size -# -# def extract(self, words): -# source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.word2index) -# extraction = np.zeros((len(words), self.dim())) -# extraction[source_idx] = self.embed.vectors[target_idx] -# extraction = torch.from_numpy(extraction).float() -# return extraction - diff --git a/src/embeddings/supervised.py b/src/embeddings/supervised.py deleted file mode 100755 index f84793e..0000000 --- a/src/embeddings/supervised.py +++ /dev/null @@ -1,74 +0,0 @@ -from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square -import numpy as np - - -def zscores(x, axis=0): #scipy.stats.zscores does not avoid division by 0, which can indeed occur - std = np.clip(np.std(x, ddof=1, axis=axis), 1e-5, None) - mean = np.mean(x, axis=axis) - return (x - mean) / std - - -def supervised_embeddings_tfidf(X,Y): - tfidf_norm = X.sum(axis=0) - tfidf_norm[tfidf_norm==0] = 1 - F = (X.T).dot(Y) / tfidf_norm.T - return F - - -def supervised_embeddings_ppmi(X,Y): - Xbin = X>0 - D = X.shape[0] - Pxy = (Xbin.T).dot(Y)/D - Px = Xbin.sum(axis=0)/D - Py = Y.sum(axis=0)/D - F = np.asarray(Pxy/(Px.T*Py)) - F = np.maximum(F, 1.0) - F = np.log(F) - return F - - -def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents=25000): - D = X.shape[0] - if D>max_documents: - print(f'sampling {max_documents}') - random_sample = np.random.permutation(D)[:max_documents] - X = X[random_sample] - Y = Y[random_sample] - cell_matrix = get_supervised_matrix(X, Y) - F = get_tsr_matrix(cell_matrix, tsr_score_funtion=tsr_function).T - return F - - -def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, lang='None', binary_structural_problems=-1, method='dotn', dozscore=True): - if max_label_space != 0: - print('computing supervised embeddings...') - nC = Y.shape[1] - - if method=='ppmi': - F = supervised_embeddings_ppmi(X, Y) - elif method == 'dotn': - F = supervised_embeddings_tfidf(X, Y) - elif method == 'ig': - F = supervised_embeddings_tsr(X, Y, information_gain) - elif method == 'chi2': - F = supervised_embeddings_tsr(X, Y, chi_square) - - if dozscore: - F = zscores(F, axis=0) - - # Dumping F-matrix for further studies - dump_it = False - if dump_it: - with 
open(f'/home/andreapdr/funneling_pdr/src/dumps/WCE_{lang}.tsv', 'w') as outfile: - np.savetxt(outfile, F, delimiter='\t') - with open(f'/home/andreapdr/funneling_pdr/src/dumps/dict_WCE_{lang}.tsv', 'w') as outfile: - for token in voc.keys(): - outfile.write(token+'\n') - - return F - - - - - - diff --git a/src/experiment_scripts/10run_dl_jrc.sh b/src/experiment_scripts/10run_dl_jrc.sh deleted file mode 100644 index ce04aa8..0000000 --- a/src/experiment_scripts/10run_dl_jrc.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env bash - -dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run -logfile=../log/log10run_dl_jrc.csv - -runs='0 1 2 3 4 5 6 7 8 9' -for run in $runs -do - dataset=$dataset_path$run.pickle - python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20 -done \ No newline at end of file diff --git a/src/experiment_scripts/10run_dl_rcv.sh b/src/experiment_scripts/10run_dl_rcv.sh deleted file mode 100644 index 51ca64b..0000000 --- a/src/experiment_scripts/10run_dl_rcv.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env bash - -dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run -logfile=../log/log10run_dl_rcv.csv - -runs='0 1 2 3 4 5 6 7 8 9' -for run in $runs -do - dataset=$dataset_path$run.pickle - python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20 -done diff --git a/src/experiment_scripts/10run_jrc.sh b/src/experiment_scripts/10run_jrc.sh deleted file mode 100644 index 37e3333..0000000 --- a/src/experiment_scripts/10run_jrc.sh +++ /dev/null @@ -1,12 +0,0 @@ -dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle -logfile=./results/10run_jrc_final_results.csv - -runs='0 1 2 3 4 5 6 7 8 9' -for run in $runs -do - dataset=$dataset_path$run.pickle - python main_multimodal_cls.py $dataset -o $logfile -P -z -c --l2 - python main_multimodal_cls.py $dataset -o $logfile -S -z -c --l2 - python main_multimodal_cls.py $dataset -o $logfile -U -z -c --l2 - -done diff --git a/src/experiment_scripts/10run_jrc_combinations.sh b/src/experiment_scripts/10run_jrc_combinations.sh deleted file mode 100644 index 156a0a5..0000000 --- a/src/experiment_scripts/10run_jrc_combinations.sh +++ /dev/null @@ -1,16 +0,0 @@ -dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run -logfile=./results/funnelling_10run_jrc_CIKM.csv - -runs='6 7 8 9' #0 1 2 3 4 5 -for run in $runs -do - dataset=$dataset_path$run.pickle - #python main_gFun.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 Pr(views) concatenated (done up to run5) - python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 views concatenated - #python main_gFun.py $dataset -o $logfile -P -U -S -c -r -a -z --l2 --allprob - #python main_gFun.py $dataset -o $logfile -P -U -c -r -a -z --l2 --allprob - #python main_gFun.py $dataset -o $logfile -P -S -c -r -z --l2 --allprob - #python main_gFun.py $dataset -o $logfile -P -U -c -r -z --l2 --allprob - #python main_gFun.py $dataset -o $logfile -c -P -U -r -z --l2 - #python main_gFun.py $dataset -o $logfile -c -P -U -S -r -z --l2 -done \ No newline at end of file diff --git a/src/experiment_scripts/10run_rcv.sh b/src/experiment_scripts/10run_rcv.sh deleted file mode 100644 index 
9d49f94..0000000 --- a/src/experiment_scripts/10run_rcv.sh +++ /dev/null @@ -1,15 +0,0 @@ -dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run -logfile=./results/10run_rcv_final_results.csv - -runs='0 1 2 3 4 5 6 7 8 9' - -for run in $runs -do - dataset=$dataset_path$run.pickle - python main_multimodal_cls.py $dataset -o $logfile -P -z -c --l2 - python main_multimodal_cls.py $dataset -o $logfile -S -z -c --l2 - python main_multimodal_cls.py $dataset -o $logfile -U -z -c --l2 - -done - - diff --git a/src/experiment_scripts/10run_rcv_combinations.sh b/src/experiment_scripts/10run_rcv_combinations.sh deleted file mode 100644 index b5d8a3b..0000000 --- a/src/experiment_scripts/10run_rcv_combinations.sh +++ /dev/null @@ -1,16 +0,0 @@ -dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run -logfile=./results/funnelling_10run_rcv_CIKM_allprob_concatenated.csv - -runs='0 1 2 3 4 5 6 7 8 9' -for run in $runs -do - dataset=$dataset_path$run.pickle - #python main_gFun.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 Pr(views) concatenated - python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 views concatenated - #python main_gFun.py $dataset -o $logfile -P -U -c -r -a -z --l2 --allprob - #python main_gFun.py $dataset -o $logfile -P -U -S -c -r -a -z --l2 --allprob - #python main_gFun.py $dataset -o $logfile -P -S -c -r -z --l2 --allprob - #python main_gFun.py $dataset -o $logfile -P -U -c -r -z --l2 --allprob - #python main_gFun.py $dataset -o $logfile -c -P -U -r -z --l2 - #python main_gFun.py $dataset -o $logfile -c -P -U -S -r -z --l2 -done \ No newline at end of file diff --git a/src/experiment_scripts/extract_features.sh b/src/experiment_scripts/extract_features.sh deleted file mode 100644 index d0bd3ac..0000000 --- a/src/experiment_scripts/extract_features.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env bash - -dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run# - -runs='1 2 3 4 5 6 7 8 9' -for run in $runs -do - dataset=$dataset_path$run.pickle - modelpath=/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run$runs - python main_mbert_extractor.py --dataset $dataset --modelpath $modelpath -done - -dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle -python main_mbert_extractor.py --dataset $dataset --modelpath $modelpath \ No newline at end of file diff --git a/src/experiment_scripts/main_deep_learning.py b/src/experiment_scripts/main_deep_learning.py deleted file mode 100755 index ee56054..0000000 --- a/src/experiment_scripts/main_deep_learning.py +++ /dev/null @@ -1,329 +0,0 @@ -import argparse -import torch.nn as nn -from torch.optim.lr_scheduler import StepLR -from dataset_builder import MultilingualDataset -from learning.transformers import load_muse_embeddings -from models.lstm_class import RNNMultilingualClassifier -from util.csv_log import CSVLog -from util.early_stop import EarlyStopping -from util.common import * -from util.file import create_if_not_exist -from time import time -from tqdm import tqdm -from util.evaluation import evaluate -from util.file import get_file_name -# import pickle - -allowed_nets = {'rnn'} - -# instantiates the net, initializes the model parameters, and sets embeddings trainable if requested -def init_Net(nC, multilingual_index, xavier_uniform=True): - net=opt.net - assert net in 
allowed_nets, f'{net} not supported, valid ones are={allowed_nets}' - - # instantiate the required net - if net=='rnn': - only_post = opt.posteriors and (not opt.pretrained) and (not opt.supervised) - if only_post: - print('working on ONLY POST mode') - model = RNNMultilingualClassifier( - output_size=nC, - hidden_size=opt.hidden, - lvocab_size=multilingual_index.l_vocabsize(), - learnable_length=opt.learnable, - lpretrained=multilingual_index.l_embeddings(), - drop_embedding_range=multilingual_index.sup_range, - drop_embedding_prop=opt.sup_drop, - post_probabilities=opt.posteriors, - only_post=only_post, - bert_embeddings=opt.mbert - ) - - # weight initialization - if xavier_uniform: - for p in model.parameters(): - if p.dim() > 1 and p.requires_grad: - nn.init.xavier_uniform_(p) - - if opt.tunable: - # this has to be performed *after* Xavier initialization is done, - # otherwise the pretrained embedding parameters will be overrided - model.finetune_pretrained() - - return model.cuda() - - -def set_method_name(): - method_name = f'{opt.net}(H{opt.hidden})' - if opt.pretrained: - method_name += f'-Muse' - if opt.supervised: - method_name += f'-WCE' - if opt.posteriors: - method_name += f'-Posteriors' - if opt.mbert: - method_name += f'-mBert' - if (opt.pretrained or opt.supervised) and opt.tunable: - method_name += '-(trainable)' - else: - method_name += '-(static)' - if opt.learnable > 0: - method_name += f'-Learnable{opt.learnable}' - return method_name - - -def init_optimizer(model, lr): - return torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr, weight_decay=opt.weight_decay) - - -def init_logfile(method_name, opt): - logfile = CSVLog(opt.log_file, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse']) - logfile.set_default('dataset', opt.dataset) - logfile.set_default('run', opt.seed) - logfile.set_default('method', method_name) - assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} ' \ - f'and run {opt.seed} already calculated' - return logfile - - -# loads the MUSE embeddings if requested, or returns empty dictionaries otherwise -def load_pretrained_embeddings(we_path, langs): - lpretrained = lpretrained_vocabulary = none_dict(langs) - if opt.pretrained: - lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1) - lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs} - return lpretrained, lpretrained_vocabulary - - -def get_lr(optimizer): - for param_group in optimizer.param_groups: - return param_group['lr'] - - -def train(model, batcher, ltrain_index, ltrain_posteriors, ltrain_bert, lytr, tinit, logfile, criterion, optim, epoch, method_name): - _dataset_path = opt.dataset.split('/')[-1].split('_') - dataset_id = _dataset_path[0] + _dataset_path[-1] - - loss_history = [] - model.train() - for idx, (batch, post, bert_emb, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, ltrain_bert, lytr)): - optim.zero_grad() - # _out = model(batch, post, bert_emb, lang) - loss = criterion(model(batch, post, bert_emb, lang), target) - loss.backward() - clip_gradient(model) - optim.step() - loss_history.append(loss.item()) - - if idx % opt.log_interval == 0: - interval_loss = np.mean(loss_history[-opt.log_interval:]) - print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}') - - mean_loss = np.mean(interval_loss) - logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, 
timelapse=time() - tinit) - return mean_loss - - -def test(model, batcher, ltest_index, ltest_posteriors, lte_bert, lyte, tinit, epoch, logfile, criterion, measure_prefix): - - loss_history = [] - model.eval() - langs = sorted(ltest_index.keys()) - predictions = {l:[] for l in langs} - yte_stacked = {l:[] for l in langs} - batcher.init_offset() - for batch, post, bert_emb, target, lang in tqdm(batcher.batchify(ltest_index, ltest_posteriors, lte_bert, lyte), desc='evaluation: '): - logits = model(batch, post, bert_emb, lang) - loss = criterion(logits, target).item() - prediction = predict(logits) - predictions[lang].append(prediction) - yte_stacked[lang].append(target.detach().cpu().numpy()) - loss_history.append(loss) - - ly = {l:np.vstack(yte_stacked[l]) for l in langs} - ly_ = {l:np.vstack(predictions[l]) for l in langs} - l_eval = evaluate(ly, ly_) - metrics = [] - for lang in langs: - macrof1, microf1, macrok, microk = l_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - if measure_prefix == 'te': - print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') - Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0) - print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]') - - mean_loss = np.mean(loss_history) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit) - - return Mf1 - - -# ---------------------------------------------------------------------------------------------------------------------- -def main(): - DEBUGGING = False - - method_name = set_method_name() - logfile = init_logfile(method_name, opt) - - # Loading the dataset - data = MultilingualDataset.load(opt.dataset) - # data.set_view(languages=['it', 'fr']) # Testing with less langs - data.show_dimensions() - langs = data.langs() - l_devel_raw, l_devel_target = data.training(target_as_csr=True) - l_test_raw, l_test_target = data.test(target_as_csr=True) - - # Loading the MUSE pretrained embeddings (only if requested) - lpretrained, lpretrained_vocabulary = load_pretrained_embeddings(opt.we_path, langs) - # lpretrained_vocabulary = none_dict(langs) # do not keep track of words known in pretrained embeddings vocabulary that are also present in test set - - # Data preparation: indexing / splitting / embedding matrices (pretrained + supervised) / posterior probs - multilingual_index = MultilingualIndex() - multilingual_index.index(l_devel_raw, l_devel_target, l_test_raw, lpretrained_vocabulary) - multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=opt.seed) - multilingual_index.embedding_matrices(lpretrained, opt.supervised) - if opt.posteriors: - if DEBUGGING: - import pickle - with open('/home/andreapdr/funneling_pdr/dumps/posteriors_jrc_run0.pickle', 'rb') as infile: - data_post = pickle.load(infile) - lPtr = data_post[0] - lPva = data_post[1] - lPte = data_post[2] - print('## DEBUGGING MODE: loaded dumped posteriors for jrc run0') - else: - lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=5000) - else: - lPtr, lPva, lPte = None, None, 
None - - if opt.mbert: - _dataset_path = opt.dataset.split('/')[-1].split('_') - _model_folder = _dataset_path[0] + '_' + _dataset_path[-1].replace('.pickle', '') - # print(f'Model Folder: {_model_folder}') - - if DEBUGGING: - with open('/home/andreapdr/funneling_pdr/dumps/mBert_jrc_run0.pickle', 'rb') as infile: - data_embed = pickle.load(infile) - tr_bert_embeddings = data_embed[0] - va_bert_embeddings = data_embed[1] - te_bert_embeddings = data_embed[2] - print('## DEBUGGING MODE: loaded dumped mBert embeddings for jrc run0') - else: - tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings \ - = multilingual_index.bert_embeddings(f'/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-{_model_folder}/') - else: - tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings = None, None, None - - # Model initialization - model = init_Net(data.num_categories(), multilingual_index) - - optim = init_optimizer(model, lr=opt.lr) - criterion = torch.nn.BCEWithLogitsLoss().cuda() - lr_scheduler = StepLR(optim, step_size=25, gamma=0.5) - batcher_train = Batch(opt.batch_size, batches_per_epoch=10, languages=langs, lpad=multilingual_index.l_pad()) - batcher_eval = Batch(opt.batch_size, batches_per_epoch=-1, languages=langs, lpad=multilingual_index.l_pad()) - - tinit = time() - create_if_not_exist(opt.checkpoint_dir) - early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience, - checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}') - - l_train_index, l_train_target = multilingual_index.l_train() - l_val_index, l_val_target = multilingual_index.l_val() - l_test_index = multilingual_index.l_test_index() - - print('-'*80) - print('Start training') - for epoch in range(1, opt.nepochs + 1): - train(model, batcher_train, l_train_index, lPtr, tr_bert_embeddings, l_train_target, tinit, logfile, criterion, optim, epoch, method_name) - lr_scheduler.step() # reduces the learning rate - - # validation - macrof1 = test(model, batcher_eval, l_val_index, lPva, va_bert_embeddings, l_val_target, tinit, epoch, logfile, criterion, 'va') - early_stop(macrof1, epoch) - if opt.test_each>0: - if (opt.plotmode and (epoch==1 or epoch%opt.test_each==0)) or (not opt.plotmode and epoch%opt.test_each==0 and epoch0: - print(f'running last {opt.val_epochs} training epochs on the validation set') - for val_epoch in range(1, opt.val_epochs + 1): - batcher_train.init_offset() - train(model, batcher_train, l_val_index, lPva, va_bert_embeddings, l_val_target, tinit, logfile, criterion, optim, epoch+val_epoch, method_name) - - # final test - print('Training complete: testing') - test(model, batcher_eval, l_test_index, lPte, te_bert_embeddings, l_test_target, tinit, epoch, logfile, criterion, 'te') - - -# ---------------------------------------------------------------------------------------------------------------------- -if __name__ == '__main__': - - parser = argparse.ArgumentParser(description='Neural text classification with Word-Class Embeddings') - parser.add_argument('dataset', type=str, metavar='datasetpath', help=f'path to the pickled dataset') - parser.add_argument('--batch-size', type=int, default=50, metavar='int', help='input batch size (default: 100)') - parser.add_argument('--batch-size-test', type=int, default=250, metavar='int', help='batch size for testing (default: 250)') - parser.add_argument('--nepochs', type=int, default=200, metavar='int', help='number of epochs (default: 200)') - parser.add_argument('--patience', type=int, default=10, metavar='int', help='patience for 
early-stop (default: 10)') - parser.add_argument('--plotmode', action='store_true', default=False, help='in plot mode executes a long run in order ' - 'to generate enough data to produce trend plots (test-each should be >0. This mode is ' - 'used to produce plots, and does not perform an evaluation on the test set.') - parser.add_argument('--hidden', type=int, default=512, metavar='int', help='hidden lstm size (default: 512)') - parser.add_argument('--lr', type=float, default=1e-3, metavar='float', help='learning rate (default: 1e-3)') - parser.add_argument('--weight_decay', type=float, default=0, metavar='float', help='weight decay (default: 0)') - parser.add_argument('--sup-drop', type=float, default=0.5, metavar='[0.0, 1.0]', help='dropout probability for the supervised matrix (default: 0.5)') - parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)') - parser.add_argument('--svm-max-docs', type=int, default=1000, metavar='int', help='maximum number of documents by ' - 'language used to train the calibrated SVMs (only used if --posteriors is active)') - parser.add_argument('--log-interval', type=int, default=10, metavar='int', help='how many batches to wait before printing training status') - parser.add_argument('--log-file', type=str, default='../log/log.csv', metavar='str', help='path to the log csv file') - parser.add_argument('--test-each', type=int, default=0, metavar='int', help='how many epochs to wait before invoking test (default: 0, only at the end)') - parser.add_argument('--checkpoint-dir', type=str, default='../checkpoint', metavar='str', help='path to the directory containing checkpoints') - parser.add_argument('--net', type=str, default='rnn', metavar='str', help=f'net, one in {allowed_nets}') - parser.add_argument('--pretrained', action='store_true', default=False, help='use MUSE pretrained embeddings') - parser.add_argument('--supervised', action='store_true', default=False, help='use supervised embeddings') - parser.add_argument('--posteriors', action='store_true', default=False, help='concatenate posterior probabilities to doc embeddings') - parser.add_argument('--learnable', type=int, default=0, metavar='int', help='dimension of the learnable embeddings (default 0)') - parser.add_argument('--val-epochs', type=int, default=1, metavar='int', help='number of training epochs to perform on the ' - 'validation set once training is over (default 1)') - parser.add_argument('--we-path', type=str, default='../embeddings', metavar='str', - help=f'path to MUSE pretrained embeddings') - parser.add_argument('--max-label-space', type=int, default=300, metavar='int', help='larger dimension allowed for the ' - 'feature-label embedding (if larger, then PCA with this number of components is applied ' - '(default 300)') - parser.add_argument('--force', action='store_true', default=False, help='do not check if this experiment has already been run') - parser.add_argument('--tunable', action='store_true', default=False, - help='pretrained embeddings are tunable from the beginning (default False, i.e., static)') - parser.add_argument('--mbert', action='store_true', default=False, - help='use mBert embeddings') - - opt = parser.parse_args() - - assert torch.cuda.is_available(), 'CUDA not available' - assert not opt.plotmode or opt.test_each > 0, 'plot mode implies --test-each>0' - # if opt.pickle_dir: opt.pickle_path = join(opt.pickle_dir, f'{opt.dataset}.pickle') - torch.manual_seed(opt.seed) - - main() diff --git 
a/src/experiment_scripts/main_embeddings_cls.py b/src/experiment_scripts/main_embeddings_cls.py deleted file mode 100644 index 08552d3..0000000 --- a/src/experiment_scripts/main_embeddings_cls.py +++ /dev/null @@ -1,127 +0,0 @@ -import os -from dataset_builder import MultilingualDataset -from util.evaluation import * -from optparse import OptionParser -from util.file import exists -from util.results import PolylingualClassificationResults -from util.util import get_learner, get_params - -parser = OptionParser() - -parser.add_option("-d", "--dataset", dest="dataset", - help="Path to the multilingual dataset processed and stored in .pickle format", - default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle") - -parser.add_option("-o", "--output", dest="output", - help="Result file", type=str, default='./results/results.csv') - -parser.add_option("-e", "--mode-embed", dest="mode_embed", - help="Set the embedding to be used [none, unsupervised, supervised, both]", type=str, default='none') - -parser.add_option("-w", "--we-path", dest="we_path", - help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/') - -parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str, - default='MUSE') - -parser.add_option("-s", "--set_c", dest="set_c",type=float, - help="Set the C parameter", default=1) - -parser.add_option("-c", "--optimc", dest="optimc", action='store_true', - help="Optimize hyperparameters", default=False) - -parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int, - help="Number of parallel jobs (default is -1, all)", default=-1) - -parser.add_option("-p", "--pca", dest="max_labels_S", type=int, - help="If smaller than number of target classes, PCA will be applied to supervised matrix. " - "If set to 0 it will automatically search for the best number of components. " - "If set to -1 it will apply PCA to the vstacked supervised matrix (PCA dim set to 50 atm)", - default=300) - -parser.add_option("-u", "--upca", dest="max_labels_U", type=int, - help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix." - " If set to 0 it will automatically search for the best number of components", default=300) - -parser.add_option("-l", dest="lang", type=str) - -if __name__ == '__main__': - (op, args) = parser.parse_args() - - assert exists(op.dataset), 'Unable to find file '+str(op.dataset) - assert not (op.set_c != 1. 
and op.optimc), 'Parameter C cannot be defined along with optim_c option' - - dataset_file = os.path.basename(op.dataset) - - results = PolylingualClassificationResults('./results/PLE_results.csv') - - data = MultilingualDataset.load(op.dataset) - data.show_dimensions() - - # data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10))) - # data.set_view(languages=[op.lang]) - # data.set_view(categories=list(range(10))) - lXtr, lytr = data.training() - lXte, lyte = data.test() - - if op.set_c != -1: - meta_parameters = None - else: - meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}] - - # Embeddings and WCE config - _available_mode = ['none', 'unsupervised', 'supervised', 'both'] - _available_type = ['MUSE', 'FastText'] - assert op.mode_embed in _available_mode, f'{op.mode_embed} not in {_available_mode}' - assert op.we_type in _available_type, f'{op.we_type} not in {_available_type}' - - if op.mode_embed == 'none': - config = {'unsupervised': False, - 'supervised': False, - 'we_type': None} - _config_id = 'None' - elif op.mode_embed == 'unsupervised': - config = {'unsupervised': True, - 'supervised': False, - 'we_type': op.we_type} - _config_id = 'M' - elif op.mode_embed == 'supervised': - config = {'unsupervised': False, - 'supervised': True, - 'we_type': None} - _config_id = 'F' - elif op.mode_embed == 'both': - config = {'unsupervised': True, - 'supervised': True, - 'we_type': op.we_type} - _config_id = 'M+F' - - config['reduction'] = 'PCA' - config['max_label_space'] = op.max_labels_S - config['dim_reduction_unsupervised'] = op.max_labels_U - # config['post_pca'] = op.post_pca - # config['plot_covariance_matrices'] = True - - result_id = dataset_file + 'MLE_andrea' + _config_id + ('_optimC' if op.optimc else '') - - ple = PolylingualEmbeddingsClassifier(wordembeddings_path='/home/andreapdr/CLESA/', - config = config, - learner=get_learner(calibrate=False), - c_parameters=get_params(dense=False), - n_jobs=op.n_jobs) - - print('# Fitting ...') - ple.fit(lXtr, lytr) - - print('# Evaluating ...') - ple_eval = evaluate_method(ple, lXte, lyte) - - metrics = [] - for lang in lXte.keys(): - macrof1, microf1, macrok, microk = ple_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1)) - results.add_row('MLE', 'svm', _config_id, config['we_type'], - 'no','no', op.optimc, op.dataset.split('/')[-1], ple.time, - lang, macrof1, microf1, macrok, microk, '') - print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0)) diff --git a/src/experiment_scripts/main_majorityvoting_cls.py b/src/experiment_scripts/main_majorityvoting_cls.py deleted file mode 100644 index ee5efe5..0000000 --- a/src/experiment_scripts/main_majorityvoting_cls.py +++ /dev/null @@ -1,155 +0,0 @@ -import os -from dataset_builder import MultilingualDataset -# from learning.learners import * -# from learning.learners import FunnellingMultimodal -from learning.transformers import PosteriorProbabilitiesEmbedder, TfidfVectorizerMultilingual, WordClassEmbedder, MuseEmbedder, FeatureSet2Posteriors, Voting -from util.evaluation import * -from optparse import OptionParser -from util.file import exists -from util.results import PolylingualClassificationResults -from sklearn.svm import SVC - -parser = OptionParser() - -# parser.add_option("-d", "--dataset", dest="dataset", -# help="Path to the multilingual dataset processed and stored in .pickle format", -# 
default="../rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle") - -parser.add_option("-o", "--output", dest="output", - help="Result file", type=str, default='./results/results.csv') - -parser.add_option("-P", "--probs", dest="posteriors", action='store_true', - help="Add posterior probabilities to the document embedding representation", default=False) - -parser.add_option("-S", "--supervised", dest="supervised", action='store_true', - help="Add supervised (Word-Class Embeddings) to the document embedding representation", default=False) - -parser.add_option("-U", "--pretrained", dest="pretrained", action='store_true', - help="Add pretrained MUSE embeddings to the document embedding representation", default=False) - -parser.add_option("-w", "--we-path", dest="we_path", - help="Path to the MUSE polylingual word embeddings", default='../embeddings') - -parser.add_option("-s", "--set_c", dest="set_c",type=float, - help="Set the C parameter", default=1) - -parser.add_option("-c", "--optimc", dest="optimc", action='store_true', - help="Optimize hyperparameters", default=False) - -parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int, - help="Number of parallel jobs (default is -1, all)", default=-1) - -parser.add_option("-p", "--pca", dest="max_labels_S", type=int, - help="If smaller than number of target classes, PCA will be applied to supervised matrix. ", - default=300) - -parser.add_option("-r", "--remove-pc", dest="sif", action='store_true', - help="Remove common component when computing dot product of word embedding matrices", default=False) - -# parser.add_option("-u", "--upca", dest="max_labels_U", type=int, -# help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix." -# " If set to 0 it will automatically search for the best number of components", default=300) - -# parser.add_option("-a", dest="post_pca", -# help="If set to True, will apply PCA to the z-space (posterior probabilities stacked along with " -# "embedding space", default=False) - - -def get_learner(calibrate=False, kernel='linear'): - return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, gamma='auto') - - -def get_params(dense=False): - if not op.optimc: - return None - c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1] - kernel = 'rbf' if dense else 'linear' - return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}] - -####################################################################################################################### - - -if __name__ == '__main__': - (op, args) = parser.parse_args() - - assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)' - dataset = args[0] - - assert exists(dataset), 'Unable to find file '+str(dataset) - assert not (op.set_c != 1. 
and op.optimc), 'Parameter C cannot be defined along with optim_c option' - assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed' - - dataset_file = os.path.basename(dataset) - - results = PolylingualClassificationResults(op.output) - - data = MultilingualDataset.load(dataset) - data.show_dimensions() - - lXtr, lytr = data.training() - lXte, lyte = data.test() - - meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}] - - # result_id = f'{dataset_file}_Prob{op.posteriors}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}' - result_id = f'{dataset_file}_ProbPost={op.posteriors}_WCE={op.supervised}(PCA={op.max_labels_S})_' \ - f'MUSE={op.pretrained}_weight={"todo"}_l2={"todo"}_zscore={"todo"}{"_optimC" if op.optimc else ""}' - print(f'{result_id}') - - # text preprocessing - tfidfvectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) - - lXtr = tfidfvectorizer.fit_transform(lXtr, lytr) - lXte = tfidfvectorizer.transform(lXte) - lV = tfidfvectorizer.vocabulary() - - classifiers = [] - if op.posteriors: - classifiers.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None)) - if op.supervised: - classifiers.append(FeatureSet2Posteriors(WordClassEmbedder(max_label_space=op.max_labels_S))) - if op.pretrained: - classifiers.append(FeatureSet2Posteriors(MuseEmbedder(op.we_path, lV=lV))) - - classifier = Voting(*classifiers) - - print('# Fitting ...') - classifier.fit(lXtr, lytr) - - print('\n# Evaluating ...') - l_eval = evaluate_method(classifier, lXte, lyte) - - # renaming arguments to be printed on log - _id = '' - _id_conf = [op.posteriors, op.supervised, op.pretrained] - _id_name = ['+P', '+W', '+M'] - for i, conf in enumerate(_id_conf): - if conf: - _id += _id_name[i] - _id = _id.lstrip('+') - _dataset_path = dataset.split('/')[-1].split('_') - dataset_id = _dataset_path[0] + _dataset_path[-1] - - metrics = [] - for lang in lXte.keys(): - macrof1, microf1, macrok, microk = l_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') - results.add_row(method='Voting', - learner='svm', - optimp=op.optimc, - sif=op.sif, - zscore='todo', - l2='todo', - wescaler='todo', - pca=op.max_labels_S, - id=_id, - dataset=dataset_id, - time='todo', - lang=lang, - macrof1=macrof1, - microf1=microf1, - macrok=macrok, - microk=microk, - notes='') - print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0)) diff --git a/src/experiment_scripts/main_mbert.py b/src/experiment_scripts/main_mbert.py deleted file mode 100644 index aa44407..0000000 --- a/src/experiment_scripts/main_mbert.py +++ /dev/null @@ -1,390 +0,0 @@ -from dataset_builder import MultilingualDataset -from transformers import BertTokenizer, BertForSequenceClassification, AdamW -from torch.utils.data import Dataset, DataLoader -import numpy as np -import torch -from util.common import predict -from time import time -from util.csv_log import CSVLog -from util.evaluation import evaluate -from util.early_stop import EarlyStopping -from torch.optim.lr_scheduler import StepLR -from sklearn.model_selection import train_test_split -from copy import deepcopy -import argparse -# from torch.utils.tensorboard import SummaryWriter - - -def check_sentences(sentences): - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') - for sentence in sentences: - 
converted = [tokenizer._convert_id_to_token(token) for token in sentence.numpy() if token != 0] - print(converted) - return - - -def get_model(n_out): - print('# Initializing model ...') - model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=n_out) - return model - - -def set_method_name(): - return 'mBERT' - - -def init_optimizer(model, lr): - # return AdamW(model.parameters(), lr=lr, weight_decay=opt.weight_decay) - no_decay = ['bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() - if not any(nd in n for nd in no_decay)], - 'weight_decay': opt.weight_decay}, - {'params': [p for n, p in model.named_parameters() - if any(nd in n for nd in no_decay)], - 'weight_decay': opt.weight_decay} - ] - optimizer = AdamW(optimizer_grouped_parameters, lr=lr) - return optimizer - - -def init_logfile(method_name, opt): - logfile = CSVLog(opt.log_file, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse']) - logfile.set_default('dataset', opt.dataset) - logfile.set_default('run', opt.seed) - logfile.set_default('method', method_name) - assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} ' \ - f'and run {opt.seed} already calculated' - return logfile - - -def get_lr(optimizer): - for param_group in optimizer.param_groups: - return param_group['lr'] - - -def get_dataset_name(datapath): - possible_splits = [str(i) for i in range(10)] - splitted = datapath.split('_') - id_split = splitted[-1].split('.')[0][-1] - if id_split in possible_splits: - dataset_name = splitted[0].split('/')[-1] - return f'{dataset_name}_run{id_split}' - elif splitted[-2].split('.')[0] == 'full': - dataset_name = splitted[0].split('/')[-1] - return f'{dataset_name}_fullrun' - - -def load_datasets(datapath): - data = MultilingualDataset.load(datapath) - # data.set_view(languages=['it']) #, categories=[0, 1, 2, 3, 4]) # Testing with less langs - data.show_dimensions() - - l_devel_raw, l_devel_target = data.training(target_as_csr=False) - l_test_raw, l_test_target = data.test(target_as_csr=False) - - return l_devel_raw, l_devel_target, l_test_raw, l_test_target - - -def do_tokenization(l_dataset, max_len=512, verbose=True): - if verbose: - print('# Starting Tokenization ...') - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') - langs = l_dataset.keys() - l_tokenized = {} - for lang in langs: - l_tokenized[lang] = tokenizer(l_dataset[lang], - truncation=True, - max_length=max_len, - padding='max_length') - return l_tokenized - - -class TrainingDataset(Dataset): - """ - data: dict of lang specific tokenized data - labels: dict of lang specific targets - """ - - def __init__(self, data, labels): - self.langs = data.keys() - self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)} - - for i, lang in enumerate(self.langs): - _data = data[lang]['input_ids'] - _data = np.array(_data) - _labels = labels[lang] - _lang_value = np.full(len(_data), self.lang_ids[lang]) - - if i == 0: - self.data = _data - self.labels = _labels - self.lang_index = _lang_value - else: - self.data = np.vstack((self.data, _data)) - self.labels = np.vstack((self.labels, _labels)) - self.lang_index = np.concatenate((self.lang_index, _lang_value)) - - def __len__(self): - return len(self.data) - - def __getitem__(self, idx): - x = self.data[idx] - y = self.labels[idx] - lang = self.lang_index[idx] - - return x, torch.tensor(y, 
dtype=torch.float), lang - - def get_lang_ids(self): - return self.lang_ids - - def get_nclasses(self): - if hasattr(self, 'labels'): - return len(self.labels[0]) - else: - print('Method called before init!') - - -def freeze_encoder(model): - for param in model.base_model.parameters(): - param.requires_grad = False - return model - - -def check_param_grad_status(model): - print('#' * 50) - print('Model paramater status:') - for name, child in model.named_children(): - trainable = False - for param in child.parameters(): - if param.requires_grad: - trainable = True - if not trainable: - print(f'{name} is frozen') - else: - print(f'{name} is not frozen') - print('#' * 50) - - -def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile, writer): - _dataset_path = opt.dataset.split('/')[-1].split('_') - dataset_id = _dataset_path[0] + _dataset_path[-1] - - loss_history = [] - model.train() - - for idx, (batch, target, lang_idx) in enumerate(train_dataloader): - optim.zero_grad() - out = model(batch.cuda()) - logits = out[0] - loss = criterion(logits, target.cuda()) - loss.backward() - # clip_gradient(model) - optim.step() - loss_history.append(loss.item()) - - if writer is not None: - _n_step = (epoch - 1) * (len(train_dataloader)) + idx - writer.add_scalar('Loss_step/Train', loss, _n_step) - - # Check tokenized sentences consistency - # check_sentences(batch.cpu()) - - if idx % opt.log_interval == 0: - interval_loss = np.mean(loss_history[-opt.log_interval:]) - print( - f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}') - - mean_loss = np.mean(interval_loss) - logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit) - return mean_loss - - -def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, measure_prefix, writer): - print('# Validating model ...') - loss_history = [] - model.eval() - langs = lang_ids.keys() - id_2_lang = {v: k for k, v in lang_ids.items()} - predictions = {l: [] for l in langs} - yte_stacked = {l: [] for l in langs} - - for batch, target, lang_idx in test_dataloader: - out = model(batch.cuda()) - logits = out[0] - loss = criterion(logits, target.cuda()).item() - prediction = predict(logits) - loss_history.append(loss) - - # Assigning prediction to dict in predictions and yte_stacked according to lang_idx - for i, pred in enumerate(prediction): - lang_pred = id_2_lang[lang_idx.numpy()[i]] - predictions[lang_pred].append(pred) - yte_stacked[lang_pred].append(target[i].detach().cpu().numpy()) - - ly = {l: np.vstack(yte_stacked[l]) for l in langs} - ly_ = {l: np.vstack(predictions[l]) for l in langs} - l_eval = evaluate(ly, ly_) - metrics = [] - for lang in langs: - macrof1, microf1, macrok, microk = l_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - if measure_prefix == 'te': - print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') - Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0) - print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]') - if writer is not None: - writer.add_scalars('Eval Metrics', {'Mf1': Mf1, 'mF1': mF1, 'MK': MK, 'mk':mk}, epoch) - - mean_loss = np.mean(loss_history) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, 
measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit) - - return Mf1 - - -def get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop, max_val, seed): - l_split_va = deepcopy(l_tokenized_tr) - l_split_val_target = {l: [] for l in l_tokenized_tr.keys()} - l_split_tr = deepcopy(l_tokenized_tr) - l_split_tr_target = {l: [] for l in l_tokenized_tr.keys()} - - for lang in l_tokenized_tr.keys(): - val_size = int(min(len(l_tokenized_tr[lang]['input_ids']) * val_prop, max_val)) - l_split_tr[lang]['input_ids'], l_split_va[lang]['input_ids'], l_split_tr_target[lang], l_split_val_target[ - lang] = \ - train_test_split(l_tokenized_tr[lang]['input_ids'], l_devel_target[lang], test_size=val_size, - random_state=seed, shuffle=True) - - return l_split_tr, l_split_tr_target, l_split_va, l_split_val_target - - -def main(): - print('Running main ...') - - DATAPATH = opt.dataset - MAX_LEN = 512 - method_name = set_method_name() - logfile = init_logfile(method_name, opt) - - l_devel_raw, l_devel_target, l_test_raw, l_test_target = load_datasets(DATAPATH) - l_tokenized_tr = do_tokenization(l_devel_raw, max_len=MAX_LEN) - - l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = get_tr_val_split(l_tokenized_tr, l_devel_target, - val_prop=0.2, max_val=2000, - seed=opt.seed) - - l_tokenized_te = do_tokenization(l_test_raw, max_len=MAX_LEN) - - tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target) - va_dataset = TrainingDataset(l_split_va, l_split_val_target) - te_dataset = TrainingDataset(l_tokenized_te, l_test_target) - - tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True) - va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=True) - te_dataloader = DataLoader(te_dataset, batch_size=2, shuffle=False) - - - # Initializing model - nC = tr_dataset.get_nclasses() - model = get_model(nC) - model = model.cuda() - criterion = torch.nn.BCEWithLogitsLoss().cuda() - optim = init_optimizer(model, lr=opt.lr) - lr_scheduler = StepLR(optim, step_size=25, gamma=0.1) - early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience, - checkpoint=f'/home/andreapdr/funneling_pdr/hug_checkpoint/{method_name}-{get_dataset_name(opt.dataset)}', - is_bert=True) - - # Freezing encoder - # model = freeze_encoder(model) - check_param_grad_status(model) - - # Tensorboard logger - # writer = SummaryWriter('../log/tensorboard_logs/') - - # Training loop - tinit = time() - lang_ids = va_dataset.lang_ids - for epoch in range(1, opt.nepochs + 1): - print('# Start Training ...') - train(model, tr_dataloader, epoch, criterion, optim, method_name, tinit, logfile, writer=None) - lr_scheduler.step() # reduces the learning rate - - # Validation - macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va', writer=None) - early_stop(macrof1, epoch) - if opt.test_each > 0: - if (opt.plotmode and (epoch == 1 or epoch % opt.test_each == 0)) or ( - not opt.plotmode and epoch % opt.test_each == 0 and epoch < opt.nepochs): - test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te', writer=None) - - if early_stop.STOP: - print('[early-stop] STOP') - if not opt.plotmode: - break - - if not opt.plotmode: - print('-' * 80) - print('Training over. 
Performing final evaluation') - - model = early_stop.restore_checkpoint() - model = model.cuda() - - if opt.val_epochs > 0: - print(f'running last {opt.val_epochs} training epochs on the validation set') - for val_epoch in range(1, opt.val_epochs + 1): - train(model, va_dataloader, epoch + val_epoch, criterion, optim, method_name, tinit, logfile, writer=None) - - # final test - print('Training complete: testing') - test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te', writer=None) - - # writer.flush() - # writer.close() - exit('Code Executed!') - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Neural text classification with Word-Class Embeddings - mBert model') - - parser.add_argument('--dataset', type=str, - default='/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle', - metavar='datasetpath', help=f'path to the pickled dataset') - parser.add_argument('--nepochs', type=int, default=200, metavar='int', - help='number of epochs (default: 200)') - parser.add_argument('--lr', type=float, default=2e-5, metavar='float', - help='learning rate (default: 2e-5)') - parser.add_argument('--weight_decay', type=float, default=0, metavar='float', - help='weight decay (default: 0)') - parser.add_argument('--patience', type=int, default=10, metavar='int', - help='patience for early-stop (default: 10)') - parser.add_argument('--log-interval', type=int, default=20, metavar='int', - help='how many batches to wait before printing training status') - parser.add_argument('--log-file', type=str, default='../log/log_mBert.csv', metavar='str', - help='path to the log csv file') - parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)') - parser.add_argument('--force', action='store_true', default=False, - help='do not check if this experiment has already been run') - parser.add_argument('--checkpoint-dir', type=str, default='../checkpoint', metavar='str', - help='path to the directory containing checkpoints') - parser.add_argument('--plotmode', action='store_true', default=False, - help='in plot mode executes a long run in order ' - 'to generate enough data to produce trend plots (test-each should be >0. This mode is ' - 'used to produce plots, and does not perform an evaluation on the test set.') - parser.add_argument('--test-each', type=int, default=0, metavar='int', - help='how many epochs to wait before invoking test (default: 0, only at the end)') - parser.add_argument('--val-epochs', type=int, default=1, metavar='int', - help='number of training epochs to perform on the validation set once training is over (default 1)') - opt = parser.parse_args() - - # Testing different parameters ... 
- opt.weight_decay = 0.01 - opt.lr = 1e-5 - opt.patience = 5 - - main() - # TODO: refactor .cuda() -> .to(device) in order to check if the process is faster on CPU given the bigger batch size diff --git a/src/experiment_scripts/main_mbert_extractor.py b/src/experiment_scripts/main_mbert_extractor.py deleted file mode 100644 index 16f09d3..0000000 --- a/src/experiment_scripts/main_mbert_extractor.py +++ /dev/null @@ -1,110 +0,0 @@ -from experiment_scripts.main_mbert import * -import pickle - - -class ExtractorDataset(Dataset): - """ - data: dict of lang specific tokenized data - labels: dict of lang specific targets - """ - - def __init__(self, data): - self.langs = data.keys() - self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)} - - for i, lang in enumerate(self.langs): - _data = data[lang]['input_ids'] - _data = np.array(_data) - _lang_value = np.full(len(_data), self.lang_ids[lang]) - - if i == 0: - self.data = _data - self.lang_index = _lang_value - else: - self.data = np.vstack((self.data, _data)) - self.lang_index = np.concatenate((self.lang_index, _lang_value)) - - def __len__(self): - return len(self.data) - - def __getitem__(self, idx): - x = self.data[idx] - lang = self.lang_index[idx] - - return x, lang - - def get_lang_ids(self): - return self.lang_ids - - -def feature_extractor(data, lang_ids, model_path='/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run0/'): - print('# Feature Extractor Mode...') - from transformers import BertConfig - config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True, num_labels=300) - model = BertForSequenceClassification.from_pretrained(model_path, - config=config).cuda() - - """ - Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for - the output of each layer) of shape (batch_size, sequence_length, hidden_size) - """ - all_batch_embeddings = {} - id2lang = {v:k for k,v in lang_ids.items()} - with torch.no_grad(): - for batch, target, lang_idx in data: - out = model(batch.cuda()) - last_hidden_state = out[1][-1] - batch_embeddings = last_hidden_state[:, 0, :] - for i, l_idx in enumerate(lang_idx.numpy()): - if id2lang[l_idx] not in all_batch_embeddings.keys(): - all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy() - else: - all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]], - batch_embeddings[i].detach().cpu().numpy())) - - return all_batch_embeddings, id2lang - - -def main(): - print('Running main ...') - print(f'Model path: {opt.modelpath}\nDataset path: {opt.dataset}') - DATAPATH = opt.dataset - MAX_LEN = 512 - - l_devel_raw, l_devel_target, l_test_raw, l_test_target = load_datasets(DATAPATH) - l_tokenized_tr = do_tokenization(l_devel_raw, max_len=MAX_LEN) - l_tokenized_te = do_tokenization(l_test_raw, max_len=MAX_LEN) - - tr_dataset = TrainingDataset(l_tokenized_tr, l_devel_target) - tr_lang_ids = tr_dataset.lang_ids - - te_dataset = TrainingDataset(l_tokenized_te, l_test_target) - te_lang_ids = te_dataset.lang_ids - - tr_dataloader = DataLoader(tr_dataset, batch_size=64, shuffle=False) # Shuffle False to extract doc embeddings - te_dataloader = DataLoader(te_dataset, batch_size=64, shuffle=False) # Shuffle False to extract doc - - tr_all_batch_embeddings, id2lang_tr = feature_extractor(tr_dataloader, tr_lang_ids, opt.modelpath) # Extracting doc embed for devel - with open(f'{opt.modelpath}/TR_embed_{get_dataset_name(opt.dataset)}.pkl', 'wb') as outfile: - 
pickle.dump((tr_all_batch_embeddings, id2lang_tr), outfile) - - te_all_batch_embeddings, id2lang_te = feature_extractor(te_dataloader, te_lang_ids, opt.modelpath) # Extracting doc embed for test - with open(f'{opt.modelpath}/TE_embed_{get_dataset_name(opt.dataset)}.pkl', 'wb') as outfile: - pickle.dump((te_all_batch_embeddings, id2lang_te), outfile) - - exit('Extraction completed!') - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='mBert model document embedding extractor') - - parser.add_argument('--dataset', type=str, - default='/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle', - metavar='datasetpath', help=f'path to the pickled dataset') - parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)') - parser.add_argument('--modelpath', type=str, default='/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run0', - metavar='modelpath', help=f'path to pre-trained mBert model') - opt = parser.parse_args() - - main() - diff --git a/src/experiment_scripts/main_qualitative_analysis.py b/src/experiment_scripts/main_qualitative_analysis.py deleted file mode 100644 index aead994..0000000 --- a/src/experiment_scripts/main_qualitative_analysis.py +++ /dev/null @@ -1,49 +0,0 @@ -import os -from dataset_builder import MultilingualDataset -from optparse import OptionParser -from util.file import exists -import numpy as np -from sklearn.feature_extraction.text import CountVectorizer - -parser = OptionParser(usage="usage: %prog datapath [options]") - -(op, args) = parser.parse_args() -assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)' -dataset = args[0] -assert exists(dataset), 'Unable to find file '+str(dataset) - -dataset_file = os.path.basename(dataset) - -data = MultilingualDataset.load(dataset) -data.set_view(languages=['it']) -data.show_dimensions() -lXtr, lytr = data.training() -lXte, lyte = data.test() - -vect_lXtr = dict() -vectorizer = CountVectorizer() -vect_lXtr['it'] = vectorizer.fit_transform(lXtr['it']) -# print(type(vect_lXtr['it'])) - -corr = vect_lXtr['it'].T.dot(lytr['it']) -# print(corr.shape) -sum_correlated_class = corr.sum(axis=0) -print(len(sum_correlated_class)) -print(sum_correlated_class.max()) - - -w2idx = vectorizer.vocabulary_ -idx2w = {v:k for k,v in w2idx.items()} - -word_tot_corr = corr.sum(axis=1) -print(word_tot_corr.shape) -dict_word_tot_corr = {v:k for k,v in enumerate(word_tot_corr)} - -sorted_word_tot_corr = np.sort(word_tot_corr) -sorted_word_tot_corr = sorted_word_tot_corr[len(sorted_word_tot_corr)-200:] - -top_idx = [dict_word_tot_corr[k] for k in sorted_word_tot_corr] -print([idx2w[idx] for idx in top_idx]) -print([elem for elem in top_idx]) -print(corr[8709]) -print('Finished...') \ No newline at end of file diff --git a/src/experiment_scripts/run_combinations_jrc.sh b/src/experiment_scripts/run_combinations_jrc.sh deleted file mode 100644 index a4aabde..0000000 --- a/src/experiment_scripts/run_combinations_jrc.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle -logfile=./results/final_combinations_jrc.csv -#A.2: ensembling feature sets (combinations of posteriors, wce, muse): -# - exploring different ways of putting different feature sets together: concatenation, FeatureSetToPosteriors, averaging, voting, etc... 
-# (no one seems to improve over standard funnelling [the improved version after A.1] with posteriors probabilities...) - -# aggregation=concatenation -#python main_gFun.py $dataset -o $logfile -P -U -r -z --l2 -#python main_gFun.py $dataset -o $logfile -P -S -r -z --l2 -#python main_gFun.py $dataset -o $logfile -U -S -r -z --l2 -#python main_gFun.py $dataset -o $logfile -P -U -S -r -z --l2 -# - -##FeatureSetToPosteriors (aggregation mean) -python main_multimodal_cls.py $dataset -o $logfile -P -U -r -a -z --l2 --allprob -python main_multimodal_cls.py $dataset -o $logfile -P -S -r -a -z --l2 --allprob -python main_multimodal_cls.py $dataset -o $logfile -U -S -r -a -z --l2 --allprob -python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -a -z --l2 --allprob - -##FeatureSetToPosteriors -#python main_gFun.py $dataset -o $logfile -P -U -r -z --l2 --allprob -#python main_gFun.py $dataset -o $logfile -P -S -r -z --l2 --allprob -#python main_gFun.py $dataset -o $logfile -U -S -r -z --l2 --allprob -#python main_gFun.py $dataset -o $logfile -P -U -S -r -z --l2 --allprob - -#MajorityVoting -#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -r -#python main_majorityvoting_cls.py $dataset -o $logfile -P -S -r -#python main_majorityvoting_cls.py $dataset -o $logfile -U -S -r -#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -S -r - - diff --git a/src/experiment_scripts/run_combinations_rcv.sh b/src/experiment_scripts/run_combinations_rcv.sh deleted file mode 100644 index 4e1acfb..0000000 --- a/src/experiment_scripts/run_combinations_rcv.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash - -dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -logfile=./results/final_combinations_rcv.csv -#A.2: ensembling feature sets (combinations of posteriors, wce, muse): -# - exploring different ways of putting different feature sets together: concatenation, FeatureSetToPosteriors, averaging, voting, etc... -# (no one seems to improve over standard funnelling [the improved version after A.1] with posteriors probabilities...) 
- -# aggregation=concatenation -#python main_gFun.py $dataset -o $logfile -P -U -r -z --l2 -#python main_gFun.py $dataset -o $logfile -P -S -r -z --l2 -#python main_gFun.py $dataset -o $logfile -U -S -r -z --l2 -#python main_gFun.py $dataset -o $logfile -P -U -S -r -z --l2 -# -##FeatureSetToPosteriors (aggregation mean) -python main_multimodal_cls.py $dataset -o $logfile -P -U -r -a -z --l2 --allprob -python main_multimodal_cls.py $dataset -o $logfile -P -S -r -a -z --l2 --allprob -python main_multimodal_cls.py $dataset -o $logfile -U -S -r -a -z --l2 --allprob -python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -a -z --l2 --allprob - -##FeatureSetToPosteriors -#python main_gFun.py $dataset -o $logfile -P -U -r -z --l2 --allprob -#python main_gFun.py $dataset -o $logfile -P -S -r -z --l2 --allprob -#python main_gFun.py $dataset -o $logfile -U -S -r -z --l2 --allprob -#python main_gFun.py $dataset -o $logfile -P -U -S -r -z --l2 --allprob - -#MajorityVoting -#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -r -#python main_majorityvoting_cls.py $dataset -o $logfile -P -S -r -#python main_majorityvoting_cls.py $dataset -o $logfile -U -S -r -#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -S -r \ No newline at end of file diff --git a/src/experiment_scripts/run_dl_jrc.sh b/src/experiment_scripts/run_dl_jrc.sh deleted file mode 100644 index 1d28e83..0000000 --- a/src/experiment_scripts/run_dl_jrc.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash - -logfile=../log/log_pre_jrc.csv -dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle -python main_deep_learning.py $dataset --log-file $logfile --pretrained --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 128 --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 128 --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 256 --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 256 --tunable --plotmode --test-each 20 - -python main_deep_learning.py $dataset --log-file $logfile --supervised --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --supervised --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 128 --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 128 --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 256 --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 256 --tunable --plotmode --test-each 20 - -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 128 --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 128 --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile 
--pretrained --supervised --hidden 256 --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 256 --tunable --plotmode --test-each 20 - -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 128 --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 128 --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 256 --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 256 --tunable --plotmode --test-each 20 \ No newline at end of file diff --git a/src/experiment_scripts/run_dl_rcv.sh b/src/experiment_scripts/run_dl_rcv.sh deleted file mode 100644 index 4782887..0000000 --- a/src/experiment_scripts/run_dl_rcv.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env bash - -dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -python main_deep_learning.py $dataset --pretrained --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --hidden 128 --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --hidden 128 --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --hidden 256 --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --hidden 256 --tunable --plotmode --test-each 20 - -python main_deep_learning.py $dataset --supervised --plotmode --test-each 20 -python main_deep_learning.py $dataset --supervised --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --supervised --hidden 128 --plotmode --test-each 20 -python main_deep_learning.py $dataset --supervised --hidden 128 --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --supervised --hidden 256 --plotmode --test-each 20 -python main_deep_learning.py $dataset --supervised --hidden 256 --tunable --plotmode --test-each 20 - -python main_deep_learning.py $dataset --pretrained --supervised --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --hidden 128 --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --hidden 128 --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --hidden 256 --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --hidden 256 --tunable --plotmode --test-each 20 - -python main_deep_learning.py $dataset --pretrained --supervised --posteriors --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --posteriors --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 128 --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 128 --tunable 
--plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 256 --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 256 --tunable --plotmode --test-each 20 \ No newline at end of file diff --git a/src/experiment_scripts/run_fulljrc_dl.sh b/src/experiment_scripts/run_fulljrc_dl.sh deleted file mode 100644 index 4d5eeaa..0000000 --- a/src/experiment_scripts/run_fulljrc_dl.sh +++ /dev/null @@ -1,16 +0,0 @@ -dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle -seeds='5' #2 3 4 5 6 7 8 9 10' -for seed in $seeds -do - #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce.csv --supervised --seed $seed - #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_trainable.csv --supervised --tunable --seed $seed - python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_post_wce_muse_static.csv --posteriors --supervised --pretrained --seed $seed --force - - #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_muse.csv --pretrained --seed $seed - #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_muse_trainable.csv --pretrained --tunable --seed $seed - - #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_muse.csv --supervised --pretrained --seed $seed - #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_muse_trainable40000.csv --supervised --pretrained --tunable --seed $seed - #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_post_wce_muse_trainable.csv --posteriors --supervised --pretrained --tunable --seed $seed --force - -done \ No newline at end of file diff --git a/src/experiment_scripts/run_fullrcv_dl.sh b/src/experiment_scripts/run_fullrcv_dl.sh deleted file mode 100644 index 5894aef..0000000 --- a/src/experiment_scripts/run_fullrcv_dl.sh +++ /dev/null @@ -1,20 +0,0 @@ -dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle -seeds='1 ' #2 3 4 5' # 6 7 8 9 10' -for seed in $seeds -do - #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce.csv --supervised --seed $seed - #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_trainable.csv --supervised --tunable --seed $seed - python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_static_plotmode.csv --posteriors --supervised --pretrained --seed $seed --plotmode --test-each 200 - - - - #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_muse.csv --pretrained --seed $seed - #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_muse_trainable.csv --pretrained --tunable --seed $seed - - #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_muse.csv --supervised --pretrained --seed $seed - #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_muse_trainable.csv --supervised --pretrained --tunable --seed $seed - -# python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_static.csv --posteriors --supervised --pretrained --seed $seed -# python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_trainable_plotmode.csv --posteriors --supervised --pretrained --tunable --seed $seed --plotmode --test-each 200 - #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_trainable.csv --posteriors --supervised --pretrained --tunable --seed $seed 
-done \ No newline at end of file diff --git a/src/experiment_scripts/run_fun_bert_jrc.sh b/src/experiment_scripts/run_fun_bert_jrc.sh deleted file mode 100644 index fc2e2c3..0000000 --- a/src/experiment_scripts/run_fun_bert_jrc.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash - -#dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run -#logfile=../log/log_FunBert_jrc.csv -# -#runs='0 1 2 3 4' -#for run in $runs -#do -# dataset=$dataset_path$run.pickle -# python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile #--tunable -#done - -dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle -logfile=../log/log_FunBert_fulljrc_static.csv - -python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile \ No newline at end of file diff --git a/src/experiment_scripts/run_fun_bert_rcv.sh b/src/experiment_scripts/run_fun_bert_rcv.sh deleted file mode 100644 index e27fe54..0000000 --- a/src/experiment_scripts/run_fun_bert_rcv.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash - -#dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run -#logfile=../log/log_FunBert_rcv_static.csv -# -#runs='0 1 2 3 4' -#for run in $runs -#do -# dataset=$dataset_path$run.pickle -# python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile -#done - -dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle -logfile=../log/log_FunBert_fullrcv_static.csv - -python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile \ No newline at end of file diff --git a/src/experiment_scripts/run_mbert_jrc.sh b/src/experiment_scripts/run_mbert_jrc.sh deleted file mode 100644 index 08733a4..0000000 --- a/src/experiment_scripts/run_mbert_jrc.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env bash - -#dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run -#logfile=../log/log_mBert_jrc_NEW.csv -# -#runs='0 1 2 3 4' -#for run in $runs -#do -# dataset=$dataset_path$run.pickle -# python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50 -#done - -logfile=../log/log_mBert_fulljrc.csv -dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle -python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50 \ No newline at end of file diff --git a/src/experiment_scripts/run_mbert_rcv.sh b/src/experiment_scripts/run_mbert_rcv.sh deleted file mode 100644 index 66ffba1..0000000 --- a/src/experiment_scripts/run_mbert_rcv.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env bash - -#dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run -#logfile=../log/log_mBert_rcv_NEW.csv -# -#runs='0 1 2 3 4' -#for run in $runs -#do -# dataset=$dataset_path$run.pickle -# python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50 -#done - -logfile=../log/log_mBert_fullrcv.csv -dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle -python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=30 --patience 3 \ No newline at end of file diff --git a/src/experiment_scripts/run_traditional_jrc.sh b/src/experiment_scripts/run_traditional_jrc.sh deleted file mode 100644 index 460c9e8..0000000 --- a/src/experiment_scripts/run_traditional_jrc.sh +++ /dev/null @@ -1,45 
+0,0 @@ -#!/usr/bin/env bash - -dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle - -######################################## POSTERIORS - # Posteriors -python main_multimodal_cls.py $dataset -P # + zscore -python main_multimodal_cls.py $dataset -P -z # +l2norm -python main_multimodal_cls.py $dataset -P -z --l2 # +feature weight - - -######################################### WCE - #WCE supervised -python main_multimodal_cls.py $dataset -S # + zscore -python main_multimodal_cls.py $dataset -S -z # +l2norm -python main_multimodal_cls.py $dataset -S -z --l2 # +feature weight -python main_multimodal_cls.py $dataset -S -z -r --l2 # + SIF - PCA - -python main_multimodal_cls.py $dataset -S -z -p 250 --l2 # +feature weight + pca -python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 # + SIF - -python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight ig # -feature weight -python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight ig -python main_multimodal_cls.py $dataset -S -z -p 250 --l2 --feat-weight ig # + pca -python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 --feat-weight ig - - -python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight pmi -python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight pmi -python main_multimodal_cls.py $dataset -S -z -p 250 --l2 --feat-weight pmi -python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 --feat-weight pmi - -################################# MUSE - - # MUSE unsupervised -python main_multimodal_cls.py $dataset -U # + zscore -python main_multimodal_cls.py $dataset -U -z # +l2norm -python main_multimodal_cls.py $dataset -U -z --l2 # +feature weight -python main_multimodal_cls.py $dataset -U -z -r --l2 # + SIF - PCA - -python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight ig # -feature weight + pca -python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight ig - -python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight pmi -python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight pmi diff --git a/src/experiment_scripts/run_traditional_rcv.sh b/src/experiment_scripts/run_traditional_rcv.sh deleted file mode 100644 index 0dcfa2c..0000000 --- a/src/experiment_scripts/run_traditional_rcv.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env bash - -dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle - -######################################## POSTERIORS - # Posteriors -python main_multimodal_cls.py $dataset -P # + zscore -python main_multimodal_cls.py $dataset -P -z # +l2norm -python main_multimodal_cls.py $dataset -P -z --l2 # +feature weight - - -######################################### WCE - #WCE supervised -python main_multimodal_cls.py $dataset -S # + zscore -python main_multimodal_cls.py $dataset -S -z # +l2norm -python main_multimodal_cls.py $dataset -S -z --l2 # +feature weight -python main_multimodal_cls.py $dataset -S -z -r --l2 # + SIF - PCA - -python main_multimodal_cls.py $dataset -S -z -p 50 --l2 # +feature weight + pca -python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 # + SIF - -python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight ig # -feature weight -python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight ig -python main_multimodal_cls.py $dataset -S -z -p 50 --l2 --feat-weight ig # + pca -python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 --feat-weight ig - - -python main_multimodal_cls.py $dataset -S -z --l2 
--feat-weight pmi -python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight pmi -python main_multimodal_cls.py $dataset -S -z -p 50 --l2 --feat-weight pmi -python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 --feat-weight pmi - -################################# MUSE - - # MUSE unsupervised -python main_multimodal_cls.py $dataset -U # + zscore -python main_multimodal_cls.py $dataset -U -z # +l2norm -python main_multimodal_cls.py $dataset -U -z --l2 # +feature weight -python main_multimodal_cls.py $dataset -U -z -r --l2 # + SIF - PCA - -python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight ig # -feature weight + pca -python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight ig - -python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight pmi -python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight pmi diff --git a/src/experiment_scripts/time_comparison.sh b/src/experiment_scripts/time_comparison.sh deleted file mode 100644 index 60e1c25..0000000 --- a/src/experiment_scripts/time_comparison.sh +++ /dev/null @@ -1,6 +0,0 @@ -dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle -seeds='1 2 3 4 5 6 7 8 9 10' -for seed in $seeds -do - python main_deep_learning.py $dataset --log-file ../log/time_GRU.csv --supervised --nepochs 50 --seed $seed - done \ No newline at end of file diff --git a/src/funnelling.py b/src/funnelling.py new file mode 100644 index 0000000..ba2be1b --- /dev/null +++ b/src/funnelling.py @@ -0,0 +1,124 @@ +from src.models.learners import * +from src.util.common import _normalize +from src.view_generators import VanillaFunGen + + +class DocEmbedderList: + """ + Class that takes care of calling the fit and transform functions of every initialized embedder. Every ViewGenerator should be + contained by this class in order to seamlessly train the overall architecture. + """ + def __init__(self, embedder_list, probabilistic=True): + """ + Init the DocEmbedderList. + :param embedder_list: list of embedders to be deployed + :param probabilistic: whether or not to recast the view generators' outputs to vectors of posterior probabilities + """ + assert len(embedder_list) != 0, 'Embedder list cannot be empty!' + self.embedders = embedder_list + self.probabilistic = probabilistic + if probabilistic: + _tmp = [] + for embedder in self.embedders: + if isinstance(embedder, VanillaFunGen): + _tmp.append(embedder) + else: + _tmp.append(FeatureSet2Posteriors(embedder)) + self.embedders = _tmp + + def fit(self, lX, ly): + """ + Fit all the ViewGenerators contained by DocEmbedderList. + :param lX: + :param ly: + :return: self + """ + for embedder in self.embedders: + embedder.fit(lX, ly) + return self + + def transform(self, lX): + """ + Project documents by means of every ViewGenerator. Projections are then averaged together and returned. + :param lX: + :return: common latent space (averaged). + """ + langs = sorted(lX.keys()) + lZparts = {lang: None for lang in langs} + + for embedder in self.embedders: + lZ = embedder.transform(lX) + for lang in langs: + Z = lZ[lang] + if lZparts[lang] is None: + lZparts[lang] = Z + else: + lZparts[lang] += Z + n_embedders = len(self.embedders) + return {lang: lZparts[lang]/n_embedders for lang in langs} # Averaging feature spaces + + def fit_transform(self, lX, ly): + return self.fit(lX, ly).transform(lX) + + +class FeatureSet2Posteriors: + """ + Takes care of recasting the features produced by the embedders into vectors of posterior probabilities by means of + a multiclass SVM.
+ """ + def __init__(self, embedder, l2=True, n_jobs=-1): + """ + Init the class. + :param embedder: ViewGen, view generators which does not natively outputs posterior probabilities. + :param l2: bool, whether to apply or not L2 normalization to the projection + :param n_jobs: int, number of concurrent workers. + """ + self.embedder = embedder + self.l2 = l2 + self.n_jobs = n_jobs + self.prob_classifier = MetaClassifier( + SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs) + + def fit(self, lX, ly): + lZ = self.embedder.fit_transform(lX, ly) + self.prob_classifier.fit(lZ, ly) + return self + + def transform(self, lX): + lP = self.predict_proba(lX) + lP = _normalize(lP, self.l2) + return lP + + def fit_transform(self, lX, ly): + return self.fit(lX, ly).transform(lX) + + def predict(self, lX): + lZ = self.embedder.transform(lX) + return self.prob_classifier.predict(lZ) + + def predict_proba(self, lX): + lZ = self.embedder.transform(lX) + return self.prob_classifier.predict_proba(lZ) + + +class Funnelling: + """ + Funnelling Architecture. It is composed by two tiers. The first-tier is a set of heterogeneous document embedders. + The second-tier (i.e., the metaclassifier), operates the classification of the common latent space computed by + the first-tier learners. + """ + def __init__(self, first_tier: DocEmbedderList, meta_classifier: MetaClassifier, n_jobs=-1): + self.first_tier = first_tier + self.meta = meta_classifier + self.n_jobs = n_jobs + + def fit(self, lX, ly): + print('## Fitting first-tier learners!') + lZ = self.first_tier.fit_transform(lX, ly) + print('## Fitting meta-learner!') + self.meta.fit(lZ, ly) + + def predict(self, lX): + lZ = self.first_tier.transform(lX) + ly = self.meta.predict(lZ) + return ly diff --git a/src/learning/transformers.py b/src/learning/transformers.py deleted file mode 100644 index e6c5194..0000000 --- a/src/learning/transformers.py +++ /dev/null @@ -1,849 +0,0 @@ -from torch.optim.lr_scheduler import StepLR -from torch.utils.data import DataLoader -from data.tsr_function__ import get_tsr_matrix, get_supervised_matrix, pointwise_mutual_information, information_gain -from embeddings.embeddings import FastTextMUSE -from embeddings.supervised import supervised_embeddings_tfidf, zscores -from learning.learners import NaivePolylingualClassifier, MonolingualClassifier, _joblib_transform_multiling -from sklearn.decomposition import PCA -from scipy.sparse import hstack -from util_transformers.StandardizeTransformer import StandardizeTransformer -from util.SIF_embed import remove_pc -from sklearn.preprocessing import normalize -from scipy.sparse import csr_matrix -from models.mBert import * -from models.lstm_class import * -from util.csv_log import CSVLog -from util.file import get_file_name -from util.early_stop import EarlyStopping -from util.common import * -import time - - -# ------------------------------------------------------------------ -# Data Processing -# ------------------------------------------------------------------ - - -class FeatureWeight: - - def __init__(self, weight='tfidf', agg='mean'): - assert weight in ['tfidf', 'pmi', 'ig'] or callable( - weight), 'weight should either be "tfidf" or a callable function' - assert agg in ['mean', 'max'], 'aggregation function should either be "mean" or "max"' - self.weight = weight - self.agg = agg - self.fitted = False - if weight == 'pmi': - self.weight = pointwise_mutual_information - elif weight == 'ig': - self.weight = information_gain - - def 
fit(self, lX, ly): - if not self.fitted: - if self.weight == 'tfidf': - self.lF = {l: np.ones(X.shape[1]) for l, X in lX.items()} - else: - self.lF = {} - for l in lX.keys(): - X, y = lX[l], ly[l] - - print(f'getting supervised cell-matrix lang {l}') - tsr_matrix = get_tsr_matrix(get_supervised_matrix(X, y), tsr_score_funtion=self.weight) - if self.agg == 'max': - F = tsr_matrix.max(axis=0) - elif self.agg == 'mean': - F = tsr_matrix.mean(axis=0) - self.lF[l] = F - - self.fitted = True - return self - - def transform(self, lX): - return {lang: csr_matrix.multiply(lX[lang], self.lF[lang]) for lang in lX.keys()} - - def fit_transform(self, lX, ly): - return self.fit(lX, ly).transform(lX) - -# ------------------------------------------------------------------ -# View Generators (aka first-tier learners) -# ------------------------------------------------------------------ - - -class PosteriorProbabilitiesEmbedder: - - def __init__(self, first_tier_learner, first_tier_parameters=None, l2=True, n_jobs=-1): - self.fist_tier_learner = first_tier_learner - self.fist_tier_parameters = first_tier_parameters - self.l2 = l2 - self.n_jobs = n_jobs - self.doc_projector = NaivePolylingualClassifier( - self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs - ) - self.requires_tfidf = True - - def fit(self, lX, lY, lV=None, called_by_viewgen=False): - if not called_by_viewgen: - # Avoid printing if method is called by another View Gen (e.g., GRU ViewGen) - print('### Posterior Probabilities View Generator (X)') - print('fitting the projectors... {}'.format(lX.keys())) - self.doc_projector.fit(lX, lY) - return self - - def transform(self, lX): - lZ = self.predict_proba(lX) - lZ = _normalize(lZ, self.l2) - return lZ - - def fit_transform(self, lX, ly=None, lV=None): - return self.fit(lX, ly).transform(lX) - - def best_params(self): - return self.doc_projector.best_params() - - def predict(self, lX, ly=None): - return self.doc_projector.predict(lX) - - def predict_proba(self, lX, ly=None): - print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} documents') - return self.doc_projector.predict_proba(lX) - - def _get_output_dim(self): - return len(self.doc_projector.model['da'].model.classes_) - - -class MuseEmbedder: - - def __init__(self, path, lV=None, l2=True, n_jobs=-1, featureweight=FeatureWeight(), sif=False): - self.path = path - self.lV = lV - self.l2 = l2 - self.n_jobs = n_jobs - self.featureweight = featureweight - self.sif = sif - self.requires_tfidf = True - - def fit(self, lX, ly, lV=None): - assert lV is not None or self.lV is not None, 'lV not specified' - print('### MUSE View Generator (M)') - print(f'Loading fastText pretrained vectors for languages {list(lX.keys())}...') - self.langs = sorted(lX.keys()) - self.MUSE = load_muse_embeddings(self.path, self.langs, self.n_jobs) - lWordList = {l: self._get_wordlist_from_word2index(lV[l]) for l in self.langs} - self.MUSE = {l: Muse.extract(lWordList[l]).numpy() for l, Muse in self.MUSE.items()} - self.featureweight.fit(lX, ly) - return self - - def transform(self, lX): - MUSE = self.MUSE - lX = self.featureweight.transform(lX) - XdotMUSE = Parallel(n_jobs=self.n_jobs)( - delayed(XdotM)(lX[lang], MUSE[lang], self.sif) for lang in self.langs - ) - lMuse = {l: XdotMUSE[i] for i, l in enumerate(self.langs)} - lMuse = _normalize(lMuse, self.l2) - return lMuse - - def fit_transform(self, lX, ly, lV): - return self.fit(lX, ly, lV).transform(lX) - - def _get_wordlist_from_word2index(self, word2index): - return 
list(zip(*sorted(word2index.items(), key=lambda x: x[1])))[0] - - def _get_output_dim(self): - return self.MUSE['da'].shape[1] - - -class WordClassEmbedder: - - def __init__(self, l2=True, n_jobs=-1, max_label_space=300, featureweight=FeatureWeight(), sif=False): - self.n_jobs = n_jobs - self.l2 = l2 - self.max_label_space = max_label_space - self.featureweight = featureweight - self.sif = sif - self.requires_tfidf = True - - def fit(self, lX, ly, lV=None): - print('### WCE View Generator (M)') - print('Computing supervised embeddings...') - self.langs = sorted(lX.keys()) - WCE = Parallel(n_jobs=self.n_jobs)( - delayed(word_class_embedding_matrix)(lX[lang], ly[lang], self.max_label_space) for lang in self.langs - ) - self.lWCE = {l: WCE[i] for i, l in enumerate(self.langs)} - self.featureweight.fit(lX, ly) - return self - - def transform(self, lX): - lWCE = self.lWCE - lX = self.featureweight.transform(lX) - XdotWCE = Parallel(n_jobs=self.n_jobs)( - delayed(XdotM)(lX[lang], lWCE[lang], self.sif) for lang in self.langs - ) - lwce = {l: XdotWCE[i] for i, l in enumerate(self.langs)} - lwce = _normalize(lwce, self.l2) - return lwce - - def fit_transform(self, lX, ly, lV=None): - return self.fit(lX, ly).transform(lX) - - def _get_output_dim(self): - return 73 # TODO ! - - -class MBertEmbedder: - - def __init__(self, doc_embed_path=None, patience=10, checkpoint_dir='../hug_checkpoint/', path_to_model=None, - nC=None): - self.doc_embed_path = doc_embed_path - self.patience = patience - self.checkpoint_dir = checkpoint_dir - self.fitted = False - self.requires_tfidf = False - if path_to_model is None and nC is not None: - self.model = None - else: - config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True, - num_labels=nC) - self.model = BertForSequenceClassification.from_pretrained(path_to_model, config=config).cuda() - self.fitted = True - - def fit(self, lX, ly, lV=None, seed=0, nepochs=200, lr=1e-5, val_epochs=1): - print('### mBERT View Generator (B)') - if self.fitted is True: - print('Bert model already fitted!') - return self - - print('Fine-tune mBert on the given dataset.') - l_tokenized_tr = do_tokenization(lX, max_len=512) - l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = get_tr_val_split(l_tokenized_tr, ly, - val_prop=0.2, max_val=2000, - seed=seed) # TODO: seed - - tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target) - va_dataset = TrainingDataset(l_split_va, l_split_val_target) - tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True) - va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=True) - - nC = tr_dataset.get_nclasses() - model = get_model(nC) - model = model.cuda() - criterion = torch.nn.BCEWithLogitsLoss().cuda() - optim = init_optimizer(model, lr=lr, weight_decay=0.01) - lr_scheduler = StepLR(optim, step_size=25, gamma=0.1) - early_stop = EarlyStopping(model, optimizer=optim, patience=self.patience, - checkpoint=self.checkpoint_dir, - is_bert=True) - - # Training loop - logfile = '../log/log_mBert_extractor.csv' - method_name = 'mBert_feature_extractor' - - tinit = time() - lang_ids = va_dataset.lang_ids - for epoch in range(1, nepochs + 1): - print('# Start Training ...') - train(model, tr_dataloader, epoch, criterion, optim, method_name, tinit, logfile) - lr_scheduler.step() # reduces the learning rate # TODO arg epoch? 
- - # Validation - macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va') - early_stop(macrof1, epoch) - - if early_stop.STOP: - print('[early-stop] STOP') - break - - model = early_stop.restore_checkpoint() - self.model = model.cuda() - - if val_epochs > 0: - print(f'running last {val_epochs} training epochs on the validation set') - for val_epoch in range(1, val_epochs + 1): - train(self.model, va_dataloader, epoch + val_epoch, criterion, optim, method_name, tinit, logfile) - - self.fitted = True - return self - - def transform(self, lX): - assert self.fitted is True, 'Calling transform without any initialized model! - call init first or on init' \ - 'pass the "path_to_model" arg.' - print('Obtaining document embeddings from pretrained mBert ') - l_tokenized_X = do_tokenization(lX, max_len=512, verbose=True) - feat_dataset = ExtractorDataset(l_tokenized_X) - feat_lang_ids = feat_dataset.lang_ids - dataloader = DataLoader(feat_dataset, batch_size=64) - all_batch_embeddings, id2lang = feature_extractor(dataloader, feat_lang_ids, self.model) - return all_batch_embeddings - - def fit_transform(self, lX, ly, lV=None): - return self.fit(lX, ly).transform(lX) - - -class RecurrentEmbedder: - - def __init__(self, pretrained, supervised, multilingual_dataset, options, concat=False, lr=1e-3, - we_path='../embeddings', hidden_size=512, sup_drop=0.5, posteriors=False, patience=10, - test_each=0, checkpoint_dir='../checkpoint', model_path=None): - self.pretrained = pretrained - self.supervised = supervised - self.concat = concat - self.requires_tfidf = False - self.multilingual_dataset = multilingual_dataset - self.model = None - self.we_path = we_path - self.langs = multilingual_dataset.langs() - self.hidden_size = hidden_size - self.sup_drop = sup_drop - self.posteriors = posteriors - self.patience = patience - self.checkpoint_dir = checkpoint_dir - self.test_each = test_each - self.options = options - self.seed = options.seed - self.is_trained = False - - ## INIT MODEL for training - self.lXtr, self.lytr = self.multilingual_dataset.training(target_as_csr=True) - self.lXte, self.lyte = self.multilingual_dataset.test(target_as_csr=True) - self.nC = self.lyte[self.langs[0]].shape[1] - lpretrained, lpretrained_vocabulary = self._load_pretrained_embeddings(self.we_path, self.langs) - self.multilingual_index = MultilingualIndex() - self.multilingual_index.index(self.lXtr, self.lytr, self.lXte, lpretrained_vocabulary) - self.multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=self.seed) - self.multilingual_index.embedding_matrices(lpretrained, self.supervised) - - if model_path is not None: - self.is_trained = True - self.model = torch.load(model_path) - else: - self.model = self._init_Net() - - self.optim = init_optimizer(self.model, lr=lr) - self.criterion = torch.nn.BCEWithLogitsLoss().cuda() - self.lr_scheduler = StepLR(self.optim, step_size=25, gamma=0.5) - self.early_stop = EarlyStopping(self.model, optimizer=self.optim, patience=self.patience, - checkpoint=f'{self.checkpoint_dir}/gru_viewgen_-{get_file_name(self.options.dataset)}') - # Init SVM in order to recast (vstacked) document embeddings to vectors of Posterior Probabilities - self.posteriorEmbedder = MetaClassifier( - SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=options.n_jobs) - - def fit(self, lX, ly, lV=None, batch_size=64, nepochs=200, val_epochs=1): - print('### Gated Recurrent Unit View Generator (G)') - # could be better to init model here at 
first .fit() call! - if self.model is None: - print('TODO: Init model!') - if not self.is_trained: - # Batchify input - self.multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=self.seed) - l_train_index, l_train_target = self.multilingual_index.l_train() - l_val_index, l_val_target = self.multilingual_index.l_val() - l_test_index = self.multilingual_index.l_test_index() - batcher_train = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs, - lpad=self.multilingual_index.l_pad()) - batcher_eval = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs, - lpad=self.multilingual_index.l_pad()) - - # Train loop - print('Start training') - method_name = 'gru_view_generator' - logfile = init_logfile_nn(method_name, self.options) - tinit = time.time() - for epoch in range(1, nepochs + 1): - train_gru(model=self.model, batcher=batcher_train, ltrain_index=l_train_index, lytr=l_train_target, - tinit=tinit, logfile=logfile, criterion=self.criterion, optim=self.optim, - epoch=epoch, method_name=method_name, opt=self.options, ltrain_posteriors=None, - ltrain_bert=None) - self.lr_scheduler.step() # reduces the learning rate # TODO arg epoch? - - # validation step - macrof1 = test_gru(self.model, batcher_eval, l_val_index, None, None, l_val_target, tinit, epoch, - logfile, self.criterion, 'va') - - self.early_stop(macrof1, epoch) - if self.test_each > 0: - test_gru(self.model, batcher_eval, l_test_index, None, None, self.lyte, tinit, epoch, - logfile, self.criterion, 'te') - - if self.early_stop.STOP: - print('[early-stop] STOP') - print('Restoring best model...') - break - - self.model = self.early_stop.restore_checkpoint() - print(f'running last {val_epochs} training epochs on the validation set') - for val_epoch in range(1, val_epochs+1): - batcher_train.init_offset() - train_gru(model=self.model, batcher=batcher_train, ltrain_index=l_train_index, lytr=l_train_target, - tinit=tinit, logfile=logfile, criterion=self.criterion, optim=self.optim, - epoch=epoch, method_name=method_name, opt=self.options, ltrain_posteriors=None, - ltrain_bert=None) - self.is_trained = True - - # Generate document embeddings in order to fit an SVM to recast them as vector for Posterior Probabilities - lX = self._get_doc_embeddings(lX) - # Fit a ''multi-lingual'' SVM on the generated doc embeddings - self.posteriorEmbedder.fit(lX, ly) - return self - - def transform(self, lX, batch_size=64): - lX = self._get_doc_embeddings(lX) - return self.posteriorEmbedder.predict_proba(lX) - - def fit_transform(self, lX, ly, lV=None): - # TODO - return 0 - - def _get_doc_embeddings(self, lX, batch_size=64): - assert self.is_trained, 'Model is not trained, cannot call transform before fitting the model!' 
- print('Generating document embeddings via GRU') - data = {} - for lang in lX.keys(): - indexed = index(data=lX[lang], - vocab=self.multilingual_index.l_index[lang].word2index, - known_words=set(self.multilingual_index.l_index[lang].word2index.keys()), - analyzer=self.multilingual_index.l_vectorizer.get_analyzer(lang), - unk_index=self.multilingual_index.l_index[lang].unk_index, - out_of_vocabulary=self.multilingual_index.l_index[lang].out_of_vocabulary) - data[lang] = indexed - - lX = {} - ly = {} - batcher_transform = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs, - lpad=self.multilingual_index.l_pad()) - - # l_devel_index = self.multilingual_index.l_devel_index() - - l_devel_target = self.multilingual_index.l_devel_target() - l_devel_target = {k: v[:len(data[k])] for k, v in l_devel_target.items()} # todo -> debug - for batch, _, target, lang, in batchify(l_index=data, - l_post=None, - llabels=l_devel_target, - batchsize=batch_size, - lpad=self.multilingual_index.l_pad()): - # for idx, (batch, post, bert_emb, target, lang) in enumerate( - # batcher_transform.batchify(l_devel_index, None, None, l_devel_target)): - # for idx, (batch, post, bert_emb, target, lang) in enumerate( - # batcher_transform.batchify(data, None, None, l_devel_target)): - if lang not in lX.keys(): - lX[lang] = self.model.get_embeddings(batch, lang) - ly[lang] = target.cpu().detach().numpy() - else: - lX[lang] = np.concatenate((lX[lang], self.model.get_embeddings(batch, lang)), axis=0) - ly[lang] = np.concatenate((ly[lang], target.cpu().detach().numpy()), axis=0) - - return lX - - # loads the MUSE embeddings if requested, or returns empty dictionaries otherwise - def _load_pretrained_embeddings(self, we_path, langs): - lpretrained = lpretrained_vocabulary = self._none_dict(langs) # TODO ? 
- lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1) - lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs} - return lpretrained, lpretrained_vocabulary - - def _none_dict(self, langs): - return {l:None for l in langs} - - # instantiates the net, initializes the model parameters, and sets embeddings trainable if requested - def _init_Net(self, xavier_uniform=True): - model = RNNMultilingualClassifier( - output_size=self.nC, - hidden_size=self.hidden_size, - lvocab_size=self.multilingual_index.l_vocabsize(), - learnable_length=0, - lpretrained=self.multilingual_index.l_embeddings(), - drop_embedding_range=self.multilingual_index.sup_range, - drop_embedding_prop=self.sup_drop, - post_probabilities=self.posteriors - ) - return model.cuda() - - -class DocEmbedderList: - - def __init__(self, *embedder_list, aggregation='concat'): - assert aggregation in {'concat', 'mean'}, 'unknown aggregation mode, valid are "concat" and "mean"' - if len(embedder_list) == 0: - embedder_list = [] - self.embedders = embedder_list - self.aggregation = aggregation - print(f'Aggregation mode: {self.aggregation}') - - def fit(self, lX, ly, lV=None, tfidf=None): - for transformer in self.embedders: - _lX = lX - if transformer.requires_tfidf: - _lX = tfidf - transformer.fit(_lX, ly, lV) - return self - - def transform(self, lX, tfidf=None): - if self.aggregation == 'concat': - return self.transform_concat(lX, tfidf) - elif self.aggregation == 'mean': - return self.transform_mean(lX, tfidf) - - def transform_concat(self, lX, tfidf): - if len(self.embedders) == 1: - if self.embedders[0].requires_tfidf: - lX = tfidf - return self.embedders[0].transform(lX) - - some_sparse = False - langs = sorted(lX.keys()) - - lZparts = {l: [] for l in langs} - for transformer in self.embedders: - _lX = lX - if transformer.requires_tfidf: - _lX = tfidf - lZ = transformer.transform(_lX) - for l in langs: - Z = lZ[l] - some_sparse = some_sparse or issparse(Z) - lZparts[l].append(Z) - - hstacker = hstack if some_sparse else np.hstack - return {l: hstacker(lZparts[l]) for l in langs} - - def transform_mean(self, lX, tfidf): - if len(self.embedders) == 1: - return self.embedders[0].transform(lX) - - langs = sorted(lX.keys()) - - lZparts = {l: None for l in langs} - - # min_dim = min([transformer._get_output_dim() for transformer in self.embedders]) - min_dim = 73 # TODO <---- this should be the number of target classes - - for transformer in self.embedders: - _lX = lX - if transformer.requires_tfidf: - _lX = tfidf - lZ = transformer.transform(_lX) - nC = min([lZ[lang].shape[1] for lang in langs]) - for l in langs: - Z = lZ[l] - if Z.shape[1] > min_dim: - print( - f'Space Z matrix has more dimensions ({Z.shape[1]}) than the smallest representation {min_dim}.' 
- f'Applying PCA(n_components={min_dim})') - pca = PCA(n_components=min_dim) - Z = pca.fit(Z).transform(Z) - if lZparts[l] is None: - lZparts[l] = Z - else: - lZparts[l] += Z - - n_transformers = len(self.embedders) - - return {l: lZparts[l] / n_transformers for l in langs} - - def fit_transform(self, lX, ly, lV=None, tfidf=None): - return self.fit(lX, ly, lV, tfidf).transform(lX, tfidf) - - def best_params(self): - return {'todo'} - - def append(self, embedder): - self.embedders.append(embedder) - - -class FeatureSet2Posteriors: - def __init__(self, transformer, requires_tfidf=False, l2=True, n_jobs=-1): - self.transformer = transformer - self.l2 = l2 - self.n_jobs = n_jobs - self.prob_classifier = MetaClassifier( - SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs) - self.requires_tfidf = requires_tfidf - - def fit(self, lX, ly, lV=None): - if lV is None and hasattr(self.transformer, 'lV'): - lV = self.transformer.lV - lZ = self.transformer.fit_transform(lX, ly, lV) - self.prob_classifier.fit(lZ, ly) - return self - - def transform(self, lX): - lP = self.predict_proba(lX) - lP = _normalize(lP, self.l2) - return lP - - def fit_transform(self, lX, ly, lV): - return self.fit(lX, ly, lV).transform(lX) - - def predict(self, lX, ly=None): - lZ = self.transformer.transform(lX) - return self.prob_classifier.predict(lZ) - - def predict_proba(self, lX, ly=None): - lZ = self.transformer.transform(lX) - return self.prob_classifier.predict_proba(lZ) - - -# ------------------------------------------------------------------ -# Meta-Classifier (aka second-tier learner) -# ------------------------------------------------------------------ -class MetaClassifier: - - def __init__(self, meta_learner, meta_parameters=None, n_jobs=-1, standardize_range=None): - self.n_jobs = n_jobs - self.model = MonolingualClassifier(base_learner=meta_learner, parameters=meta_parameters, n_jobs=n_jobs) - self.standardize_range = standardize_range - - def fit(self, lZ, ly): - tinit = time.time() - Z, y = self.stack(lZ, ly) - - self.standardizer = StandardizeTransformer(range=self.standardize_range) - Z = self.standardizer.fit_transform(Z) - - print('fitting the Z-space of shape={}'.format(Z.shape)) - self.model.fit(Z, y) - self.time = time.time() - tinit - - def stack(self, lZ, ly=None): - langs = list(lZ.keys()) - Z = np.vstack([lZ[lang] for lang in langs]) # Z is the language independent space - if ly is not None: - y = np.vstack([ly[lang] for lang in langs]) - return Z, y - else: - return Z - - def predict(self, lZ, ly=None): - lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs) - return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs) - - def predict_proba(self, lZ, ly=None): - lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs) - return _joblib_transform_multiling(self.model.predict_proba, lZ, n_jobs=self.n_jobs) - - def best_params(self): - return self.model.best_params() - - -# ------------------------------------------------------------------ -# Ensembling (aka Funnelling) -# ------------------------------------------------------------------ -class Funnelling: - def __init__(self, - vectorizer: TfidfVectorizerMultilingual, - first_tier: DocEmbedderList, - meta: MetaClassifier): - self.vectorizer = vectorizer - self.first_tier = first_tier - self.meta = meta - self.n_jobs = meta.n_jobs - - def fit(self, lX, ly): - tfidf_lX = self.vectorizer.fit_transform(lX, ly) - lV = 
self.vectorizer.vocabulary() - print('## Fitting first-tier learners!') - lZ = self.first_tier.fit_transform(lX, ly, lV, tfidf=tfidf_lX) - print('## Fitting meta-learner!') - self.meta.fit(lZ, ly) - - def predict(self, lX, ly=None): - tfidf_lX = self.vectorizer.transform(lX) - lZ = self.first_tier.transform(lX, tfidf=tfidf_lX) - ly_ = self.meta.predict(lZ) - return ly_ - - def best_params(self): - return {'1st-tier': self.first_tier.best_params(), - 'meta': self.meta.best_params()} - - -class Voting: - def __init__(self, *prob_classifiers): - assert all([hasattr(p, 'predict_proba') for p in prob_classifiers]), 'not all classifiers are probabilistic' - self.prob_classifiers = prob_classifiers - - def fit(self, lX, ly, lV=None): - for classifier in self.prob_classifiers: - classifier.fit(lX, ly, lV) - - def predict(self, lX, ly=None): - lP = {l: [] for l in lX.keys()} - for classifier in self.prob_classifiers: - lPi = classifier.predict_proba(lX) - for l in lX.keys(): - lP[l].append(lPi[l]) - - lP = {l: np.stack(Plist).mean(axis=0) for l, Plist in lP.items()} - ly = {l: P > 0.5 for l, P in lP.items()} - - return ly - - -# ------------------------------------------------------------------------------ -# HELPERS -# ------------------------------------------------------------------------------ - -def load_muse_embeddings(we_path, langs, n_jobs=-1): - MUSE = Parallel(n_jobs=n_jobs)( - delayed(FastTextMUSE)(we_path, lang) for lang in langs - ) - return {l: MUSE[i] for i, l in enumerate(langs)} - - -def word_class_embedding_matrix(X, Y, max_label_space=300): - WCE = supervised_embeddings_tfidf(X, Y) - WCE = zscores(WCE, axis=0) - - nC = Y.shape[1] - if nC > max_label_space: - print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. 
' - f'Applying PCA(n_components={max_label_space})') - pca = PCA(n_components=max_label_space) - WCE = pca.fit(WCE).transform(WCE) - - return WCE - - -def XdotM(X, M, sif): - E = X.dot(M) - if sif: - print("removing pc...") - E = remove_pc(E, npc=1) - return E - - -def _normalize(lX, l2=True): - return {l: normalize(X) for l, X in lX.items()} if l2 else lX - - -class BatchGRU: - def __init__(self, batchsize, batches_per_epoch, languages, lpad, max_pad_length=500): - self.batchsize = batchsize - self.batches_per_epoch = batches_per_epoch - self.languages = languages - self.lpad=lpad - self.max_pad_length=max_pad_length - self.init_offset() - - def init_offset(self): - self.offset = {lang: 0 for lang in self.languages} - - def batchify(self, l_index, l_post, l_bert, llabels): - langs = self.languages - l_num_samples = {l:len(l_index[l]) for l in langs} - - max_samples = max(l_num_samples.values()) - n_batches = max_samples // self.batchsize + 1 * (max_samples % self.batchsize > 0) - if self.batches_per_epoch != -1 and self.batches_per_epoch < n_batches: - n_batches = self.batches_per_epoch - - for b in range(n_batches): - for lang in langs: - index, labels = l_index[lang], llabels[lang] - offset = self.offset[lang] - if offset >= l_num_samples[lang]: - offset = 0 - limit = offset+self.batchsize - - batch_slice = slice(offset, limit) - batch = index[batch_slice] - batch_labels = labels[batch_slice].toarray() - - post = None - bert_emb = None - - batch = pad(batch, pad_index=self.lpad[lang], max_pad_length=self.max_pad_length) - batch = torch.LongTensor(batch).cuda() - target = torch.FloatTensor(batch_labels).cuda() - - self.offset[lang] = limit - - yield batch, post, bert_emb, target, lang - - -def pad(index_list, pad_index, max_pad_length=None): - pad_length = np.max([len(index) for index in index_list]) - if max_pad_length is not None: - pad_length = min(pad_length, max_pad_length) - for i,indexes in enumerate(index_list): - index_list[i] = [pad_index]*(pad_length-len(indexes)) + indexes[:pad_length] - return index_list - - -def train_gru(model, batcher, ltrain_index, lytr, tinit, logfile, criterion, optim, epoch, method_name, opt, - ltrain_posteriors=None, ltrain_bert=None, log_interval=10): - _dataset_path = opt.dataset.split('/')[-1].split('_') - dataset_id = _dataset_path[0] + _dataset_path[-1] - - loss_history = [] - model.train() - for idx, (batch, post, bert_emb, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, ltrain_bert, lytr)): - optim.zero_grad() - loss = criterion(model(batch, post, bert_emb, lang), target) - loss.backward() - clip_gradient(model) - optim.step() - loss_history.append(loss.item()) - - if idx % log_interval == 0: - interval_loss = np.mean(loss_history[-log_interval:]) - print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, ' - f'Training Loss: {interval_loss:.6f}') - - mean_loss = np.mean(interval_loss) - logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time.time() - tinit) - return mean_loss - - -def test_gru(model, batcher, ltest_index, ltest_posteriors, lte_bert, lyte, tinit, epoch, logfile, criterion, measure_prefix): - loss_history = [] - model.eval() - langs = sorted(ltest_index.keys()) - predictions = {l: [] for l in langs} - yte_stacked = {l: [] for l in langs} - batcher.init_offset() - for batch, post, bert_emb, target, lang in tqdm(batcher.batchify(ltest_index, ltest_posteriors, lte_bert, lyte), - desc='evaluation: '): - logits = model(batch, post, bert_emb, lang) - 
loss = criterion(logits, target).item() - prediction = predict(logits) - predictions[lang].append(prediction) - yte_stacked[lang].append(target.detach().cpu().numpy()) - loss_history.append(loss) - - ly = {l:np.vstack(yte_stacked[l]) for l in langs} - ly_ = {l:np.vstack(predictions[l]) for l in langs} - l_eval = evaluate(ly, ly_) - metrics = [] - for lang in langs: - macrof1, microf1, macrok, microk = l_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - if measure_prefix == 'te': - print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') - Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0) - print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]') - - mean_loss = np.mean(loss_history) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time.time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time.time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time.time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time.time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time.time() - tinit) - - return Mf1 - - -def clip_gradient(model, clip_value=1e-1): - params = list(filter(lambda p: p.grad is not None, model.parameters())) - for p in params: - p.grad.data.clamp_(-clip_value, clip_value) - - -def init_logfile_nn(method_name, opt): - logfile = CSVLog(opt.logfile_gru, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse']) - logfile.set_default('dataset', opt.dataset) - logfile.set_default('run', opt.seed) - logfile.set_default('method', method_name) - assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} ' \ - f'and run {opt.seed} already calculated' - return logfile diff --git a/src/main_gFun.py b/src/main_gFun.py deleted file mode 100644 index 65ed2b9..0000000 --- a/src/main_gFun.py +++ /dev/null @@ -1,144 +0,0 @@ -import os -from dataset_builder import MultilingualDataset -from learning.transformers import * -from util.evaluation import * -from util.file import exists -from util.results import PolylingualClassificationResults -from util.common import * -from util.parser_options import * - -if __name__ == '__main__': - (op, args) = parser.parse_args() - dataset = op.dataset - assert exists(dataset), 'Unable to find file '+str(dataset) - assert not (op.set_c != 1. 
and op.optimc), 'Parameter C cannot be defined along with optim_c option' - assert op.posteriors or op.supervised or op.pretrained or op.mbert or op.gruViewGenerator, \ - 'empty set of document embeddings is not allowed' - assert (op.gruWCE or op.gruMUSE) and op.gruViewGenerator, 'Initializing Gated Recurrent embedding layer without ' \ - 'explicit initialization of GRU View Generator' - - l2 = op.l2 - dataset_file = os.path.basename(dataset) - results = PolylingualClassificationResults('../log/' + op.output) - allprob = 'Prob' if op.allprob else '' - - # renaming arguments to be printed on log - method_name, dataset_name = get_method_name(dataset, op.posteriors, op.supervised, op.pretrained, op.mbert, - op.gruViewGenerator, op.gruMUSE, op.gruWCE, op.agg, op.allprob) - print(f'Method: gFun{method_name}\nDataset: {dataset_name}') - print('-'*50) - - # set zscore range - is slice(0, 0) mean will be equal to 0 and std to 1, thus normalization will have no effect - standardize_range = slice(0, 0) - if op.zscore: - standardize_range = None - - # load dataset - data = MultilingualDataset.load(dataset) - data.set_view(languages=['nl', 'it']) # TODO: DEBUG SETTING - data.show_dimensions() - lXtr, lytr = data.training() - lXte, lyte = data.test() - - # text preprocessing - tfidfvectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) - - # feature weighting (for word embeddings average) - feat_weighting = FeatureWeight(op.feat_weight, agg='mean') - - # document embedding modules aka View Generators - doc_embedder = DocEmbedderList(aggregation='mean' if op.agg else 'concat') - - # init View Generators - if op.posteriors: - """ - View Generator (-X): cast document representations encoded via TFIDF into posterior probabilities by means - of a set of SVM. - """ - doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True, - kernel='linear', - C=op.set_c), l2=l2)) - - if op.supervised: - """ - View Generator (-W): generates document representation via Word-Class-Embeddings. - Document embeddings are obtained via weighted sum of document's constituent embeddings. - """ - wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, sif=op.sif) - if op.allprob: - wce = FeatureSet2Posteriors(wce, requires_tfidf=True, l2=l2) - doc_embedder.append(wce) - - if op.pretrained: - """ - View Generator (-M): generates document representation via MUSE embeddings (Fasttext multilingual word - embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings. - """ - muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif) - if op.allprob: - muse = FeatureSet2Posteriors(muse, requires_tfidf=True, l2=l2) - doc_embedder.append(muse) - - if op.gruViewGenerator: - """ - View Generator (-G): generates document embedding by means of a Gated Recurrent Units. The model can be - initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,). Such - document embeddings are then casted into vectors of posterior probabilities via a set of SVM. 
- NB: --allprob won't have any effect on this View Gen since output is already encoded as post prob - """ - op.gru_path = '/home/andreapdr/funneling_pdr/checkpoint/gru_viewgen_-rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle' # TODO DEBUG - op.gru_path = None - rnn_embedder = RecurrentEmbedder(pretrained=op.gruMUSE, supervised=op.gruWCE, multilingual_dataset=data, - options=op, model_path=op.gru_path, we_path=op.we_path) - doc_embedder.append(rnn_embedder) - - if op.mbert: - """ - View generator (-B): generates document embedding via mBERT model. - """ - op.bert_path = '/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-rcv1-2_run0' # TODO DEBUG - mbert = MBertEmbedder(path_to_model=op.bert_path, - nC=data.num_categories()) - if op.allprob: - mbert = FeatureSet2Posteriors(mbert, l2=l2) - doc_embedder.append(mbert) - - # metaclassifier - meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}] - meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf', C=op.set_c), - meta_parameters=get_params(op.optimc), standardize_range=standardize_range) - - # ensembling the modules - classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta) - - print('\n# Fitting Funnelling Architecture...') - tinit = time.time() - classifier.fit(lXtr, lytr) - time = time.time()-tinit - - print('\n# Evaluating ...') - l_eval = evaluate_method(classifier, lXte, lyte) - - metrics = [] - for lang in lXte.keys(): - macrof1, microf1, macrok, microk = l_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') - results.add_row(method='MultiModal', - learner='SVM', - optimp=op.optimc, - sif=op.sif, - zscore=op.zscore, - l2=op.l2, - wescaler=op.feat_weight, - pca=op.max_labels_S, - id=method_name, - dataset=dataset_name, - time=time, - lang=lang, - macrof1=macrof1, - microf1=microf1, - macrok=macrok, - microk=microk, - notes='') - print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3)) diff --git a/src/models/cnn_class_bu.py b/src/models/cnn_class_bu.py deleted file mode 100644 index a47d5fc..0000000 --- a/src/models/cnn_class_bu.py +++ /dev/null @@ -1,42 +0,0 @@ -import torch.nn as nn -from torch.nn import functional as F -import torch - -class CNN_pdr(nn.Module): - - def __init__(self, output_size, out_channels, compositional_dim, vocab_size, emb_dim, embeddings=None, drop_embedding_range=None, - drop_embedding_prop=0, drop_prob=0.5): - super(CNN_pdr, self).__init__() - self.vocab_size = vocab_size - self.emb_dim = emb_dim - self.embeddings = torch.FloatTensor(embeddings) - self.embedding_layer = nn.Embedding(vocab_size, emb_dim, _weight=self.embeddings) - self.kernel_heights = kernel_heights=[3,5,7] - self.stride = 1 - self.padding = 0 - self.drop_embedding_range = drop_embedding_range - self.drop_embedding_prop = drop_embedding_prop - assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: wrong range' - self.nC = 73 - - self.conv1 = nn.Conv2d(1, compositional_dim, (self.kernel_heights[0], self.emb_dim), self.stride, self.padding) - self.dropout = nn.Dropout(drop_prob) - self.label = nn.Linear(len(kernel_heights) * out_channels, output_size) - self.fC = nn.Linear(compositional_dim + self.nC, self.nC) - - - def forward(self, x, svm_output): - x = torch.LongTensor(x) - svm_output = torch.FloatTensor(svm_output) - x = self.embedding_layer(x) - x = self.conv1(x.unsqueeze(1)) - x = F.relu(x.squeeze(3)) - x = F.max_pool1d(x, 
x.size()[2]).squeeze(2) - x = torch.cat((x, svm_output), 1) - x = F.sigmoid(self.fC(x)) - return x #.detach().numpy() - - # logits = self.label(x) - # return logits - - diff --git a/src/models/helpers.py b/src/models/helpers.py index 93e5805..b466f28 100755 --- a/src/models/helpers.py +++ b/src/models/helpers.py @@ -3,25 +3,29 @@ import torch.nn as nn from torch.nn import functional as F - -def init_embeddings(pretrained, vocab_size, learnable_length, device='cuda'): +def init_embeddings(pretrained, vocab_size, learnable_length): + """ + Build the pretrained and learnable embedding layers + :param pretrained: tensor of pretrained word embeddings (or None) + :param vocab_size: int, size of the vocabulary + :param learnable_length: int, dimensionality of the learnable (trainable) embeddings + :return: pretrained embedding layer, learnable embedding layer, and the resulting embedding length + """ pretrained_embeddings = None pretrained_length = 0 if pretrained is not None: pretrained_length = pretrained.shape[1] assert pretrained.shape[0] == vocab_size, 'pre-trained matrix does not match with the vocabulary size' pretrained_embeddings = nn.Embedding(vocab_size, pretrained_length) + # requires_grad=False sets the embedding layer as NOT trainable pretrained_embeddings.weight = nn.Parameter(pretrained, requires_grad=False) - # pretrained_embeddings.to(device) learnable_embeddings = None if learnable_length > 0: learnable_embeddings = nn.Embedding(vocab_size, learnable_length) - # learnable_embeddings.to(device) embedding_length = learnable_length + pretrained_length assert embedding_length > 0, '0-size embeddings' - return pretrained_embeddings, learnable_embeddings, embedding_length diff --git a/src/learning/learners.py b/src/models/learners.py similarity index 65% rename from src/learning/learners.py rename to src/models/learners.py index 89e3830..46737c6 100644 --- a/src/learning/learners.py +++ b/src/models/learners.py @@ -1,9 +1,24 @@ -import numpy as np import time -from scipy.sparse import issparse -from sklearn.multiclass import OneVsRestClassifier -from sklearn.model_selection import GridSearchCV + +import numpy as np from joblib import Parallel, delayed +from scipy.sparse import issparse +from sklearn.model_selection import GridSearchCV +from sklearn.multiclass import OneVsRestClassifier +from sklearn.svm import SVC + +from src.util.standardizer import StandardizeTransformer + + +def get_learner(calibrate=False, kernel='linear', C=1): + """ + Instantiate a scikit-learn Support Vector Classifier + :param calibrate: boolean, whether or not to return posterior probabilities + :param kernel: string, kernel to be applied to the SVC + :param C: int or dict {'C': list of integers}, regularization parameter + :return: Support Vector Classifier + """ + return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=C, random_state=1, gamma='auto', verbose=False) def _sort_if_sparse(X): @@ -13,7 +28,7 @@ def _sort_if_sparse(X): def _joblib_transform_multiling(transformer, lX, n_jobs=-1): if n_jobs == 1: - return {lang:transformer(lX[lang]) for lang in lX.keys()} + return {lang: transformer(lX[lang]) for lang in lX.keys()} else: langs = list(lX.keys()) transformations = Parallel(n_jobs=n_jobs)(delayed(transformer)(lX[lang]) for lang in langs) @@ -25,11 +40,11 @@ class TrivialRejector: self.cats = y.shape[1] return self - def decision_function(self, X): return np.zeros((X.shape[0],self.cats)) + def decision_function(self, X): return np.zeros((X.shape[0], self.cats)) - def predict(self, X): return np.zeros((X.shape[0],self.cats)) + def predict(self, X): return np.zeros((X.shape[0], self.cats)) - def predict_proba(self, X): return np.zeros((X.shape[0],self.cats)) + def predict_proba(self, X): return np.zeros((X.shape[0], self.cats)) def 
best_params(self): return {} @@ -38,6 +53,7 @@ class NaivePolylingualClassifier: """ Is a mere set of independet MonolingualClassifiers """ + def __init__(self, base_learner, parameters=None, n_jobs=-1): self.base_learner = base_learner self.parameters = parameters @@ -58,10 +74,11 @@ class NaivePolylingualClassifier: _sort_if_sparse(lX[lang]) models = Parallel(n_jobs=self.n_jobs)\ - (delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]),ly[lang]) for lang in langs) + (delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]), ly[lang]) for + lang in langs) self.model = {lang: models[i] for i, lang in enumerate(langs)} - self.empty_categories = {lang:self.model[lang].empty_categories for lang in langs} + self.empty_categories = {lang: self.model[lang].empty_categories for lang in langs} self.time = time.time() - tinit return self @@ -72,9 +89,9 @@ class NaivePolylingualClassifier: """ assert self.model is not None, 'predict called before fit' assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function' - langs=list(lX.keys()) + langs = list(lX.keys()) scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].decision_function)(lX[lang]) for lang in langs) - return {lang:scores[i] for i,lang in enumerate(langs)} + return {lang: scores[i] for i, lang in enumerate(langs)} def predict_proba(self, lX): """ @@ -83,9 +100,10 @@ class NaivePolylingualClassifier: """ assert self.model is not None, 'predict called before fit' assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function' - langs=list(lX.keys()) - scores = Parallel(n_jobs=self.n_jobs, max_nbytes=None)(delayed(self.model[lang].predict_proba)(lX[lang]) for lang in langs) - return {lang:scores[i] for i,lang in enumerate(langs)} + langs = list(lX.keys()) + scores = Parallel(n_jobs=self.n_jobs, max_nbytes=None)( + delayed(self.model[lang].predict_proba)(lX[lang]) for lang in langs) + return {lang: scores[i] for i, lang in enumerate(langs)} def predict(self, lX): """ @@ -95,14 +113,14 @@ class NaivePolylingualClassifier: assert self.model is not None, 'predict called before fit' assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in predict' if self.n_jobs == 1: - return {lang:self.model[lang].transform(lX[lang]) for lang in lX.keys()} + return {lang: self.model[lang].transform(lX[lang]) for lang in lX.keys()} else: langs = list(lX.keys()) scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].predict)(lX[lang]) for lang in langs) return {lang: scores[i] for i, lang in enumerate(langs)} def best_params(self): - return {l:model.best_params() for l,model in self.model.items()} + return {lang: model.best_params() for lang, model in self.model.items()} class MonolingualClassifier: @@ -117,14 +135,13 @@ class MonolingualClassifier: def fit(self, X, y): if X.shape[0] == 0: print('Warning: X has 0 elements, a trivial rejector will be created') - self.model = TrivialRejector().fit(X,y) + self.model = TrivialRejector().fit(X, y) self.empty_categories = np.arange(y.shape[1]) return self tinit = time.time() _sort_if_sparse(X) - self.empty_categories = np.argwhere(np.sum(y, axis=0)==0).flatten() - + self.empty_categories = np.argwhere(np.sum(y, axis=0) == 0).flatten() # multi-class format if len(y.shape) == 2: if self.parameters is not None: @@ -142,13 +159,12 @@ class MonolingualClassifier: self.model = GridSearchCV(self.model, 
param_grid=self.parameters, refit=True, cv=5, n_jobs=self.n_jobs, error_score=0, verbose=10) - # print(f'fitting: {self.model} on matrices of shape X={X.shape} Y={y.shape}') print(f'fitting: Mono-lingual Classifier on matrices of shape X={X.shape} Y={y.shape}') self.model.fit(X, y) if isinstance(self.model, GridSearchCV): self.best_params_ = self.model.best_params_ print('best parameters: ', self.best_params_) - self.time=time.time()-tinit + self.time = time.time() - tinit return self def decision_function(self, X): @@ -168,4 +184,41 @@ class MonolingualClassifier: return self.model.predict(X) def best_params(self): - return self.best_params_ \ No newline at end of file + return self.best_params_ + + +class MetaClassifier: + + def __init__(self, meta_learner, meta_parameters=None, n_jobs=-1, standardize_range=None): + self.n_jobs = n_jobs + self.model = MonolingualClassifier(base_learner=meta_learner, parameters=meta_parameters, n_jobs=n_jobs) + self.standardize_range = standardize_range + + def fit(self, lZ, ly): + tinit = time.time() + Z, y = self.stack(lZ, ly) + + self.standardizer = StandardizeTransformer(range=self.standardize_range) + Z = self.standardizer.fit_transform(Z) + + print('fitting the Z-space of shape={}'.format(Z.shape)) + self.model.fit(Z, y) + self.time = time.time() - tinit + + def stack(self, lZ, ly=None): + langs = list(lZ.keys()) + Z = np.vstack([lZ[lang] for lang in langs]) + if ly is not None: + y = np.vstack([ly[lang] for lang in langs]) + return Z, y + else: + return Z + + def predict(self, lZ): + lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs) + return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs) + + def predict_proba(self, lZ): + lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs) + return _joblib_transform_multiling(self.model.predict_proba, lZ, n_jobs=self.n_jobs) + diff --git a/src/models/lstm_class.py b/src/models/lstm_class.py index 98424f1..cd4000b 100755 --- a/src/models/lstm_class.py +++ b/src/models/lstm_class.py @@ -1,8 +1,6 @@ #taken from https://github.com/prakashpandey9/Text-Classification-Pytorch/blob/master/models/LSTM.py -import torch -import torch.nn as nn -from torch.autograd import Variable from models.helpers import * +from torch.autograd import Variable class RNNMultilingualClassifier(nn.Module): diff --git a/src/models/mBert.py b/src/models/mBert.py deleted file mode 100644 index e06746c..0000000 --- a/src/models/mBert.py +++ /dev/null @@ -1,249 +0,0 @@ -from copy import deepcopy -import torch -from torch.utils.data import Dataset -from transformers import BertForSequenceClassification, BertTokenizer, AdamW, BertConfig -from sklearn.model_selection import train_test_split -from util.evaluation import * -from time import time - - -def predict(logits, classification_type='multilabel'): - if classification_type == 'multilabel': - prediction = torch.sigmoid(logits) > 0.5 - elif classification_type == 'singlelabel': - prediction = torch.argmax(logits, dim=1).view(-1, 1) - else: - print('unknown classification type') - - return prediction.detach().cpu().numpy() - - -class TrainingDataset(Dataset): - """ - data: dict of lang specific tokenized data - labels: dict of lang specific targets - """ - - def __init__(self, data, labels): - self.langs = data.keys() - self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)} - - for i, lang in enumerate(self.langs): - _data = data[lang]['input_ids'] - _data = np.array(_data) - _labels 
= labels[lang] - _lang_value = np.full(len(_data), self.lang_ids[lang]) - - if i == 0: - self.data = _data - self.labels = _labels - self.lang_index = _lang_value - else: - self.data = np.vstack((self.data, _data)) - self.labels = np.vstack((self.labels, _labels)) - self.lang_index = np.concatenate((self.lang_index, _lang_value)) - - def __len__(self): - return len(self.data) - - def __getitem__(self, idx): - x = self.data[idx] - y = self.labels[idx] - lang = self.lang_index[idx] - - return x, torch.tensor(y, dtype=torch.float), lang - - def get_lang_ids(self): - return self.lang_ids - - def get_nclasses(self): - if hasattr(self, 'labels'): - return len(self.labels[0]) - else: - print('Method called before init!') - - -class ExtractorDataset(Dataset): - """ - data: dict of lang specific tokenized data - labels: dict of lang specific targets - """ - - def __init__(self, data): - self.langs = data.keys() - self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)} - - for i, lang in enumerate(self.langs): - _data = data[lang]['input_ids'] - _data = np.array(_data) - _lang_value = np.full(len(_data), self.lang_ids[lang]) - - if i == 0: - self.data = _data - self.lang_index = _lang_value - else: - self.data = np.vstack((self.data, _data)) - self.lang_index = np.concatenate((self.lang_index, _lang_value)) - - def __len__(self): - return len(self.data) - - def __getitem__(self, idx): - x = self.data[idx] - lang = self.lang_index[idx] - - return x, lang - - def get_lang_ids(self): - return self.lang_ids - - -def get_model(n_out): - print('# Initializing model ...') - model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=n_out) - return model - - -def init_optimizer(model, lr, weight_decay=0): - no_decay = ['bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() - if not any(nd in n for nd in no_decay)], - 'weight_decay': weight_decay}, - {'params': [p for n, p in model.named_parameters() - if any(nd in n for nd in no_decay)], - 'weight_decay': weight_decay} - ] - optimizer = AdamW(optimizer_grouped_parameters, lr=lr) - return optimizer - - -def get_lr(optimizer): - for param_group in optimizer.param_groups: - return param_group['lr'] - - -def get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop, max_val, seed): - l_split_va = deepcopy(l_tokenized_tr) - l_split_val_target = {l: [] for l in l_tokenized_tr.keys()} - l_split_tr = deepcopy(l_tokenized_tr) - l_split_tr_target = {l: [] for l in l_tokenized_tr.keys()} - - for lang in l_tokenized_tr.keys(): - val_size = int(min(len(l_tokenized_tr[lang]['input_ids']) * val_prop, max_val)) - l_split_tr[lang]['input_ids'], l_split_va[lang]['input_ids'], l_split_tr_target[lang], l_split_val_target[ - lang] = \ - train_test_split(l_tokenized_tr[lang]['input_ids'], l_devel_target[lang], test_size=val_size, - random_state=seed, shuffle=True) - - return l_split_tr, l_split_tr_target, l_split_va, l_split_val_target - - -def do_tokenization(l_dataset, max_len=512, verbose=True): - if verbose: - print('# Starting Tokenization ...') - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') - langs = l_dataset.keys() - l_tokenized = {} - for lang in langs: - l_tokenized[lang] = tokenizer(l_dataset[lang], - truncation=True, - max_length=max_len, - padding='max_length') - return l_tokenized - - -def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile, log_interval=10): - # _dataset_path = 
opt.dataset.split('/')[-1].split('_') - # dataset_id = _dataset_path[0] + _dataset_path[-1] - dataset_id = 'TODO fix this!' - - loss_history = [] - model.train() - - for idx, (batch, target, lang_idx) in enumerate(train_dataloader): - optim.zero_grad() - out = model(batch.cuda()) - logits = out[0] - loss = criterion(logits, target.cuda()) - loss.backward() - # clip_gradient(model) - optim.step() - loss_history.append(loss.item()) - - if idx % log_interval == 0: - interval_loss = np.mean(loss_history[log_interval:]) - print( - f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}') - - mean_loss = np.mean(interval_loss) - logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit) - return mean_loss - - -def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, measure_prefix): - print('# Validating model ...') - loss_history = [] - model.eval() - langs = lang_ids.keys() - id_2_lang = {v: k for k, v in lang_ids.items()} - predictions = {l: [] for l in langs} - yte_stacked = {l: [] for l in langs} - - for batch, target, lang_idx in test_dataloader: - out = model(batch.cuda()) - logits = out[0] - loss = criterion(logits, target.cuda()).item() - prediction = predict(logits) - loss_history.append(loss) - - # Assigning prediction to dict in predictions and yte_stacked according to lang_idx - for i, pred in enumerate(prediction): - lang_pred = id_2_lang[lang_idx.numpy()[i]] - predictions[lang_pred].append(pred) - yte_stacked[lang_pred].append(target[i].detach().cpu().numpy()) - - ly = {l: np.vstack(yte_stacked[l]) for l in langs} - ly_ = {l: np.vstack(predictions[l]) for l in langs} - l_eval = evaluate(ly, ly_) - metrics = [] - for lang in langs: - macrof1, microf1, macrok, microk = l_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - if measure_prefix == 'te': - print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') - Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0) - print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]') - - mean_loss = np.mean(loss_history) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit) - - return Mf1 - - -def feature_extractor(data, lang_ids, model): - print('# Feature Extractor Mode...') - """ - Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for - the output of each layer) of shape (batch_size, sequence_length, hidden_size) - """ - all_batch_embeddings = {} - id2lang = {v: k for k, v in lang_ids.items()} - with torch.no_grad(): - for batch, lang_idx in data: - # for batch, target, lang_idx in data: - out = model(batch.cuda()) - last_hidden_state = out[1][-1] - batch_embeddings = last_hidden_state[:, 0, :] - for i, l_idx in enumerate(lang_idx.numpy()): - if id2lang[l_idx] not in all_batch_embeddings.keys(): - all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy() - else: - all_batch_embeddings[id2lang[l_idx]] = 
np.vstack((all_batch_embeddings[id2lang[l_idx]], - batch_embeddings[i].detach().cpu().numpy())) - - return all_batch_embeddings, id2lang diff --git a/src/models/pl_bert.py b/src/models/pl_bert.py new file mode 100644 index 0000000..129c3b4 --- /dev/null +++ b/src/models/pl_bert.py @@ -0,0 +1,188 @@ +import pytorch_lightning as pl +import torch +from torch.optim.lr_scheduler import StepLR +from transformers import BertForSequenceClassification, AdamW + +from src.util.common import define_pad_length, pad +from src.util.pl_metrics import CustomF1, CustomK + + +class BertModel(pl.LightningModule): + + def __init__(self, output_size, stored_path, gpus=None): + """ + Init Bert model. + :param output_size: int, number of target classes. + :param stored_path: path to a stored model checkpoint; if None, pretrained 'bert-base-multilingual-cased' is loaded. + :param gpus: gpus to use; if set, data and metrics are moved to cuda. + """ + super().__init__() + self.loss = torch.nn.BCEWithLogitsLoss() + self.gpus = gpus + self.microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus) + self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) + self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus) + self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus) + # Language-specific metrics to compute scores at epoch level + self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) + self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus) + self.lang_macroK = CustomF1(num_classes=output_size, average='macro', device=self.gpus) + self.lang_microK = CustomF1(num_classes=output_size, average='micro', device=self.gpus) + + if stored_path: + self.bert = BertForSequenceClassification.from_pretrained(stored_path, + num_labels=output_size, + output_hidden_states=True) + else: + self.bert = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', + num_labels=output_size, + output_hidden_states=True) + self.save_hyperparameters() + + def forward(self, X): + logits = self.bert(X) + return logits + + def training_step(self, train_batch, batch_idx): + X, y, _, batch_langs = train_batch + X = torch.cat(X).view([X[0].shape[0], len(X)]) + y = y.type(torch.FloatTensor) + y = y.to('cuda' if self.gpus else 'cpu') + logits, _ = self.forward(X) + loss = self.loss(logits, y) + # Squash logits through a sigmoid to get confidence scores + predictions = torch.sigmoid(logits) > 0.5 + microF1 = self.microF1(predictions, y) + macroF1 = self.macroF1(predictions, y) + microK = self.microK(predictions, y) + macroK = self.macroK(predictions, y) + self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True) + self.log('train-macroF1', macroF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) + self.log('train-microF1', microF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) + self.log('train-macroK', macroK, on_step=True, on_epoch=True, prog_bar=False, logger=True) + self.log('train-microK', microK, on_step=True, on_epoch=True, prog_bar=False, logger=True) + lX, ly = self._reconstruct_dict(predictions, y, batch_langs) + return {'loss': loss, 'pred': lX, 'target': ly} + + def training_epoch_end(self, outputs): + langs = [] + for output in outputs: + langs.extend(list(output['pred'].keys())) + langs = set(langs) + # outputs is a list of n dicts of m elements, where n is equal to the number of epoch steps and m is the batch size. 
+ # here we save epoch level metric values and compute them specifically for each language + res_macroF1 = {lang: [] for lang in langs} + res_microF1 = {lang: [] for lang in langs} + res_macroK = {lang: [] for lang in langs} + res_microK = {lang: [] for lang in langs} + for output in outputs: + lX, ly = output['pred'], output['target'] + for lang in lX.keys(): + X, y = lX[lang], ly[lang] + lang_macroF1 = self.lang_macroF1(X, y) + lang_microF1 = self.lang_microF1(X, y) + lang_macroK = self.lang_macroK(X, y) + lang_microK = self.lang_microK(X, y) + + res_macroF1[lang].append(lang_macroF1) + res_microF1[lang].append(lang_microF1) + res_macroK[lang].append(lang_macroK) + res_microK[lang].append(lang_microK) + for lang in langs: + avg_macroF1 = torch.mean(torch.Tensor(res_macroF1[lang])) + avg_microF1 = torch.mean(torch.Tensor(res_microF1[lang])) + avg_macroK = torch.mean(torch.Tensor(res_macroK[lang])) + avg_microK = torch.mean(torch.Tensor(res_microK[lang])) + self.logger.experiment.add_scalars('train-langs-macroF1', {f'{lang}': avg_macroF1}, self.current_epoch) + self.logger.experiment.add_scalars('train-langs-microF1', {f'{lang}': avg_microF1}, self.current_epoch) + self.logger.experiment.add_scalars('train-langs-macroK', {f'{lang}': avg_macroK}, self.current_epoch) + self.logger.experiment.add_scalars('train-langs-microK', {f'{lang}': avg_microK}, self.current_epoch) + + def validation_step(self, val_batch, batch_idx): + X, y, _, batch_langs = val_batch + X = torch.cat(X).view([X[0].shape[0], len(X)]) + y = y.type(torch.FloatTensor) + y = y.to('cuda' if self.gpus else 'cpu') + logits, _ = self.forward(X) + loss = self.loss(logits, y) + predictions = torch.sigmoid(logits) > 0.5 + microF1 = self.microF1(predictions, y) + macroF1 = self.macroF1(predictions, y) + microK = self.microK(predictions, y) + macroK = self.macroK(predictions, y) + self.log('val-loss', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True) + self.log('val-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('val-microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('val-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('val-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True) + return {'loss': loss} + + def test_step(self, test_batch, batch_idx): + X, y, _, batch_langs = test_batch + X = torch.cat(X).view([X[0].shape[0], len(X)]) + y = y.type(torch.FloatTensor) + y = y.to('cuda' if self.gpus else 'cpu') + logits, _ = self.forward(X) + loss = self.loss(logits, y) + # Squashing logits through Sigmoid in order to get confidence score + predictions = torch.sigmoid(logits) > 0.5 + microF1 = self.microF1(predictions, y) + macroF1 = self.macroF1(predictions, y) + microK = self.microK(predictions, y) + macroK = self.macroK(predictions, y) + self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) + self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) + self.log('test-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('test-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True) + return + + def configure_optimizers(self, lr=3e-5, weight_decay=0.01): + no_decay = ['bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in self.bert.named_parameters() + if not any(nd in n for nd in no_decay)], + 'weight_decay': weight_decay}, + 
{'params': [p for n, p in self.bert.named_parameters()
+                        if any(nd in n for nd in no_decay)],
+             'weight_decay': 0.0}
+        ]
+        optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
+        scheduler = StepLR(optimizer, step_size=25, gamma=0.1)
+        return [optimizer], [scheduler]
+
+    def encode(self, lX, batch_size=64):
+        with torch.no_grad():
+            l_embed = {lang: [] for lang in lX.keys()}
+            for lang in sorted(lX.keys()):
+                for i in range(0, len(lX[lang]), batch_size):
+                    if i + batch_size > len(lX[lang]):
+                        batch = lX[lang][i:len(lX[lang])]
+                    else:
+                        batch = lX[lang][i:i + batch_size]
+                    max_pad_len = define_pad_length(batch)
+                    batch = pad(batch, pad_index=self.bert.config.pad_token_id, max_pad_length=max_pad_len)
+                    batch = torch.LongTensor(batch).to('cuda' if self.gpus else 'cpu')
+                    _, output = self.forward(batch)
+
+                    # deleting batch from gpu to avoid cuda OOM
+                    del batch
+                    torch.cuda.empty_cache()
+
+                    doc_embeds = output[-1][:, 0, :]
+                    l_embed[lang].append(doc_embeds.cpu())
+            for k, v in l_embed.items():
+                l_embed[k] = torch.cat(v, dim=0).numpy()
+            return l_embed
+
+    @staticmethod
+    def _reconstruct_dict(predictions, y, batch_langs):
+        reconstructed_x = {lang: [] for lang in set(batch_langs)}
+        reconstructed_y = {lang: [] for lang in set(batch_langs)}
+        for i, pred in enumerate(predictions):
+            reconstructed_x[batch_langs[i]].append(pred)
+            reconstructed_y[batch_langs[i]].append(y[i])
+        for k, v in reconstructed_x.items():
+            reconstructed_x[k] = torch.cat(v).view(-1, predictions.shape[1])
+        for k, v in reconstructed_y.items():
+            reconstructed_y[k] = torch.cat(v).view(-1, predictions.shape[1])
+        return reconstructed_x, reconstructed_y
diff --git a/src/models/pl_gru.py b/src/models/pl_gru.py
new file mode 100644
index 0000000..4adb148
--- /dev/null
+++ b/src/models/pl_gru.py
@@ -0,0 +1,266 @@
+# Lightning modules, see https://pytorch-lightning.readthedocs.io/en/latest/lightning_module.html
+import pytorch_lightning as pl
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torch.autograd import Variable
+from torch.optim.lr_scheduler import StepLR
+from transformers import AdamW
+
+from src.models.helpers import init_embeddings
+from src.util.common import define_pad_length, pad
+from src.util.pl_metrics import CustomF1, CustomK
+
+
+class RecurrentModel(pl.LightningModule):
+    def __init__(self, lPretrained, langs, output_size, hidden_size, lVocab_size, learnable_length,
+                 drop_embedding_range, drop_embedding_prop, gpus=None):
+        """
+        Init RNN model.
+        :param lPretrained:
+        :param langs:
+        :param output_size:
+        :param hidden_size:
+        :param lVocab_size:
+        :param learnable_length:
+        :param drop_embedding_range:
+        :param drop_embedding_prop:
+        :param gpus:
+        """
+        super().__init__()
+        self.gpus = gpus
+        self.langs = langs
+        self.lVocab_size = lVocab_size
+        self.learnable_length = learnable_length
+        self.output_size = output_size
+        self.hidden_size = hidden_size
+        self.drop_embedding_range = drop_embedding_range
+        self.drop_embedding_prop = drop_embedding_prop
+        self.loss = torch.nn.BCEWithLogitsLoss()
+
+        self.microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
+        self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
+        self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus)
+        self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)
+        # Language specific metrics to compute metrics at epoch level
+        self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
+        self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
+        self.lang_macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)
+        self.lang_microK = CustomK(num_classes=output_size, average='micro', device=self.gpus)
+
+        self.lPretrained_embeddings = nn.ModuleDict()
+        self.lLearnable_embeddings = nn.ModuleDict()
+
+        self.n_layers = 1
+        self.n_directions = 1
+        self.dropout = nn.Dropout(0.6)
+
+        lstm_out = 256
+        ff1 = 512
+        ff2 = 256
+
+        lpretrained_embeddings = {}
+        llearnable_embeddings = {}
+
+        for lang in self.langs:
+            pretrained = lPretrained[lang] if lPretrained else None
+            pretrained_embeddings, learnable_embeddings, embedding_length = init_embeddings(
+                pretrained, self.lVocab_size[lang], self.learnable_length)
+            lpretrained_embeddings[lang] = pretrained_embeddings
+            llearnable_embeddings[lang] = learnable_embeddings
+            self.embedding_length = embedding_length
+
+        self.lPretrained_embeddings.update(lpretrained_embeddings)
+        self.lLearnable_embeddings.update(llearnable_embeddings)
+
+        self.rnn = nn.GRU(self.embedding_length, hidden_size)
+        self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out)
+        self.linear1 = nn.Linear(lstm_out, ff1)
+        self.linear2 = nn.Linear(ff1, ff2)
+        self.label = nn.Linear(ff2, self.output_size)
+
+        # TODO: setting lPretrained to None; leaving it at its original value will "bug" the first validation
+        # step (i.e., the checkpoint will also store its value, I guess, making the saving process too slow)
+        lPretrained = None
+        self.save_hyperparameters()
+
+    def forward(self, lX):
+        l_embed = []
+        for lang in sorted(lX.keys()):
+            doc_embedding = self.transform(lX[lang], lang)
+            l_embed.append(doc_embedding)
+        embed = torch.cat(l_embed, dim=0)
+        logits = self.label(embed)
+        return logits
+
+    def transform(self, X, lang):
+        batch_size = X.shape[0]
+        X = self.embed(X, lang)
+        X = self.embedding_dropout(X, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop,
+                                   training=self.training)
+        X = X.permute(1, 0, 2)
+        h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size).to(self.device))
+        output, _ = self.rnn(X, h_0)
+        output = output[-1, :, :]
+        output = F.relu(self.linear0(output))
+        output = self.dropout(F.relu(self.linear1(output)))
+        output = self.dropout(F.relu(self.linear2(output)))
+        return output
+
+    def encode(self, lX, l_pad, batch_size=128):
+        """
+        Returns encoded data (i.e., RNN hidden state at the second feed-forward
layer - linear1). Dimensionality is 512. + :param lX: + :param l_pad: + :param batch_size: + :return: + """ + with torch.no_grad(): + l_embed = {lang: [] for lang in lX.keys()} + for lang in sorted(lX.keys()): + for i in range(0, len(lX[lang]), batch_size): + if i+batch_size > len(lX[lang]): + batch = lX[lang][i:len(lX[lang])] + else: + batch = lX[lang][i:i+batch_size] + max_pad_len = define_pad_length(batch) + batch = pad(batch, pad_index=l_pad[lang], max_pad_length=max_pad_len) + X = torch.LongTensor(batch).to('cuda' if self.gpus else 'cpu') + _batch_size = X.shape[0] + X = self.embed(X, lang) + X = self.embedding_dropout(X, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, + training=self.training) + X = X.permute(1, 0, 2) + h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, _batch_size, self.hidden_size).to(self.device)) + output, _ = self.rnn(X, h_0) + output = output[-1, :, :] + output = F.relu(self.linear0(output)) + output = self.dropout(F.relu(self.linear1(output))) + l_embed[lang].append(output.cpu()) + for k, v in l_embed.items(): + l_embed[k] = torch.cat(v, dim=0).numpy() + return l_embed + + def training_step(self, train_batch, batch_idx): + lX, ly = train_batch + logits = self.forward(lX) + _ly = [] + for lang in sorted(lX.keys()): + _ly.append(ly[lang]) + y = torch.cat(_ly, dim=0) + loss = self.loss(logits, y) + # Squashing logits through Sigmoid in order to get confidence score + predictions = torch.sigmoid(logits) > 0.5 + microF1 = self.microF1(predictions, y) + macroF1 = self.macroF1(predictions, y) + microK = self.microK(predictions, y) + macroK = self.macroK(predictions, y) + self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True) + self.log('train-macroF1', macroF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) + self.log('train-microF1', microF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) + self.log('train-macroK', macroK, on_step=True, on_epoch=True, prog_bar=False, logger=True) + self.log('train-microK', microK, on_step=True, on_epoch=True, prog_bar=False, logger=True) + re_lX = self._reconstruct_dict(predictions, ly) + return {'loss': loss, 'pred': re_lX, 'target': ly} + + def training_epoch_end(self, outputs): + # outputs is a of n dicts of m elements, where n is equal to the number of epoch steps and m is batchsize. 
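+        # Here `pred` has already been re-split per language by _reconstruct_dict(), which slices the
+        # concatenated batch predictions back into a {lang: tensor} dict using the per-language batch sizes,
+        # iterating languages in sorted order (the same order used to concatenate them in forward()).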
+ # here we save epoch level metric values and compute them specifically for each language + res_macroF1 = {lang: [] for lang in self.langs} + res_microF1 = {lang: [] for lang in self.langs} + res_macroK = {lang: [] for lang in self.langs} + res_microK = {lang: [] for lang in self.langs} + for output in outputs: + lX, ly = output['pred'], output['target'] + for lang in lX.keys(): + X, y = lX[lang], ly[lang] + lang_macroF1 = self.lang_macroF1(X, y) + lang_microF1 = self.lang_microF1(X, y) + lang_macroK = self.lang_macroK(X, y) + lang_microK = self.lang_microK(X, y) + + res_macroF1[lang].append(lang_macroF1) + res_microF1[lang].append(lang_microF1) + res_macroK[lang].append(lang_macroK) + res_microK[lang].append(lang_microK) + for lang in self.langs: + avg_macroF1 = torch.mean(torch.Tensor(res_macroF1[lang])) + avg_microF1 = torch.mean(torch.Tensor(res_microF1[lang])) + avg_macroK = torch.mean(torch.Tensor(res_macroK[lang])) + avg_microK = torch.mean(torch.Tensor(res_microK[lang])) + self.logger.experiment.add_scalars('train-langs-macroF1', {f'{lang}': avg_macroF1}, self.current_epoch) + self.logger.experiment.add_scalars('train-langs-microF1', {f'{lang}': avg_microF1}, self.current_epoch) + self.logger.experiment.add_scalars('train-langs-macroK', {f'{lang}': avg_macroK}, self.current_epoch) + self.logger.experiment.add_scalars('train-langs-microK', {f'{lang}': avg_microK}, self.current_epoch) + + def validation_step(self, val_batch, batch_idx): + lX, ly = val_batch + logits = self.forward(lX) + _ly = [] + for lang in sorted(lX.keys()): + _ly.append(ly[lang]) + ly = torch.cat(_ly, dim=0) + loss = self.loss(logits, ly) + predictions = torch.sigmoid(logits) > 0.5 + microF1 = self.microF1(predictions, ly) + macroF1 = self.macroF1(predictions, ly) + microK = self.microK(predictions, ly) + macroK = self.macroK(predictions, ly) + self.log('val-loss', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True) + self.log('val-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('val-microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('val-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('val-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True) + return {'loss': loss} + + def test_step(self, test_batch, batch_idx): + lX, ly = test_batch + logits = self.forward(lX) + _ly = [] + for lang in sorted(lX.keys()): + _ly.append(ly[lang]) + ly = torch.cat(_ly, dim=0) + predictions = torch.sigmoid(logits) > 0.5 + microF1 = self.microF1(predictions, ly) + macroF1 = self.macroF1(predictions, ly) + microK = self.microK(predictions, ly) + macroK = self.macroK(predictions, ly) + self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) + self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) + self.log('test-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True) + self.log('test-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True) + return + + def embed(self, X, lang): + input_list = [] + if self.lPretrained_embeddings[lang]: + input_list.append(self.lPretrained_embeddings[lang](X)) + if self.lLearnable_embeddings[lang]: + input_list.append(self.lLearnable_embeddings[lang](X)) + return torch.cat(tensors=input_list, dim=2) + + def embedding_dropout(self, X, drop_range, p_drop=0.5, training=True): + if p_drop > 0 and training and drop_range is not None: + p = p_drop + 
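+            # Dropout is applied only to the [drop_from, drop_to) slice of the embedding (the supervised/WCE part):
+            # multiplying by corr = (1 - p) cancels F.dropout's internal 1/(1 - p) rescaling, so surviving units
+            # keep their original scale, and the final division by (1 - p * m / l) rescales the whole embedding
+            # to compensate, in expectation, for the mass removed from the dropped slice.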
drop_from, drop_to = drop_range + m = drop_to - drop_from # length of the supervised embedding + l = X.shape[2] # total embedding length + corr = (1 - p) + X[:, :, drop_from:drop_to] = corr * F.dropout(X[:, :, drop_from:drop_to], p=p) + X /= (1 - (p * m / l)) + return X + + def configure_optimizers(self): + optimizer = AdamW(self.parameters(), lr=1e-3) + scheduler = StepLR(optimizer, step_size=25, gamma=0.5) + return [optimizer], [scheduler] + + @staticmethod + def _reconstruct_dict(X, ly): + reconstructed = {} + _start = 0 + for lang in sorted(ly.keys()): + lang_batchsize = len(ly[lang]) + reconstructed[lang] = X[_start:_start+lang_batchsize] + _start += lang_batchsize + return reconstructed diff --git a/src/results/results_manager.py b/src/results/results_manager.py deleted file mode 100644 index 1fe57dd..0000000 --- a/src/results/results_manager.py +++ /dev/null @@ -1,11 +0,0 @@ -import pandas as pd -import numpy as np - -# df = pd.read_csv("/home/andreapdr/funneling_pdr/src/results/final_results.csv", delimiter='\t') -df = pd.read_csv("10run_rcv_final_results.csv", delimiter='\t') -pivot = pd.pivot_table(df, values=['macrof1', 'microf1', 'macrok', 'microk'], index=['method', 'id', 'optimp', 'zscore', 'l2', 'wescaler', 'pca', 'sif'], aggfunc=[np.mean, np.std]) -with pd.option_context('display.max_rows', None): - print(pivot.round(3)) -print('Finished ...') - - diff --git a/src/util/SIF_embed.py b/src/util/SIF_embed.py index cfe096e..4a3d712 100644 --- a/src/util/SIF_embed.py +++ b/src/util/SIF_embed.py @@ -1,6 +1,7 @@ import numpy as np from sklearn.decomposition import TruncatedSVD + def get_weighted_average(We, x, w): """ Compute the weighted average vectors @@ -15,6 +16,7 @@ def get_weighted_average(We, x, w): emb[i,:] = w[i,:].dot(We[x[i,:],:]) / np.count_nonzero(w[i,:]) return emb + def compute_pc(X,npc=1): """ Compute the principal components. 
@@ -26,6 +28,7 @@ def compute_pc(X,npc=1): svd.fit(X) return svd.components_ + def remove_pc(X, npc=1): """ Remove the projection on the principal components @@ -34,7 +37,7 @@ def remove_pc(X, npc=1): :return: XX[i, :] is the data point after removing its projection """ pc = compute_pc(X, npc) - if npc==1: + if npc == 1: XX = X - X.dot(pc.transpose()) * pc else: XX = X - X.dot(pc.transpose()).dot(pc) diff --git a/src/util/common.py b/src/util/common.py old mode 100755 new mode 100644 index 81fd3f2..25f7b5f --- a/src/util/common.py +++ b/src/util/common.py @@ -1,12 +1,4 @@ -import subprocess -import warnings -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.svm import SVC -from sklearn.model_selection import train_test_split -from embeddings.supervised import get_supervised_embeddings -# from learning.transformers import PosteriorProbabilitiesEmbedder, TfidfVectorizerMultilingual import numpy as np -from tqdm import tqdm import torch warnings.filterwarnings("ignore", category=DeprecationWarning) @@ -107,201 +99,101 @@ class Index: devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True ) - print(f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}') +from src.util.embeddings_manager import supervised_embeddings_tfidf - def get_word_list(self): - def extract_word_list(word2index): - return [w for w,i in sorted(word2index.items(), key=lambda x: x[1])] - word_list = extract_word_list(self.word2index) - word_list += extract_word_list(self.out_of_vocabulary) - return word_list +class TfidfVectorizerMultilingual: - def compose_embedding_matrix(self, pretrained, supervised, Xtr=None, Ytr=None): - print(f'[generating embedding matrix for lang {self.lang}]') + def __init__(self, **kwargs): + self.kwargs = kwargs - self.wce_range = None - embedding_parts = [] + def fit(self, lX, ly=None): + self.langs = sorted(lX.keys()) + self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs} + return self - if pretrained is not None: - print('\t[pretrained-matrix]') - word_list = self.get_word_list() - muse_embeddings = pretrained.extract(word_list) - embedding_parts.append(muse_embeddings) - del pretrained + def transform(self, lX): + return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs} - if supervised: - print('\t[supervised-matrix]') - F = get_supervised_embeddings(Xtr, Ytr, reduction=None, method='dotn') - num_missing_rows = self.vocabsize - F.shape[0] - F = np.vstack((F, np.zeros(shape=(num_missing_rows, F.shape[1])))) - F = torch.from_numpy(F).float() + def fit_transform(self, lX, ly=None): + return self.fit(lX, ly).transform(lX) - offset = 0 - if embedding_parts: - offset = embedding_parts[0].shape[1] - self.wce_range = [offset, offset + F.shape[1]] + def vocabulary(self, l=None): + if l is None: + return {l: self.vectorizer[l].vocabulary_ for l in self.langs} + else: + return self.vectorizer[l].vocabulary_ - embedding_parts.append(F) + def get_analyzer(self, l=None): + if l is None: + return {l: self.vectorizer[l].build_analyzer() for l in self.langs} + else: + return self.vectorizer[l].build_analyzer() - make_dumps = False - if make_dumps: - print(f'Dumping Embedding Matrices ...') - import pickle - with open(f'../dumps/dump_{self.lang}_rcv.pkl', 'wb') as outfile: - pickle.dump((self.lang, embedding_parts, self.word2index), outfile) - with open(f'../dumps/corpus_{self.lang}_rcv.pkl', 'wb') as outfile2: - pickle.dump((self.lang, self.devel_raw, 
self.devel_target), outfile2) - self.embedding_matrix = torch.cat(embedding_parts, dim=1) - - print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]') +def _normalize(lX, l2=True): + return {lang: normalize(X) for lang, X in lX.items()} if l2 else lX def none_dict(langs): - return {l:None for l in langs} + return {l: None for l in langs} class MultilingualIndex: - def __init__(self): #, add_language_trace=False): + def __init__(self): + """ + Class that contains monolingual Indexes + """ self.l_index = {} self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) - # self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True, max_features=25000) - # self.add_language_trace=add_language_trace} - def index(self, l_devel_raw, l_devel_target, l_test_raw, l_pretrained_vocabulary): + def index(self, l_devel_raw, l_devel_target, l_test_raw, l_test_target, l_pretrained_vocabulary=None): self.langs = sorted(l_devel_raw.keys()) - - #build the vocabularies self.l_vectorizer.fit(l_devel_raw) l_vocabulary = self.l_vectorizer.vocabulary() l_analyzer = self.l_vectorizer.get_analyzer() + if l_pretrained_vocabulary is None: + l_pretrained_vocabulary = none_dict(self.langs) - for l in self.langs: - self.l_index[l] = Index(l_devel_raw[l], l_devel_target[l], l_test_raw[l], l) - self.l_index[l].index(l_pretrained_vocabulary[l], l_analyzer[l], l_vocabulary[l]) + for lang in self.langs: + # Init monolingual Index + self.l_index[lang] = Index(l_devel_raw[lang], l_devel_target[lang], l_test_raw[lang], l_test_target[lang], + lang) + # call to index() function of monolingual Index + self.l_index[lang].index(l_pretrained_vocabulary[lang], l_analyzer[lang], l_vocabulary[lang]) def train_val_split(self, val_prop=0.2, max_val=2000, seed=42): - for l,index in self.l_index.items(): + for l, index in self.l_index.items(): index.train_val_split(val_prop, max_val, seed=seed) - - def embedding_matrices(self, lpretrained, supervised): + """ + Extract from pretrained embeddings words that are found in the training dataset, then for each language + calls the respective monolingual index and build the embedding matrix (if supervised, WCE are concatenated + to the unsupervised vectors). 
+ :param lpretrained: dict {lang : matrix of word-embeddings } + :param supervised: bool, whether to deploy Word-Class Embeddings or not + :return: self + """ lXtr = self.get_lXtr() if supervised else none_dict(self.langs) lYtr = self.l_train_target() if supervised else none_dict(self.langs) - for l,index in self.l_index.items(): - index.compose_embedding_matrix(lpretrained[l], supervised, lXtr[l], lYtr[l]) + lWordList = self.get_wordlist() + lExtracted = lpretrained.extract(lWordList) + for lang, index in self.l_index.items(): + # if supervised concatenate embedding matrices of pretrained unsupervised + # and supervised word-class embeddings + index.compose_embedding_matrix(lExtracted[lang], supervised, lXtr[lang], lYtr[lang]) self.sup_range = index.wce_range + return self - # TODO circular import with transformers --> when generating posterior prob, we import PosteriorProbabilitiesEmbedder which is defined in transformers - # def posterior_probabilities(self, max_training_docs_by_lang=5000, store_posteriors=False, stored_post=False): - # # choose a maximum of "max_training_docs_by_lang" for training the calibrated SVMs - # timeit = time.time() - # lXtr = {l:Xtr for l,Xtr in self.get_lXtr().items()} - # lYtr = {l:Ytr for l,Ytr in self.l_train_target().items()} - # if not stored_post: - # for l in self.langs: - # n_elements = lXtr[l].shape[0] - # if n_elements > max_training_docs_by_lang: - # choice = np.random.permutation(n_elements)[:max_training_docs_by_lang] - # lXtr[l] = lXtr[l][choice] - # lYtr[l] = lYtr[l][choice] - # - # # train the posterior probabilities embedder - # print('[posteriors] training a calibrated SVM') - # learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto') - # prob_embedder = PosteriorProbabilitiesEmbedder(learner, l2=False) - # prob_embedder.fit(lXtr, lYtr) - # - # # transforms the training, validation, and test sets into posterior probabilities - # print('[posteriors] generating posterior probabilities') - # lPtr = prob_embedder.transform(self.get_lXtr()) - # lPva = prob_embedder.transform(self.get_lXva()) - # lPte = prob_embedder.transform(self.get_lXte()) - # # NB: Check splits indices ! - # if store_posteriors: - # import pickle - # with open('../dumps/posteriors_fulljrc.pkl', 'wb') as outfile: - # pickle.dump([lPtr, lPva, lPte], outfile) - # print(f'Successfully dumped posteriors!') - # else: - # import pickle - # with open('../dumps/posteriors_fulljrc.pkl', 'rb') as infile: - # lPtr, lPva, lPte = pickle.load(infile) - # print(f'Successfully loaded stored posteriors!') - # print(f'[posteriors] done in {time.time() - timeit}') - # return lPtr, lPva, lPte - - def bert_embeddings(self, bert_path, max_len=512, batch_size=64, stored_embeddings=False): - show_gpu('GPU memory before initializing mBert model:') - # TODO: load dumped embeddings? 
- from experiment_scripts.main_mbert_extractor import do_tokenization, ExtractorDataset, DataLoader - from transformers import BertConfig, BertForSequenceClassification - - print('[mBERT] generating mBERT doc embeddings') - lXtr_raw = self.get_raw_lXtr() - lXva_raw = self.get_raw_lXva() - lXte_raw = self.get_raw_lXte() - - print('# Tokenizing datasets') - l_tokenized_tr = do_tokenization(lXtr_raw, max_len=max_len, verbose=False) - tr_dataset = ExtractorDataset(l_tokenized_tr) - tr_lang_ids = tr_dataset.lang_ids - tr_dataloader = DataLoader(tr_dataset, batch_size=batch_size, shuffle=False) - - l_tokenized_va = do_tokenization(lXva_raw, max_len=max_len, verbose=False) - va_dataset = ExtractorDataset(l_tokenized_va) - va_lang_ids = va_dataset.lang_ids - va_dataloader = DataLoader(va_dataset, batch_size=batch_size, shuffle=False) - - l_tokenized_te = do_tokenization(lXte_raw, max_len=max_len, verbose=False) - te_dataset = ExtractorDataset(l_tokenized_te) - te_lang_ids = te_dataset.lang_ids - te_dataloader = DataLoader(te_dataset, batch_size=batch_size, shuffle=False) - - num_labels = self.l_index[self.langs[0]].val_target.shape[1] - config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True, - num_labels=num_labels) - model = BertForSequenceClassification.from_pretrained(bert_path, - config=config).cuda() - print('# Extracting document embeddings') - tr_bert_embeddings, id2lang_tr = self.do_bert_embeddings(model, tr_dataloader, tr_lang_ids, verbose=False) - va_bert_embeddings, id2lang_va = self.do_bert_embeddings(model, va_dataloader, va_lang_ids, verbose=False) - te_bert_embeddings, id2lang_te = self.do_bert_embeddings(model, te_dataloader, te_lang_ids, verbose=False) - - show_gpu('GPU memory before after mBert model:') - # Freeing GPU's memory - import gc - del model, tr_dataloader, va_dataloader, te_dataloader - gc.collect() - torch.cuda.empty_cache() - show_gpu('GPU memory after clearing cache:') - return tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings - - - @staticmethod - def do_bert_embeddings(model, data, lang_ids, verbose=True): - if verbose: - print('# Feature Extractor Mode...') - all_batch_embeddings = {} - id2lang = {v: k for k, v in lang_ids.items()} - with torch.no_grad(): - for batch, lang_idx in data: - out = model(batch.cuda()) - last_hidden_state = out[1][-1] - batch_embeddings = last_hidden_state[:, 0, :] - for i, l_idx in enumerate(lang_idx.numpy()): - if id2lang[l_idx] not in all_batch_embeddings.keys(): - all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy() - else: - all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]], - batch_embeddings[i].detach().cpu().numpy())) - - return all_batch_embeddings, id2lang + def get_wordlist(self): + wordlist = {} + for lang, index in self.l_index.items(): + wordlist[lang] = index.get_word_list() + return wordlist def get_raw_lXtr(self): - lXtr_raw = {k:[] for k in self.langs} + lXtr_raw = {k: [] for k in self.langs} lYtr_raw = {k: [] for k in self.langs} for lang in self.langs: lXtr_raw[lang] = self.l_index[lang].train_raw @@ -337,11 +229,14 @@ class MultilingualIndex: self.lXte = self.l_vectorizer.transform({l: index.test_raw for l, index in self.l_index.items()}) return self.lXte + def get_target_dim(self): + return self.l_index[self.langs[0]].devel_target.shape[1] + def l_vocabsize(self): - return {l:index.vocabsize for l,index in self.l_index.items()} + return {l: index.vocabsize for l, index in self.l_index.items()} def 
l_embeddings(self): - return {l:index.embedding_matrix for l,index in self.l_index.items()} + return {l: index.embedding_matrix for l, index in self.l_index.items()} def l_pad(self): return {l: index.pad_index for l, index in self.l_index.items()} @@ -349,15 +244,30 @@ class MultilingualIndex: def l_train_index(self): return {l: index.train_index for l, index in self.l_index.items()} + def l_train_raw_index(self): + return {l: index.train_raw for l, index in self.l_index.items()} + def l_train_target(self): return {l: index.train_target for l, index in self.l_index.items()} def l_val_index(self): return {l: index.val_index for l, index in self.l_index.items()} + def l_val_raw_index(self): + return {l: index.val_raw for l, index in self.l_index.items()} + + def l_test_raw_index(self): + return {l: index.test_raw for l, index in self.l_index.items()} + + def l_devel_raw_index(self): + return {l: index.devel_raw for l, index in self.l_index.items()} + def l_val_target(self): return {l: index.val_target for l, index in self.l_index.items()} + def l_test_target(self): + return {l: index.test_target for l, index in self.l_index.items()} + def l_test_index(self): return {l: index.test_index for l, index in self.l_index.items()} @@ -373,161 +283,179 @@ class MultilingualIndex: def l_val(self): return self.l_val_index(), self.l_val_target() + def l_test(self): + return self.l_test_index(), self.l_test_target() -class Batch: - def __init__(self, batchsize, batches_per_epoch, languages, lpad, max_pad_length=500): - self.batchsize = batchsize - self.batches_per_epoch = batches_per_epoch - self.languages = languages - self.lpad=lpad - self.max_pad_length=max_pad_length - self.init_offset() + def l_train_raw(self): + return self.l_train_raw_index(), self.l_train_target() - def init_offset(self): - self.offset = {lang: 0 for lang in self.languages} + def l_val_raw(self): + return self.l_val_raw_index(), self.l_val_target() - def batchify(self, l_index, l_post, l_bert, llabels): # TODO: add bert embedding here... 
- langs = self.languages - l_num_samples = {l:len(l_index[l]) for l in langs} + def l_test_raw(self): + return self.l_test_raw_index(), self.l_test_target() - max_samples = max(l_num_samples.values()) - n_batches = max_samples // self.batchsize + 1 * (max_samples % self.batchsize > 0) - if self.batches_per_epoch != -1 and self.batches_per_epoch < n_batches: - n_batches = self.batches_per_epoch + def l_devel_raw(self): + return self.l_devel_raw_index(), self.l_devel_target() - for b in range(n_batches): - for lang in langs: - index, labels = l_index[lang], llabels[lang] - offset = self.offset[lang] - if offset >= l_num_samples[lang]: - offset = 0 - limit = offset+self.batchsize - - batch_slice = slice(offset, limit) - batch = index[batch_slice] - batch_labels = labels[batch_slice].toarray() - - post = None - if l_post is not None: - post = torch.FloatTensor(l_post[lang][batch_slice]).cuda() - - bert_emb = None - if l_bert is not None: - bert_emb = torch.FloatTensor(l_bert[lang][batch_slice]).cuda() - - batch = pad(batch, pad_index=self.lpad[lang], max_pad_length=self.max_pad_length) - - batch = torch.LongTensor(batch).cuda() - target = torch.FloatTensor(batch_labels).cuda() - - self.offset[lang] = limit - - yield batch, post, bert_emb, target, lang + def get_l_pad_index(self): + return {l: index.get_pad_index() for l, index in self.l_index.items()} -def batchify(l_index, l_post, llabels, batchsize, lpad, max_pad_length=500): - langs = sorted(l_index.keys()) - nsamples = max([len(l_index[l]) for l in langs]) - nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0) - for b in range(nbatches): - for lang in langs: - index, labels = l_index[lang], llabels[lang] +class Index: + def __init__(self, devel_raw, devel_target, test_raw, test_target, lang): + """ + Monolingual Index, takes care of tokenizing raw data, converting strings to ids, splitting the data into + training and validation. 
+ :param devel_raw: list of strings, list of raw training texts + :param devel_target: + :param test_raw: list of strings, list of raw test texts + :param lang: list, list of languages contained in the dataset + """ + self.lang = lang + self.devel_raw = devel_raw + self.devel_target = devel_target + self.test_raw = test_raw + self.test_target = test_target - if b * batchsize >= len(index): - continue - batch = index[b*batchsize:(b+1)*batchsize] - batch_labels = labels[b*batchsize:(b+1)*batchsize].toarray() - post = None - if l_post is not None: - post = torch.FloatTensor(l_post[lang][b*batchsize:(b+1)*batchsize]).cuda() - batch = pad(batch, pad_index=lpad[lang], max_pad_length=max_pad_length) - batch = torch.LongTensor(batch) - target = torch.FloatTensor(batch_labels) - yield batch.cuda(), post, target.cuda(), lang + def index(self, pretrained_vocabulary, analyzer, vocabulary): + self.word2index = dict(vocabulary) + known_words = set(self.word2index.keys()) + if pretrained_vocabulary is not None: + known_words.update(pretrained_vocabulary) + + self.word2index['UNKTOKEN'] = len(self.word2index) + self.word2index['PADTOKEN'] = len(self.word2index) + self.unk_index = self.word2index['UNKTOKEN'] + self.pad_index = self.word2index['PADTOKEN'] + + # index documents and keep track of test terms outside the development vocabulary that are in Muse (if available) + self.out_of_vocabulary = dict() + self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index, + self.out_of_vocabulary) + self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index, + self.out_of_vocabulary) + + self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary) + + print(f'[indexing complete for lang {self.lang}] vocabulary-size={self.vocabsize}') + + def get_pad_index(self): + return self.pad_index + + def train_val_split(self, val_prop, max_val, seed): + devel = self.devel_index + target = self.devel_target + devel_raw = self.devel_raw + + val_size = int(min(len(devel) * val_prop, max_val)) + + self.train_index, self.val_index, self.train_target, self.val_target, self.train_raw, self.val_raw = \ + train_test_split( + devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True) + + print( + f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}') + + def get_word_list(self): + def extract_word_list(word2index): + return [w for w, i in sorted(word2index.items(), key=lambda x: x[1])] + + word_list = extract_word_list(self.word2index) + word_list += extract_word_list(self.out_of_vocabulary) + return word_list + + def compose_embedding_matrix(self, pretrained, supervised, Xtr=None, Ytr=None): + print(f'[generating embedding matrix for lang {self.lang}]') + + self.wce_range = None + embedding_parts = [] + + if pretrained is not None: + print('\t[pretrained-matrix]') + embedding_parts.append(pretrained) + del pretrained + + if supervised: + print('\t[supervised-matrix]') + F = supervised_embeddings_tfidf(Xtr, Ytr) + num_missing_rows = self.vocabsize - F.shape[0] + F = np.vstack((F, np.zeros(shape=(num_missing_rows, F.shape[1])))) + F = torch.from_numpy(F).float() + + offset = 0 + if embedding_parts: + offset = embedding_parts[0].shape[1] + self.wce_range = [offset, offset + F.shape[1]] + embedding_parts.append(F) + + self.embedding_matrix = torch.cat(embedding_parts, dim=1) + + print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]') -def 
batchify_unlabelled(index_list, batchsize, pad_index, max_pad_length=500): - nsamples = len(index_list) - nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0) - for b in range(nbatches): - batch = index_list[b*batchsize:(b+1)*batchsize] - batch = pad(batch, pad_index=pad_index, max_pad_length=max_pad_length) - batch = torch.LongTensor(batch) - yield batch.cuda() - - -def clip_gradient(model, clip_value=1e-1): - params = list(filter(lambda p: p.grad is not None, model.parameters())) - for p in params: - p.grad.data.clamp_(-clip_value, clip_value) - - -def predict(logits, classification_type='multilabel'): - if classification_type == 'multilabel': - prediction = torch.sigmoid(logits) > 0.5 - elif classification_type == 'singlelabel': - prediction = torch.argmax(logits, dim=1).view(-1, 1) - else: - print('unknown classification type') - - return prediction.detach().cpu().numpy() - - -def count_parameters(model): - return sum(p.numel() for p in model.parameters() if p.requires_grad) - - -def show_gpu(msg): +def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary): """ - ref: https://discuss.pytorch.org/t/access-gpu-memory-usage-in-pytorch/3192/4 + Index (i.e., replaces word strings with numerical indexes) a list of string documents + :param data: list of string documents + :param vocab: a fixed mapping [str]->[int] of words to indexes + :param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be retained + because they are anyway contained in a pre-trained embedding set that we know in advance) + :param analyzer: the preprocessor in charge of transforming the document string into a chain of string words + :param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep + :param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those words that + are not in the original vocab but that are in the known_words + :return: """ - - def query(field): - return (subprocess.check_output( - ['nvidia-smi', f'--query-gpu={field}', - '--format=csv,nounits,noheader'], - encoding='utf-8')) - - def to_int(result): - return int(result.strip().split('\n')[0]) - - used = to_int(query('memory.used')) - total = to_int(query('memory.total')) - pct = used / total - print('\n' + msg, f'{100 * pct:2.1f}% ({used} out of {total})') + indexes = [] + vocabsize = len(vocab) + unk_count = 0 + knw_count = 0 + out_count = 0 + # pbar = tqdm(data, desc=f'indexing') + for text in data: + words = analyzer(text) + index = [] + for word in words: + if word in vocab: + idx = vocab[word] + else: + if word in known_words: + if word not in out_of_vocabulary: + out_of_vocabulary[word] = vocabsize + len(out_of_vocabulary) + idx = out_of_vocabulary[word] + out_count += 1 + else: + idx = unk_index + unk_count += 1 + index.append(idx) + indexes.append(index) + knw_count += len(index) + # pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]' + # f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]') + return indexes -class TfidfVectorizerMultilingual: - - def __init__(self, **kwargs): - self.kwargs = kwargs - - def fit(self, lX, ly=None): - self.langs = sorted(lX.keys()) - self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs} - return self - - def transform(self, lX): - return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs} - - def fit_transform(self, lX, ly=None): - return 
self.fit(lX, ly).transform(lX) - - def vocabulary(self, l=None): - if l is None: - return {l: self.vectorizer[l].vocabulary_ for l in self.langs} - else: - return self.vectorizer[l].vocabulary_ - - def get_analyzer(self, l=None): - if l is None: - return {l: self.vectorizer[l].build_analyzer() for l in self.langs} - else: - return self.vectorizer[l].build_analyzer() +def is_true(tensor, device): + return torch.where(tensor == 1, torch.Tensor([1]).to(device), torch.Tensor([0]).to(device)) -def get_learner(calibrate=False, kernel='linear', C=1): - return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=C, random_state=1, gamma='auto', verbose=False) +def is_false(tensor, device): + return torch.where(tensor == 0, torch.Tensor([1]).to(device), torch.Tensor([0]).to(device)) + + +def define_pad_length(index_list): + lengths = [len(index) for index in index_list] + return int(np.mean(lengths) + np.std(lengths)) + + +def pad(index_list, pad_index, max_pad_length=None): + pad_length = np.max([len(index) for index in index_list]) + if max_pad_length is not None: + pad_length = min(pad_length, max_pad_length) + for i, indexes in enumerate(index_list): + index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length] + return index_list def get_params(optimc=False): @@ -538,20 +466,14 @@ def get_params(optimc=False): return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}] -def get_method_name(dataset, posteriors, supervised, pretrained, mbert, gru, - gruMUSE, gruWCE, agg, allprob): - _id = '-' - _id_conf = [posteriors, supervised, pretrained, mbert, gru] +def get_method_name(args): + _id = '' + _id_conf = [args.post_embedder, args.wce_embedder, args.muse_embedder, args.bert_embedder, args.gru_embedder] _id_name = ['X', 'W', 'M', 'B', 'G'] for i, conf in enumerate(_id_conf): if conf: _id += _id_name[i] - _id = _id if not gruMUSE else _id + '_muse' - _id = _id if not gruWCE else _id + '_wce' - _id = _id if not agg else _id + '_mean' - _id = _id if not allprob else _id + '_allprob' - - _dataset_path = dataset.split('/')[-1].split('_') + _id = _id if not args.rnn_wce else _id + '_wce' + _dataset_path = args.dataset.split('/')[-1].split('_') dataset_id = _dataset_path[0] + _dataset_path[-1] return _id, dataset_id - diff --git a/src/util/csv_log.py b/src/util/csv_log.py deleted file mode 100755 index 8c11e36..0000000 --- a/src/util/csv_log.py +++ /dev/null @@ -1,60 +0,0 @@ -import os -import pandas as pd -pd.set_option('display.max_rows', 500) -pd.set_option('display.max_columns', 500) -pd.set_option('display.width', 1000) - - -class CSVLog: - - def __init__(self, file, columns=None, autoflush=True, verbose=False, overwrite=False): - self.file = file - self.autoflush = autoflush - self.verbose = verbose - if os.path.exists(file) and not overwrite: - self.tell('Loading existing file from {}'.format(file)) - self.df = pd.read_csv(file, sep='\t') - self.columns = sorted(self.df.columns.values.tolist()) - else: - self.tell('File {} does not exist or overwrite=True. 
Creating new frame.'.format(file)) - assert columns is not None, 'columns cannot be None' - self.columns = sorted(columns) - dir = os.path.dirname(self.file) - if dir and not os.path.exists(dir): os.makedirs(dir) - self.df = pd.DataFrame(columns=self.columns) - self.defaults={} - - def already_calculated(self, **kwargs): - df = self.df - if df.shape[0]==0: - return False - if len(kwargs)==0: - kwargs = self.defaults - for key,val in kwargs.items(): - df = df.loc[df[key]==val] - if df.shape[0]==0: return False - return True - - def set_default(self, param, value): - self.defaults[param]=value - - def add_row(self, **kwargs): - for key in self.defaults.keys(): - if key not in kwargs: - kwargs[key]=self.defaults[key] - colums = sorted(list(kwargs.keys())) - values = [kwargs[col_i] for col_i in colums] - s = pd.Series(values, index=self.columns) - self.df = self.df.append(s, ignore_index=True) - if self.autoflush: self.flush() - # self.tell(s.to_string()) - self.tell(kwargs) - - def flush(self): - self.df.to_csv(self.file, index=False, sep='\t') - - def tell(self, msg): - if self.verbose: print(msg) - - - diff --git a/src/util/decompositions.py b/src/util/decompositions.py deleted file mode 100644 index 9d14a0c..0000000 --- a/src/util/decompositions.py +++ /dev/null @@ -1,50 +0,0 @@ -from sklearn.decomposition import PCA -import numpy as np -import matplotlib.pyplot as plt - - -def run_pca(dim, X): - """ - :param dim: number of pca components to keep - :param X: dictionary str(lang): matrix - :return: dict lang: reduced matrix - """ - r = dict() - pca = PCA(n_components=dim) - for lang in X.keys(): - r[lang] = pca.fit_transform(X[lang]) - return r - - -def get_optimal_dim(X, embed_type): - """ - :param X: dict str(lang) : csr_matrix of embeddings unsupervised or supervised - :param embed_type: (str) embedding matrix type: S or U (WCE supervised or U unsupervised MUSE/FASTTEXT) - :return: - """ - _idx = [] - - plt.figure(figsize=(15, 10)) - if embed_type == 'U': - plt.title(f'Unsupervised Embeddings {"TODO"} Explained Variance') - else: - plt.title(f'WCE Explained Variance') - plt.xlabel('Number of Components') - plt.ylabel('Variance (%)') - - for lang in X.keys(): - pca = PCA(n_components=X[lang].shape[1]) - pca.fit(X[lang]) - _r = pca.explained_variance_ratio_ - _r = np.cumsum(_r) - plt.plot(_r, label=lang) - for i in range(len(_r) - 1, 1, -1): - delta = _r[i] - _r[i - 1] - if delta > 0: - _idx.append(i) - break - best_n = max(_idx) - plt.axvline(best_n, color='r', label='optimal N') - plt.legend() - plt.show() - return best_n diff --git a/src/util/early_stop.py b/src/util/early_stop.py deleted file mode 100755 index 7d72cde..0000000 --- a/src/util/early_stop.py +++ /dev/null @@ -1,71 +0,0 @@ -#adapted from https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py -import torch -from transformers import BertForSequenceClassification -from time import time -from util.file import create_if_not_exist -import warnings - -class EarlyStopping: - - def __init__(self, model, optimizer, patience=20, verbose=True, checkpoint='./checkpoint.pt', is_bert=False): - # set patience to 0 or -1 to avoid stopping, but still keeping track of the best value and model parameters - self.patience_limit = patience - self.patience = patience - self.verbose = verbose - self.best_score = None - self.best_epoch = None - self.stop_time = None - self.checkpoint = checkpoint - self.model = model - self.optimizer = optimizer - self.STOP = False - self.is_bert = is_bert - - def __call__(self, watch_score, 
epoch): - - if self.STOP: - return - - if self.best_score is None or watch_score >= self.best_score: - self.best_score = watch_score - self.best_epoch = epoch - self.stop_time = time() - if self.checkpoint: - self.print(f'[early-stop] improved, saving model in {self.checkpoint}') - if self.is_bert: - print(f'Serializing Huggingface model...') - create_if_not_exist(self.checkpoint) - self.model.save_pretrained(self.checkpoint) - else: - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - torch.save(self.model, self.checkpoint) - # with open(self.checkpoint) - # torch.save({'state_dict': self.model.state_dict(), - # 'optimizer_state_dict': self.optimizer.state_dict()}, self.checkpoint) - else: - self.print(f'[early-stop] improved') - self.patience = self.patience_limit - else: - self.patience -= 1 - if self.patience == 0: - self.STOP = True - self.print(f'[early-stop] patience exhausted') - else: - if self.patience>0: # if negative, then early-stop is ignored - self.print(f'[early-stop] patience={self.patience}') - - def reinit_counter(self): - self.STOP = False - self.patience=self.patience_limit - - def restore_checkpoint(self): - print(f'restoring best model from epoch {self.best_epoch}...') - if self.is_bert: - return BertForSequenceClassification.from_pretrained(self.checkpoint) - else: - return torch.load(self.checkpoint) - - def print(self, msg): - if self.verbose: - print(msg) diff --git a/src/util/embeddings_manager.py b/src/util/embeddings_manager.py new file mode 100644 index 0000000..0526582 --- /dev/null +++ b/src/util/embeddings_manager.py @@ -0,0 +1,104 @@ +from abc import ABC, abstractmethod + +import numpy as np +import torch +from torchtext.vocab import Vectors + +from src.util.SIF_embed import remove_pc + + +class PretrainedEmbeddings(ABC): + + def __init__(self): + super().__init__() + + @abstractmethod + def vocabulary(self): pass + + @abstractmethod + def dim(self): pass + + @classmethod + def reindex(cls, words, word2index): + if isinstance(words, dict): + words = list(zip(*sorted(words.items(), key=lambda x: x[1])))[0] + + source_idx, target_idx = [], [] + for i, word in enumerate(words): + if word not in word2index: + continue + j = word2index[word] + source_idx.append(i) + target_idx.append(j) + source_idx = np.asarray(source_idx) + target_idx = np.asarray(target_idx) + return source_idx, target_idx + + +class MuseLoader: + def __init__(self, langs, cache): + self.langs = langs + self.lEmbed = {} + self.lExtracted = {} + for lang in self.langs: + print(f'Loading vectors for {lang}...') + self.lEmbed[lang] = Vectors(f'wiki.multi.{lang}.vec', cache) + + def dim(self): + return self.lEmbed[list(self.lEmbed.keys())[0]].dim + + def vocabulary(self): + return {lang: set(self.lEmbed[lang].stoi.keys()) for lang in self.langs} + + def extract(self, lVoc): + """ + Reindex pretrained loaded embedding in order to match indexes assigned by scikit vectorizer. 
Such indexes + are consistent with those used by Word Class Embeddings (since we deploy the same vectorizer) + :param lVoc: dict {lang : {word : id}} + :return: torch embedding matrix of extracted embeddings i.e., words in lVoc + """ + for lang, words in lVoc.items(): + print(f'Extracting words for lang {lang}...') + # words = list(zip(*sorted(lVoc[lang].items(), key=lambda x: x[1])))[0] + source_id, target_id = PretrainedEmbeddings.reindex(words, self.lEmbed[lang].stoi) + extraction = torch.zeros((len(words), self.dim())) + extraction[source_id] = self.lEmbed[lang].vectors[target_id] + self.lExtracted[lang] = extraction + return self.lExtracted + + def get_lEmbeddings(self): + return {lang: self.lEmbed[lang].vectors for lang in self.langs} + + +def XdotM(X, M, sif): + E = X.dot(M) + if sif: + E = remove_pc(E, npc=1) + return E + + +def wce_matrix(X, Y): + wce = supervised_embeddings_tfidf(X, Y) + wce = zscores(wce, axis=0) + return wce + + +def supervised_embeddings_tfidf(X, Y): + tfidf_norm = X.sum(axis=0) + tfidf_norm[tfidf_norm == 0] = 1 + F = (X.T).dot(Y) / tfidf_norm.T + return F + + +def zscores(X, axis=0): + """ + scipy.stats.zscores does not avoid division by 0, which can indeed occur + :param X: + :param axis: + :return: + """ + std = np.clip(np.std(X, ddof=1, axis=axis), 1e-5, None) + mean = np.mean(X, axis=axis) + return (X - mean) / std + + diff --git a/src/util/evaluation.py b/src/util/evaluation.py index 41a2813..45b8b2b 100644 --- a/src/util/evaluation.py +++ b/src/util/evaluation.py @@ -1,102 +1,19 @@ -# from sklearn.externals.joblib import Parallel, delayed from joblib import Parallel, delayed -from util.metrics import * -from sklearn.metrics import f1_score -import numpy as np -import time + +from src.util.metrics import * def evaluation_metrics(y, y_): - if len(y.shape)==len(y_.shape)==1 and len(np.unique(y))>2: #single-label - raise NotImplementedError()#return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro') - else: #the metrics I implemented assume multiclass multilabel classification as binary classifiers + if len(y.shape) == len(y_.shape) == 1 and len(np.unique(y)) > 2: # single-label + raise NotImplementedError() # return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro') + else: # the metrics I implemented assume multiclass multilabel classification as binary classifiers return macroF1(y, y_), microF1(y, y_), macroK(y, y_), microK(y, y_) -def soft_evaluation_metrics(y, y_): - if len(y.shape)==len(y_.shape)==1 and len(np.unique(y))>2: #single-label - raise NotImplementedError()#return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro') - else: #the metrics I implemented assume multiclass multilabel classification as binary classifiers - return smoothmacroF1(y, y_), smoothmicroF1(y, y_), smoothmacroK(y, y_), smoothmicroK(y, y_) - - def evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1): - print('evaluation (n_jobs={})'.format(n_jobs)) if n_jobs == 1: return {lang: metrics(ly_true[lang], ly_pred[lang]) for lang in ly_true.keys()} else: langs = list(ly_true.keys()) evals = Parallel(n_jobs=n_jobs)(delayed(metrics)(ly_true[lang], ly_pred[lang]) for lang in langs) return {lang: evals[i] for i, lang in enumerate(langs)} - - -def average_results(l_eval, show=True): - metrics = [] - for lang in l_eval.keys(): - macrof1, microf1, macrok, microk = l_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - if show: - print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1)) - - ave = 
np.mean(np.array(metrics), axis=0) - if show: - print('Averages: MF1, mF1, MK, mK', ave) - return ave - - -def evaluate_method(polylingual_method, lX, ly, predictor=None, soft=False, return_time=False): - tinit = time.time() - print('prediction for test') - assert set(lX.keys()) == set(ly.keys()), 'inconsistent dictionaries in evaluate' - n_jobs = polylingual_method.n_jobs if hasattr(polylingual_method, 'n_jobs') else -1 - - if predictor is None: - predictor = polylingual_method.predict - - metrics = evaluation_metrics - if soft is True: - metrics = soft_evaluation_metrics - ly_ = predictor(lX, ly) - - eval_ = evaluate(ly, ly_, metrics=metrics, n_jobs=n_jobs) - if return_time: - return eval_, time.time()-tinit - else: - return eval_ - - -def evaluate_single_lang(polylingual_method, X, y, lang, predictor=None, soft=False): - print('prediction for test in a single language') - if predictor is None: - predictor = polylingual_method.predict - - metrics = evaluation_metrics - if soft is True: - metrics = soft_evaluation_metrics - - ly_ = predictor({lang:X}) - return metrics(y, ly_[lang]) - - -def get_binary_counters(polylingual_method, lX, ly, predictor=None): - print('prediction for test') - assert set(lX.keys()) == set(ly.keys()), 'inconsistent dictionaries in evaluate' - n_jobs = polylingual_method.n_jobs - if predictor is None: - predictor = polylingual_method.predict - ly_ = predictor(lX) - print('evaluation (n_jobs={})'.format(n_jobs)) - if n_jobs == 1: - return {lang: binary_counters(ly[lang], ly_[lang]) for lang in ly.keys()} - else: - langs = list(ly.keys()) - evals = Parallel(n_jobs=n_jobs)(delayed(binary_counters)(ly[lang], ly_[lang]) for lang in langs) - return {lang: evals[i] for i, lang in enumerate(langs)} - - -def binary_counters(y, y_): - y = np.reshape(y, (-1)) - assert y.shape==y_.shape and len(y.shape)==1, 'error, binary vector expected' - counters = hard_single_metric_statistics(y, y_) - return counters.tp, counters.tn, counters.fp, counters.fn - diff --git a/src/util/file.py b/src/util/file.py index a3d0a3a..8754f5a 100644 --- a/src/util/file.py +++ b/src/util/file.py @@ -1,7 +1,6 @@ +import urllib from os import listdir, makedirs from os.path import isdir, isfile, join, exists, dirname -#from sklearn.externals.six.moves import urllib -import urllib from pathlib import Path @@ -14,6 +13,7 @@ def download_file(url, archive_filename): urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress) print("") + def download_file_if_not_exists(url, archive_path): if exists(archive_path): return makedirs_if_not_exist(dirname(archive_path)) @@ -25,20 +25,26 @@ def ls(dir, typecheck): el.sort() return el + def list_dirs(dir): return ls(dir, typecheck=isdir) + def list_files(dir): return ls(dir, typecheck=isfile) + def makedirs_if_not_exist(path): if not exists(path): makedirs(path) + def create_if_not_exist(path): if not exists(path): makedirs(path) + def get_parent_name(path): return Path(path).parent + def get_file_name(path): return Path(path).name diff --git a/src/util/metrics.py b/src/util/metrics.py index 9f6bc24..7a6079e 100644 --- a/src/util/metrics.py +++ b/src/util/metrics.py @@ -1,24 +1,12 @@ import numpy as np -import numpy as np -from scipy.sparse import lil_matrix, issparse -from sklearn.metrics import f1_score, accuracy_score - -""" -Scikit learn provides a full set of evaluation metrics, but they treat special cases differently. 
-I.e., when the number of true positives, false positives, and false negatives ammount to 0, all -affected metrices (precision, recall, and thus f1) output 0 in Scikit learn. -We adhere to the common practice of outputting 1 in this case since the classifier has correctly -classified all examples as negatives. -""" - class ContTable: def __init__(self, tp=0, tn=0, fp=0, fn=0): - self.tp=tp - self.tn=tn - self.fp=fp - self.fn=fn + self.tp = tp + self.tn = tn + self.fp = fp + self.fn = fn def get_d(self): return self.tp + self.tn + self.fp + self.fn @@ -57,16 +45,20 @@ class ContTable: def __add__(self, other): return ContTable(tp=self.tp + other.tp, tn=self.tn + other.tn, fp=self.fp + other.fp, fn=self.fn + other.fn) + def accuracy(cell): return (cell.tp + cell.tn)*1.0 / (cell.tp + cell.fp + cell.fn + cell.tn) + def f1(cell): num = 2.0 * cell.tp den = 2.0 * cell.tp + cell.fp + cell.fn - if den>0: return num / den - #we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative + if den > 0: + return num / den + # we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative return 1.0 + def K(cell): specificity, recall = 0., 0. @@ -85,45 +77,50 @@ def K(cell): else: return specificity + recall - 1. -#computes the (hard) counters tp, fp, fn, and tn fron a true and predicted vectors of hard decisions -#true_labels and predicted_labels are two vectors of shape (number_documents,) -def hard_single_metric_statistics(true_labels, predicted_labels): - assert len(true_labels)==len(predicted_labels), "Format not consistent between true and predicted labels." - nd = len(true_labels) - tp = np.sum(predicted_labels[true_labels==1]) - fp = np.sum(predicted_labels[true_labels == 0]) - fn = np.sum(true_labels[predicted_labels == 0]) - tn = nd - (tp+fp+fn) - return ContTable(tp=tp, tn=tn, fp=fp, fn=fn) -#computes the (soft) contingency table where tp, fp, fn, and tn are the cumulative masses for the posterioir +# if the classifier is single class, then the prediction is a vector of shape=(nD,) which causes issues when compared +# to the true labels (of shape=(nD,1)). This method increases the dimensions of the predictions. +def __check_consistency_and_adapt(true_labels, predictions): + if predictions.ndim == 1: + return __check_consistency_and_adapt(true_labels, np.expand_dims(predictions, axis=1)) + if true_labels.ndim == 1: + return __check_consistency_and_adapt(np.expand_dims(true_labels, axis=1), predictions) + if true_labels.shape != predictions.shape: + raise ValueError("True and predicted label matrices shapes are inconsistent %s %s." + % (true_labels.shape, predictions.shape)) + _, nC = true_labels.shape + return true_labels, predictions, nC + + +# computes the (soft) contingency table where tp, fp, fn, and tn are the cumulative masses for the posterioir # probabilitiesfron with respect to the true binary labels -#true_labels and posterior_probabilities are two vectors of shape (number_documents,) +# true_labels and posterior_probabilities are two vectors of shape (number_documents,) def soft_single_metric_statistics(true_labels, posterior_probabilities): - assert len(true_labels)==len(posterior_probabilities), "Format not consistent between true and predicted labels." + assert len(true_labels) == len(posterior_probabilities), "Format not consistent between true and predicted labels." tp = np.sum(posterior_probabilities[true_labels == 1]) fn = np.sum(1. 
- posterior_probabilities[true_labels == 1]) fp = np.sum(posterior_probabilities[true_labels == 0]) tn = np.sum(1. - posterior_probabilities[true_labels == 0]) return ContTable(tp=tp, tn=tn, fp=fp, fn=fn) -#if the classifier is single class, then the prediction is a vector of shape=(nD,) which causes issues when compared -#to the true labels (of shape=(nD,1)). This method increases the dimensions of the predictions. -def __check_consistency_and_adapt(true_labels, predictions): - if predictions.ndim == 1: - return __check_consistency_and_adapt(true_labels, np.expand_dims(predictions, axis=1)) - if true_labels.ndim == 1: - return __check_consistency_and_adapt(np.expand_dims(true_labels, axis=1),predictions) - if true_labels.shape != predictions.shape: - raise ValueError("True and predicted label matrices shapes are inconsistent %s %s." - % (true_labels.shape, predictions.shape)) - _,nC = true_labels.shape - return true_labels, predictions, nC + +# computes the (hard) counters tp, fp, fn, and tn fron a true and predicted vectors of hard decisions +# true_labels and predicted_labels are two vectors of shape (number_documents,) +def hard_single_metric_statistics(true_labels, predicted_labels): + assert len(true_labels) == len(predicted_labels), "Format not consistent between true and predicted labels." + nd = len(true_labels) + tp = np.sum(predicted_labels[true_labels == 1]) + fp = np.sum(predicted_labels[true_labels == 0]) + fn = np.sum(true_labels[predicted_labels == 0]) + tn = nd - (tp+fp+fn) + return ContTable(tp=tp, tn=tn, fp=fp, fn=fn) + def macro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics): true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels) return np.mean([metric(metric_statistics(true_labels[:, c], predicted_labels[:, c])) for c in range(nC)]) + def micro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics): true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels) @@ -134,123 +131,22 @@ def micro_average(true_labels, predicted_labels, metric, metric_statistics=hard_ return metric(accum) -#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format -def macroF1(true_labels, predicted_labels): - return macro_average(true_labels,predicted_labels, f1) -#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format +# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format +def macroF1(true_labels, predicted_labels): + return macro_average(true_labels, predicted_labels, f1) + + +# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format def microF1(true_labels, predicted_labels): return micro_average(true_labels, predicted_labels, f1) -#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format -def macroK(true_labels, predicted_labels): - return macro_average(true_labels,predicted_labels, K) -#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format +# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format +def macroK(true_labels, predicted_labels): + return macro_average(true_labels, predicted_labels, K) + + +# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer 
format def microK(true_labels, predicted_labels): return micro_average(true_labels, predicted_labels, K) - -#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix -#of the same shape containing real values in [0,1] -def smoothmacroF1(true_labels, posterior_probabilities): - return macro_average(true_labels,posterior_probabilities, f1, metric_statistics=soft_single_metric_statistics) - -#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix -#of the same shape containing real values in [0,1] -def smoothmicroF1(true_labels, posterior_probabilities): - return micro_average(true_labels, posterior_probabilities, f1, metric_statistics=soft_single_metric_statistics) - -#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix -#of the same shape containing real values in [0,1] -def smoothmacroK(true_labels, posterior_probabilities): - return macro_average(true_labels,posterior_probabilities, K, metric_statistics=soft_single_metric_statistics) - -#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix -#of the same shape containing real values in [0,1] -def smoothmicroK(true_labels, posterior_probabilities): - return micro_average(true_labels, posterior_probabilities, K, metric_statistics=soft_single_metric_statistics) - - - - -""" -Scikit learn provides a full set of evaluation metrics, but they treat special cases differently. -I.e., when the number of true positives, false positives, and false negatives ammount to 0, all -affected metrices (precision, recall, and thus f1) output 0 in Scikit learn. -We adhere to the common practice of outputting 1 in this case since the classifier has correctly -classified all examples as negatives. 
-""" - -def evaluation(y_true, y_pred, classification_type): - - if classification_type == 'multilabel': - eval_function = multilabel_eval - elif classification_type == 'singlelabel': - eval_function = singlelabel_eval - - Mf1, mf1, accuracy = eval_function(y_true, y_pred) - - return Mf1, mf1, accuracy - - -def multilabel_eval(y, y_): - - tp = y.multiply(y_) - - fn = lil_matrix(y.shape) - true_ones = y==1 - fn[true_ones]=1-tp[true_ones] - - fp = lil_matrix(y.shape) - pred_ones = y_==1 - if pred_ones.nnz>0: - fp[pred_ones]=1-tp[pred_ones] - - #macro-f1 - tp_macro = np.asarray(tp.sum(axis=0), dtype=int).flatten() - fn_macro = np.asarray(fn.sum(axis=0), dtype=int).flatten() - fp_macro = np.asarray(fp.sum(axis=0), dtype=int).flatten() - - pos_pred = tp_macro+fp_macro - pos_true = tp_macro+fn_macro - prec=np.zeros(shape=tp_macro.shape,dtype=float) - rec=np.zeros(shape=tp_macro.shape,dtype=float) - np.divide(tp_macro, pos_pred, out=prec, where=pos_pred>0) - np.divide(tp_macro, pos_true, out=rec, where=pos_true>0) - den=prec+rec - - macrof1=np.zeros(shape=tp_macro.shape,dtype=float) - np.divide(np.multiply(prec,rec),den,out=macrof1,where=den>0) - macrof1 *=2 - - macrof1[(pos_pred==0)*(pos_true==0)]=1 - macrof1 = np.mean(macrof1) - - #micro-f1 - tp_micro = tp_macro.sum() - fn_micro = fn_macro.sum() - fp_micro = fp_macro.sum() - pos_pred = tp_micro + fp_micro - pos_true = tp_micro + fn_micro - prec = (tp_micro / pos_pred) if pos_pred>0 else 0 - rec = (tp_micro / pos_true) if pos_true>0 else 0 - den = prec+rec - microf1 = 2*prec*rec/den if den>0 else 0 - if pos_pred==pos_true==0: - microf1=1 - - #accuracy - ndecisions = np.multiply(*y.shape) - tn = ndecisions - (tp_micro+fn_micro+fp_micro) - acc = (tp_micro+tn)/ndecisions - - return macrof1,microf1,acc - - -def singlelabel_eval(y, y_): - if issparse(y_): y_ = y_.toarray().flatten() - macrof1 = f1_score(y, y_, average='macro') - microf1 = f1_score(y, y_, average='micro') - acc = accuracy_score(y, y_) - return macrof1,microf1,acc - diff --git a/src/util/parser_options.py b/src/util/parser_options.py deleted file mode 100644 index 0e751bd..0000000 --- a/src/util/parser_options.py +++ /dev/null @@ -1,91 +0,0 @@ -from optparse import OptionParser - -parser = OptionParser(usage="usage: %prog datapath [options]") - -parser.add_option("-d", dest='dataset', type=str, metavar='datasetpath', help=f'path to the pickled dataset') - -parser.add_option("-o", "--output", dest="output", - help="Result file", type=str, default='../log/multiModal_log.csv') - -parser.add_option("-X", "--posteriors", dest="posteriors", action='store_true', - help="Add posterior probabilities to the document embedding representation", default=False) - -parser.add_option("-W", "--supervised", dest="supervised", action='store_true', - help="Add supervised (Word-Class Embeddings) to the document embedding representation", default=False) - -parser.add_option("-M", "--pretrained", dest="pretrained", action='store_true', - help="Add pretrained MUSE embeddings to the document embedding representation", default=False) - -parser.add_option("-B", "--mbert", dest="mbert", action='store_true', - help="Add multilingual Bert (mBert) document embedding representation", default=False) - -parser.add_option('-G', dest='gruViewGenerator', action='store_true', - help="Add document embedding generated via recurrent net (GRU)", default=False) - -parser.add_option("--l2", dest="l2", action='store_true', - help="Activates l2 normalization as a post-processing for the document embedding views", - default=False) 
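A minimal usage sketch for the reorganized hard macro/micro metrics in src/util/metrics.py above; the toy label matrices are made up and follow the sklearn.preprocessing.MultiLabelBinarizer format (n_docs, n_classes) that the comments describe.

import numpy as np
from src.util.metrics import macroF1, microF1, macroK, microK

y_true = np.array([[1, 0, 1],    # 2 documents x 3 classes, binary labels
                   [0, 1, 0]])
y_pred = np.array([[1, 0, 0],
                   [0, 1, 0]])

# macro-* computes the metric class by class and then averages the results;
# micro-* first accumulates a single contingency table over all classes.
print('F1:', macroF1(y_true, y_pred), microF1(y_true, y_pred))
print('K :', macroK(y_true, y_pred), microK(y_true, y_pred))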
- -parser.add_option("--allprob", dest="allprob", action='store_true', - help="All views are generated as posterior probabilities. This affects the supervised and pretrained" - "embeddings, for which a calibrated classifier is generated, which generates the posteriors", - default=False) - -parser.add_option("--feat-weight", dest="feat_weight", - help="Term weighting function to weight the averaged embeddings", type=str, default='tfidf') - -parser.add_option("-w", "--we-path", dest="we_path", - help="Path to the MUSE polylingual word embeddings", default='../embeddings') - -parser.add_option("-s", "--set_c", dest="set_c", type=float, - help="Set the C parameter", default=1) - -parser.add_option("-c", "--optimc", dest="optimc", action='store_true', - help="Optimize hyperparameters", default=False) - -parser.add_option("-j", "--n_jobs", dest="n_jobs", type=int, - help="Number of parallel jobs (default is -1, all)", default=-1) - -parser.add_option("-p", "--pca", dest="max_labels_S", type=int, - help="If smaller than number of target classes, PCA will be applied to supervised matrix. ", - default=300) - -parser.add_option("-r", "--remove-pc", dest="sif", action='store_true', - help="Remove common component when computing dot product of word embedding matrices", default=False) - -parser.add_option("-z", "--zscore", dest="zscore", action='store_true', - help="Z-score normalize matrices (WCE and MUSE)", default=False) - -parser.add_option("-a", "--agg", dest="agg", action='store_true', - help="Set aggregation function of the common Z-space to average (Default: concatenation)", - default=False) - -# ------------------------------------------------------------------------------------ - -parser.add_option('--hidden', type=int, default=512, metavar='int', - help='hidden lstm size (default: 512)') - -parser.add_option('--sup-drop', type=float, default=0.5, metavar='[0.0, 1.0]', - help='dropout probability for the supervised matrix (default: 0.5)') - -parser.add_option('--tunable', action='store_true', default=False, - help='pretrained embeddings are tunable from the beginning (default False, i.e., static)') - -parser.add_option('--logfile_gru', dest='logfile_gru', default='../log/log_gru_viewgenerator.csv') - -parser.add_option('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)') - -parser.add_option('--force', action='store_true', default=False, - help='do not check if this experiment has already been run') - -parser.add_option('--gruMuse', dest='gruMUSE', action='store_true', default=False, - help='Deploy MUSE embedding as embedding layer of the GRU View Generator') - -parser.add_option('--gruWce', dest='gruWCE', action='store_true', default=False, - help='Deploy WCE embedding as embedding layer of the GRU View Generator') - -parser.add_option('--gru-path', dest='gru_path', default=None, - help='Set the path to a pretrained GRU model (aka, -G view generator)') - -parser.add_option('--bert-path', dest='bert_path', default=None, - help='Set the path to a pretrained mBERT model (aka, -B view generator)') diff --git a/src/util/pl_metrics.py b/src/util/pl_metrics.py new file mode 100644 index 0000000..765a6a2 --- /dev/null +++ b/src/util/pl_metrics.py @@ -0,0 +1,141 @@ +import torch +from pytorch_lightning.metrics import Metric + +from src.util.common import is_false, is_true + + +def _update(pred, target, device): + assert pred.shape == target.shape + # preparing preds and targets for count + true_pred = is_true(pred, device) + false_pred = is_false(pred, device) + true_target 
= is_true(target, device) + false_target = is_false(target, device) + + tp = torch.sum(true_pred * true_target, dim=0) + tn = torch.sum(false_pred * false_target, dim=0) + fp = torch.sum(true_pred * false_target, dim=0) + fn = torch.sum(false_pred * target, dim=0) + return tp, tn, fp, fn + + +class CustomF1(Metric): + def __init__(self, num_classes, device, average='micro'): + """ + Custom F1 metric. + Scikit learn provides a full set of evaluation metrics, but they treat special cases differently. + I.e., when the number of true positives, false positives, and false negatives amount to 0, all + affected metrics (precision, recall, and thus f1) output 0 in Scikit learn. + We adhere to the common practice of outputting 1 in this case since the classifier has correctly + classified all examples as negatives. + :param num_classes: + :param device: + :param average: + """ + super().__init__() + self.num_classes = num_classes + self.average = average + self.device = 'cuda' if device else 'cpu' + self.add_state('true_positive', default=torch.zeros(self.num_classes)) + self.add_state('true_negative', default=torch.zeros(self.num_classes)) + self.add_state('false_positive', default=torch.zeros(self.num_classes)) + self.add_state('false_negative', default=torch.zeros(self.num_classes)) + + def update(self, preds, target): + true_positive, true_negative, false_positive, false_negative = _update(preds, target, self.device) + + self.true_positive += true_positive + self.true_negative += true_negative + self.false_positive += false_positive + self.false_negative += false_negative + + def compute(self): + if self.average == 'micro': + num = 2.0 * self.true_positive.sum() + den = 2.0 * self.true_positive.sum() + self.false_positive.sum() + self.false_negative.sum() + if den > 0: + return (num / den).to(self.device) + return torch.FloatTensor([1.]).to(self.device) + if self.average == 'macro': + class_specific = [] + for i in range(self.num_classes): + class_tp = self.true_positive[i] + class_tn = self.true_negative[i] + class_fp = self.false_positive[i] + class_fn = self.false_negative[i] + num = 2.0 * class_tp + den = 2.0 * class_tp + class_fp + class_fn + if den > 0: + class_specific.append(num / den) + else: + class_specific.append(1.) + average = torch.sum(torch.Tensor(class_specific))/self.num_classes + return average.to(self.device) + + +class CustomK(Metric): + def __init__(self, num_classes, device, average='micro'): + """ + K metric. https://dl.acm.org/doi/10.1145/2808194.2809449 + :param num_classes: + :param device: + :param average: + """ + super().__init__() + self.num_classes = num_classes + self.average = average + self.device = 'cuda' if device else 'cpu' + self.add_state('true_positive', default=torch.zeros(self.num_classes)) + self.add_state('true_negative', default=torch.zeros(self.num_classes)) + self.add_state('false_positive', default=torch.zeros(self.num_classes)) + self.add_state('false_negative', default=torch.zeros(self.num_classes)) + + def update(self, preds, target): + true_positive, true_negative, false_positive, false_negative = _update(preds, target, self.device) + + self.true_positive += true_positive + self.true_negative += true_negative + self.false_positive += false_positive + self.false_negative += false_negative + + def compute(self): + if self.average == 'micro': + specificity, recall = 0., 0. 
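# K = specificity + recall - 1, i.e. tn/(tn+fp) + tp/(tp+fn) - 1, ranging in [-1, 1]
# (same convention as the K() function in src/util/metrics.py above). When the
# accumulated counts contain no positives (tp+fn == 0) the score degenerates to
# 2*specificity - 1; with no negatives (tn+fp == 0) it degenerates to 2*recall - 1,
# as handled by the branches below.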
+ absolute_negatives = self.true_negative.sum() + self.false_positive.sum() + if absolute_negatives != 0: + specificity = self.true_negative.sum()/absolute_negatives + absolute_positives = self.true_positive.sum() + self.false_negative.sum() + if absolute_positives != 0: + recall = self.true_positive.sum()/absolute_positives + + if absolute_positives == 0: + return 2. * specificity - 1 + elif absolute_negatives == 0: + return 2. * recall - 1 + else: + return specificity + recall - 1 + + if self.average == 'macro': + class_specific = [] + for i in range(self.num_classes): + class_tp = self.true_positive[i] + class_tn = self.true_negative[i] + class_fp = self.false_positive[i] + class_fn = self.false_negative[i] + + specificity, recall = 0., 0. + absolute_negatives = class_tn + class_fp + if absolute_negatives != 0: + specificity = class_tn / absolute_negatives + absolute_positives = class_tp + class_fn + if absolute_positives != 0: + recall = class_tp / absolute_positives + + if absolute_positives == 0: + class_specific.append(2. * specificity - 1) + elif absolute_negatives == 0: + class_specific.append(2. * recall - 1) + else: + class_specific.append(specificity + recall - 1) + average = torch.sum(torch.Tensor(class_specific)) / self.num_classes + return average.to(self.device) diff --git a/src/util/results.py b/src/util/results_csv.py similarity index 68% rename from src/util/results.py rename to src/util/results_csv.py index ec66fc1..be0ff84 100644 --- a/src/util/results.py +++ b/src/util/results_csv.py @@ -1,21 +1,21 @@ import os -import pandas as pd -import numpy as np -class PolylingualClassificationResults: +import numpy as np +import pandas as pd + + +class CSVlog: def __init__(self, file, autoflush=True, verbose=False): self.file = file self.columns = ['method', - 'learner', - 'optimp', + 'setting', + 'optimc', 'sif', 'zscore', 'l2', - 'wescaler', - 'pca', - 'id', 'dataset', - 'time', + 'time_tr', + 'time_te', 'lang', 'macrof1', 'microf1', @@ -36,8 +36,11 @@ class PolylingualClassificationResults: def already_calculated(self, id): return (self.df['id'] == id).any() - def add_row(self, method, learner, optimp, sif, zscore, l2, wescaler, pca, id, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''): - s = pd.Series([method, learner, optimp,sif, zscore, l2, wescaler, pca, id, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns) + def add_row(self, method, setting, optimc, sif, zscore, l2, dataset, time_tr, time_te, lang, + macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''): + s = pd.Series([method, setting, optimc, sif, zscore, l2, dataset, time_tr, time_te, lang, + macrof1, microf1, macrok, microk, notes], + index=self.columns) self.df = self.df.append(s, ignore_index=True) if self.autoflush: self.flush() self.tell(s.to_string()) @@ -46,4 +49,5 @@ class PolylingualClassificationResults: self.df.to_csv(self.file, index=False, sep='\t') def tell(self, msg): - if self.verbose: print(msg) + if self.verbose: + print(msg) diff --git a/src/util_transformers/StandardizeTransformer.py b/src/util/standardizer.py similarity index 85% rename from src/util_transformers/StandardizeTransformer.py rename to src/util/standardizer.py index 06e633e..429bccd 100644 --- a/src/util_transformers/StandardizeTransformer.py +++ b/src/util/standardizer.py @@ -1,15 +1,20 @@ import numpy as np -class StandardizeTransformer: +class StandardizeTransformer: def __init__(self, axis=0, range=None): + """ + + :param axis: + :param range: + """ assert range 
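Referring back to the CustomF1 and CustomK Lightning metrics defined in src/util/pl_metrics.py above, a rough usage sketch with toy tensors; it assumes that is_true/is_false in src.util.common turn the inputs into {0, 1} masks, and that predictions and targets come as (batch, num_classes) hard-label tensors.

import torch
from src.util.pl_metrics import CustomF1, CustomK

f1 = CustomF1(num_classes=3, device=False, average='macro')    # device=False -> run on cpu
kappa = CustomK(num_classes=3, device=False, average='micro')

preds  = torch.tensor([[1, 0, 1], [0, 1, 0]])
target = torch.tensor([[1, 0, 0], [0, 1, 0]])

f1.update(preds, target)        # accumulates per-class tp/tn/fp/fn
kappa.update(preds, target)
print(f1.compute(), kappa.compute())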
is None or isinstance(range, slice), 'wrong format for range, should either be None or a slice' self.axis = axis self.yetfit = False self.range = range def fit(self, X): - print('fitting Standardizer...') + print('Applying z-score standardization...') std=np.std(X, axis=self.axis, ddof=1) self.std = np.clip(std, 1e-5, None) self.mean = np.mean(X, axis=self.axis) @@ -28,4 +33,4 @@ class StandardizeTransformer: return (X - self.mean) / self.std def fit_transform(self, X): - return self.fit(X).transform(X) + return self.fit(X).transform(X) \ No newline at end of file diff --git a/src/util/util.py b/src/util/util.py deleted file mode 100644 index 823c82d..0000000 --- a/src/util/util.py +++ /dev/null @@ -1,29 +0,0 @@ -from sklearn.svm import SVC -from tqdm import tqdm -import re -import sys - - -def mask_numbers(data, number_mask='numbermask'): - mask = re.compile(r'\b[0-9][0-9.,-]*\b') - masked = [] - for text in tqdm(data, desc='masking numbers'): - masked.append(mask.sub(number_mask, text)) - return masked - - -def fill_missing_classes(lXtr, lytr): - pass - - -def get_learner(calibrate=False, kernel='linear'): - return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto') - - -def get_params(dense=False): - if not op.optimc: - return None - c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1] - kernel = 'rbf' if dense else 'linear' - return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}] - diff --git a/src/util_transformers/__init__.py b/src/util_transformers/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/util_transformers/clesa.py b/src/util_transformers/clesa.py deleted file mode 100644 index da17393..0000000 --- a/src/util_transformers/clesa.py +++ /dev/null @@ -1,110 +0,0 @@ -import numpy as np -import sklearn -# from sklearn.externals.joblib import Parallel, delayed -from joblib import Parallel, delayed - -class ESA(object): - """ - Implementation of Explicit Sematic Analysis (ESA) in its mono-lingual version, as a transformer - """ - supported_similarity = ['dot', 'cosine'] - - def __init__(self, similarity='dot', centered=False, post=None): - """ - :param similarity: the similarity measure between documents to be used - :param centered: set to True to subtract the expected similarity due to randomness (experimental) - :param post: any valid sklearn normalization method to be applied to the resulting doc embeddings, or None (default) - """ - assert similarity in self.supported_similarity, ("Similarity method %s is not supported" % similarity) - self.similarity = similarity - self.centered = centered - self.post_processing = post - self.W = None - - def fit(self, W): - """ - :param W: doc-by-term already processed matrix of wikipedia documents - :return: self - """ - self.W = W - return self - - def transform(self, X): - """ - :param X: doc-by-term matrix that is to be transformed into the ESA space. 
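Stepping back to the renamed standardizer above (src/util/standardizer.py), a small illustrative sketch on synthetic data; the optional `range` slice presumably restricts standardization to a subset of columns, and the printed values are only approximately 0 and 1.

import numpy as np
from src.util.standardizer import StandardizeTransformer

X = np.random.rand(100, 10)                  # synthetic dense matrix
zscorer = StandardizeTransformer(axis=0)     # column-wise z-scoring
Xz = zscorer.fit_transform(X)
print(Xz.mean(axis=0).round(2), Xz.std(axis=0, ddof=1).round(2))   # ~0s and ~1s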
- :return: the matrix X transformed into the ESA space in numpy format - """ - assert self.W is not None, 'transform method called before fit' - - W = self.W - assert X.shape[1] == W.shape[1], ('the feature spaces for X=%s and W=%s do not agree' % (str(X.shape), str(W.shape))) - - if self.similarity in ['dot', 'cosine']: - if self.similarity == 'cosine': - X = sklearn.preprocessing.normalize(X, norm='l2', axis=1, copy=True) - W = sklearn.preprocessing.normalize(W, norm='l2', axis=1, copy=True) - - esa = (X.dot(W.T)).toarray() - if self.centered: - pX = (X > 0).sum(1) / float(X.shape[1]) - pW = (W > 0).sum(1) / float(W.shape[1]) - pXpW = np.sqrt(pX.dot(pW.transpose())) - esa = esa - pXpW - - if self.post_processing: - esa = sklearn.preprocessing.normalize(esa, norm=self.post_processing, axis=1, copy=True) - - return esa - - def fit_transform(self, W, X, Y=None): - self.fit(W) - return self.transform(X, Y) - - def dimensionality(self): - return self.W.shape[0] - - - -class CLESA(ESA): - """ - Implementation of Cross-Lingual Explicit Sematic Analysis (ESA) as a transformer - """ - - def __init__(self, similarity='dot', centered=False, post=False, n_jobs=-1): - super(CLESA, self).__init__(similarity, centered, post) - self.lESA = None - self.langs = None - self.n_jobs = n_jobs - - def fit(self, lW): - """ - :param lW: a dictionary of {language: doc-by-term wiki matrix} - :return: self - """ - assert len(np.unique([W.shape[0] for W in lW.values()])) == 1, "inconsistent dimensions across languages" - - self.dimensions = list(lW.values())[0].shape[0] - self.langs = list(lW.keys()) - self.lESA = {lang:ESA(self.similarity, self.centered, self.post_processing).fit(lW[lang]) for lang in self.langs} - return self - - def transform(self, lX): - """ - :param lX: dictionary of {language : doc-by-term matrix} that is to be transformed into the CL-ESA space - :return: a dictionary {language : doc-by-dim matrix} containing the matrix-transformed versions - """ - assert self.lESA is not None, 'transform method called before fit' - assert set(lX.keys()).issubset(set(self.langs)), 'languages in lX are not scope' - langs = list(lX.keys()) - trans = Parallel(n_jobs=self.n_jobs)(delayed(self.lESA[lang].transform)(lX[lang]) for lang in langs) - return {lang:trans[i] for i,lang in enumerate(langs)} - - def fit_transform(self, lW, lX): - return self.fit(lW).transform(lX) - - def languages(self): - return list(self.lESA.keys()) - - - - diff --git a/src/util_transformers/dci.py b/src/util_transformers/dci.py deleted file mode 100644 index 6e84ed9..0000000 --- a/src/util_transformers/dci.py +++ /dev/null @@ -1,154 +0,0 @@ -import numpy as np -from sklearn.preprocessing import normalize -from scipy.sparse import csr_matrix, issparse -from scipy.spatial.distance import cosine -import operator -import functools -import math, sys -# from sklearn.externals.joblib import Parallel, delayed -from joblib import Parallel, delayed - - -class DistributionalCorrespondenceIndexing: - - prob_dcf = ['linear', 'pmi'] - vect_dcf = ['cosine'] - valid_dcf = prob_dcf + vect_dcf - valid_post = ['normal', 'l2', None] - - def __init__(self, dcf='cosine', post='normal', n_jobs=-1): - """ - :param dcf: a distributional correspondence function name (e.g., 'cosine') or a callable f(u,v) which measures - the distribucional correspondence between vectors u and v - :param post: post-processing function to apply to document embeddings. 
Default is to standardize it into a - normal distribution; other functions allowed are 'l2' or None - """ - if post not in self.valid_post: - raise ValueError("unknown post processing function; valid ones are [%s]" % ', '.join(self.valid_post)) - - if isinstance(dcf, str): - if dcf not in self.valid_dcf: - raise ValueError("unknown dcf; use any in [%s]" % ', '.join(self.valid_dcf)) - self.dcf = getattr(DistributionalCorrespondenceIndexing, dcf) - elif hasattr(dcf, '__call__'): - self.dcf = dcf - else: - raise ValueError('param dcf should either be a valid dcf name in [%s] or a callable comparing two vectors') - #self.dcf = lambda u,v:dcf(u,v) - self.post = post - self.domains = None - self.dFP = None - self.n_jobs = n_jobs - - def fit(self, dU, dP): - """ - :param dU: a dictionary of {domain:dsm_matrix}, where dsm is a document-by-term matrix representing the - distributional semantic model for a specific domain - :param dP: a dictionary {domain:pivot_matrix} where domain is a string representing each domain, - and pivot_matrix has shape (d,p) with d the dimensionality of the distributional space, and p the - number of pivots - :return: self - """ - self.domains = list(dP.keys()) - assert len(np.unique([P.shape[1] for P in dP.values()]))==1, "inconsistent number of pivots across domains" - assert set(dU.keys())==set(self.domains), "inconsistent domains in dU and dP" - assert not [1 for d in self.domains if dU[d].shape[0]!=dP[d].shape[0]], \ - "inconsistent dimensions between distributional and pivot spaces" - self.dimensions = list(dP.values())[0].shape[1] - # embed the feature space from each domain using the pivots of that domain - #self.dFP = {d:self.dcf_dist(dU[d].transpose(), dP[d].transpose()) for d in self.domains} - transformations = Parallel(n_jobs=self.n_jobs)(delayed(self.dcf_dist)(dU[d].transpose(),dP[d].transpose()) for d in self.domains) - self.dFP = {d: transformations[i] for i, d in enumerate(self.domains)} - - def _dom_transform(self, X, FP): - _X = X.dot(FP) - if self.post == 'l2': - _X = normalize(_X, norm='l2', axis=1) - elif self.post == 'normal': - std = np.clip(np.std(_X, axis=0), 1e-5, None) - _X = (_X - np.mean(_X, axis=0)) / std - return _X - - # dX is a dictionary of {domain:dsm}, where dsm (distributional semantic model) is, e.g., a document-by-term csr_matrix - def transform(self, dX): - assert self.dFP is not None, 'transform method called before fit' - assert set(dX.keys()).issubset(self.domains), 'domains in dX are not scope' - domains = list(dX.keys()) - transformations = Parallel(n_jobs=self.n_jobs)(delayed(self._dom_transform)(dX[d], self.dFP[d]) for d in domains) - return {d: transformations[i] for i, d in enumerate(domains)} - - def fit_transform(self, dU, dP, dX): - return self.fit(dU, dP).transform(dX) - - def _prevalence(self, v): - if issparse(v): - return float(v.nnz) / functools.reduce(operator.mul, v.shape, 1) #this works for arrays of any rank - elif isinstance(v, np.ndarray): - return float(v[v>0].size) / v.size - - def linear(self, u, v, D): - tp, fp, fn, tn = self._get_4cellcounters(u, v, D) - den1=tp+fn - den2=tn+fp - tpr = (tp*1./den1) if den1!=0 else 0. - tnr = (tn*1./den2) if den2!=0 else 0. - return tpr + tnr - 1 - - def pmi(self, u, v, D): - tp, fp, fn, tn = self._get_4cellcounters(u, v, D) - - Pxy = tp * 1. / D - Pxny = fp * 1. / D - Pnxy = fn * 1. 
/ D - Px = Pxy + Pxny - Py = Pxy + Pnxy - - if (Px == 0 or Py == 0 or Pxy == 0): - return 0.0 - - score = math.log2(Pxy / (Px * Py)) - if np.isnan(score) or np.isinf(score): - print('NAN') - sys.exit() - return score - - def cosine(self, u, v): - pu = self._prevalence(u) - pv = self._prevalence(v) - return cosine(u, v) - np.sqrt(pu * pv) - - def _get_4cellcounters(self, u, v, D): - """ - :param u: a set of indexes with a non-zero value - :param v: a set of indexes with a non-zero value - :param D: the number of events (i.e., all posible indexes) - :return: the 4-cell contingency values tp, fp, fn, tn) - """ - common=u.intersection(v) - tp = len(common) - fp = len(u) - len(common) - fn = len(v) - len(common) - tn = D - (tp + fp + fn) - return tp, fp, fn, tn - - def dcf_dist(self, U, V): - nU,D = U.shape - nV = V.shape[0] - if issparse(U): U = U.toarray() - if issparse(V): V = V.toarray() - - dists = np.zeros((nU, nV)) - if self.dcf.__name__ in self.prob_dcf: - def hits_index(v): - return set(np.argwhere(v>0).reshape(-1).tolist()) - Vhits = {i:hits_index(V[i]) for i in range(nV)} - for i in range(nU): - Ui_hits = hits_index(U[i]) - for j in range(nV): - dists[i, j] = self.dcf(self, Ui_hits, Vhits[j], D) - else: - for i in range(nU): - for j in range(nV): - dists[i, j] = self.dcf(self, U[i], V[j]) - return dists - diff --git a/src/util_transformers/riboc.py b/src/util_transformers/riboc.py deleted file mode 100644 index 7dfbc42..0000000 --- a/src/util_transformers/riboc.py +++ /dev/null @@ -1,53 +0,0 @@ -import math -import numpy as np -from scipy.sparse import csr_matrix, issparse - -class RandomIndexingBoC(object): - - def __init__(self, latent_dimensions, non_zeros=2): - self.latent_dimensions = latent_dimensions - self.k = non_zeros - self.ri_dict = None - - def fit_transform(self, X): - return self.fit(X).transform(X) - - def fit(self, X): - nF = X.shape[1] - nL = self.latent_dimensions - format = 'csr' if issparse(X) else 'np' - self.ri_dict = _create_random_index_dictionary(shape=(nF, nL), k=self.k, normalized=True, format=format) - return self - - def transform(self, X): - assert X.shape[1] == self.ri_dict.shape[0], 'feature space is inconsistent with the RI dictionary' - if self.ri_dict is None: - raise ValueError("Error: transform method called before fit.") - P = X.dot(self.ri_dict) - if issparse(P): - P.sort_indices() - return P - - -def _create_random_index_dictionary(shape, k, normalized=False, format='csr', positive=False): - assert format in ['csr', 'np'], 'Format should be in "[csr, np]"' - nF, latent_dimensions = shape - print("Creating the random index dictionary for |V|={} with {} dimensions".format(nF,latent_dimensions)) - val = 1.0 if not normalized else 1.0/math.sqrt(k) - #ri_dict = csr_matrix((nF, latent_dimensions)) if format == 'csr' else np.zeros((nF, latent_dimensions)) - ri_dict = np.zeros((nF, latent_dimensions)) - - #TODO: optimize - for t in range(nF): - dims = np.zeros(k, dtype=np.int32) - dims[0] = t % latent_dimensions #the first dimension is choosen in a round-robin manner (prevents gaps) - dims[1:] = np.random.choice(latent_dimensions, size=k-1, replace=False) - values = (np.random.randint(0,2, size=k)*2.0-1.0) * val if not positive else np.array([+val]*k) - ri_dict[t,dims]=values - print("\rprogress [%.2f%% complete]" % (t * 100.0 / nF), end='') - print('\nDone') - - if format=='csr': - ri_dict = csr_matrix(ri_dict) - return ri_dict - diff --git a/src/view_generators.py b/src/view_generators.py new file mode 100644 index 0000000..af4ee8e --- /dev/null +++ 
b/src/view_generators.py @@ -0,0 +1,388 @@ +""" +This module contains the view generators that take care of computing the view specific document embeddings: + +- VanillaFunGen (-x) cast document representations encoded via TFIDF into posterior probabilities by means of SVM. + +- WordClassGen (-w): generates document representation via Word-Class-Embeddings. + Document embeddings are obtained via weighted sum of document's constituent embeddings. + +- MuseGen (-m): generates document representation via MUSE embeddings. + Document embeddings are obtained via weighted sum of document's constituent embeddings. + +- RecurrentGen (-g): generates document embedding by means of a Gated Recurrent Units. The model can be + initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,). + Output dimension is (n_docs, 512). + +- View generator (-b): generates document embedding via mBERT model. +""" +from abc import ABC, abstractmethod +# from time import time + +from pytorch_lightning import Trainer +from pytorch_lightning.loggers import TensorBoardLogger +from pytorch_lightning.callbacks.early_stopping import EarlyStopping +from pytorch_lightning.callbacks.lr_monitor import LearningRateMonitor + +from src.data.datamodule import RecurrentDataModule, BertDataModule, tokenize +from src.models.learners import * +from src.models.pl_bert import BertModel +from src.models.pl_gru import RecurrentModel +from src.util.common import TfidfVectorizerMultilingual, _normalize, index +from src.util.embeddings_manager import MuseLoader, XdotM, wce_matrix +from src.util.file import create_if_not_exist +# TODO: (1) add model checkpointing and loading from checkpoint + training on validation after convergence is reached + + +class ViewGen(ABC): + """ + Abstract class for ViewGenerators implementations. Every ViewGen should implement these three methods in order to + be seamlessly integrated in the overall architecture. + """ + @abstractmethod + def fit(self, lX, ly): + pass + + @abstractmethod + def transform(self, lX): + pass + + @abstractmethod + def fit_transform(self, lX, ly): + pass + + +class VanillaFunGen(ViewGen): + """ + View Generator (x): original funnelling architecture proposed by Moreo, Esuli and + Sebastiani in DOI: https://doi.org/10.1145/3326065 + """ + def __init__(self, base_learner, first_tier_parameters=None, n_jobs=-1): + """ + Init Posterior Probabilities embedder (i.e., VanillaFunGen) + :param base_learner: naive monolingual learners to be deployed as first-tier learners. Should be able to + return posterior probabilities. + :param base_learner: + :param n_jobs: integer, number of concurrent workers + """ + super().__init__() + self.learners = base_learner + self.first_tier_parameters = first_tier_parameters + self.n_jobs = n_jobs + self.doc_projector = NaivePolylingualClassifier(base_learner=self.learners, + parameters=self.first_tier_parameters, n_jobs=self.n_jobs) + self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) + + def fit(self, lX, lY): + print('# Fitting VanillaFunGen (X)...') + lX = self.vectorizer.fit_transform(lX) + self.doc_projector.fit(lX, lY) + return self + + def transform(self, lX): + """ + (1) Vectorize documents; (2) Project them according to the learners SVMs, finally (3) Apply L2 normalization + to the projection and returns it. + :param lX: dict {lang: indexed documents} + :return: document projection to the common latent space. 
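A hedged usage sketch for VanillaFunGen, continuing the transform() description above: lX is assumed to be a dict {lang: list of raw documents} and ly a dict {lang: binary label matrix}, loaded elsewhere (e.g. from a pickled MultilingualDataset); the base learner only needs to expose predict_proba once fitted, which SVC does when probability=True.

from sklearn.svm import SVC
from src.view_generators import VanillaFunGen

posteriors = VanillaFunGen(base_learner=SVC(kernel='linear', probability=True),
                           n_jobs=-1)
lZ = posteriors.fit_transform(lX, ly)   # {lang: (n_docs, n_classes) L2-normalized posterior matrix}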
+ """ + lX = self.vectorizer.transform(lX) + lZ = self.doc_projector.predict_proba(lX) + lZ = _normalize(lZ, l2=True) + return lZ + + def fit_transform(self, lX, ly): + return self.fit(lX, ly).transform(lX) + + +class MuseGen(ViewGen): + """ + View Generator (m): generates document representation via MUSE embeddings (Fasttext multilingual word + embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings. + """ + def __init__(self, muse_dir='../embeddings', n_jobs=-1): + """ + Init the MuseGen. + :param muse_dir: string, path to folder containing muse embeddings + :param n_jobs: int, number of concurrent workers + """ + super().__init__() + self.muse_dir = muse_dir + self.n_jobs = n_jobs + self.langs = None + self.lMuse = None + self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) + + def fit(self, lX, ly): + """ + (1) Vectorize documents; (2) Load muse embeddings for words encountered while vectorizing. + :param lX: dict {lang: indexed documents} + :param ly: dict {lang: target vectors} + :return: self. + """ + print('# Fitting MuseGen (M)...') + self.vectorizer.fit(lX) + self.langs = sorted(lX.keys()) + self.lMuse = MuseLoader(langs=self.langs, cache=self.muse_dir) + lVoc = self.vectorizer.vocabulary() + self.lMuse = self.lMuse.extract(lVoc) # overwriting lMuse with dict {lang : embed_matrix} with only known words + # TODO: featureweight.fit + return self + + def transform(self, lX): + """ + (1) Vectorize documents; (2) computes the weighted sum of MUSE embeddings found at document level, + finally (3) Apply L2 normalization embedding and returns it. + :param lX: dict {lang: indexed documents} + :return: document projection to the common latent space. + """ + lX = self.vectorizer.transform(lX) + XdotMUSE = Parallel(n_jobs=self.n_jobs)( + delayed(XdotM)(lX[lang], self.lMuse[lang], sif=True) for lang in self.langs) + lZ = {lang: XdotMUSE[i] for i, lang in enumerate(self.langs)} + lZ = _normalize(lZ, l2=True) + return lZ + + def fit_transform(self, lX, ly): + return self.fit(lX, ly).transform(lX) + + +class WordClassGen(ViewGen): + """ + View Generator (w): generates document representation via Word-Class-Embeddings. + Document embeddings are obtained via weighted sum of document's constituent embeddings. + """ + def __init__(self, n_jobs=-1): + """ + Init WordClassGen. + :param n_jobs: int, number of concurrent workers + """ + super().__init__() + self.n_jobs = n_jobs + self.langs = None + self.lWce = None + self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) + + def fit(self, lX, ly): + """ + (1) Vectorize documents; (2) Load muse embeddings for words encountered while vectorizing. + :param lX: dict {lang: indexed documents} + :param ly: dict {lang: target vectors} + :return: self. + """ + print('# Fitting WordClassGen (W)...') + lX = self.vectorizer.fit_transform(lX) + self.langs = sorted(lX.keys()) + wce = Parallel(n_jobs=self.n_jobs)( + delayed(wce_matrix)(lX[lang], ly[lang]) for lang in self.langs) + self.lWce = {l: wce[i] for i, l in enumerate(self.langs)} + # TODO: featureweight.fit() + return self + + def transform(self, lX): + """ + (1) Vectorize documents; (2) computes the weighted sum of Word-Class Embeddings found at document level, + finally (3) Apply L2 normalization embedding and returns it. + :param lX: dict {lang: indexed documents} + :return: document projection to the common latent space. 
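In the same spirit, and with the same assumed lX/ly dictionaries, the two embedding-based generators above can be used interchangeably; MUSE vectors are typically 300-dimensional, while the Word-Class projection has one dimension per target class.

from src.view_generators import MuseGen, WordClassGen

muse_gen = MuseGen(muse_dir='embeddings/', n_jobs=-1)   # folder holding the MUSE .vec files
lZ_muse = muse_gen.fit_transform(lX, ly)                # {lang: (n_docs, muse_dim) weighted-sum embeddings}

wce_gen = WordClassGen(n_jobs=-1)
lZ_wce = wce_gen.fit_transform(lX, ly)                  # {lang: (n_docs, n_classes) word-class embeddings}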
+ """ + lX = self.vectorizer.transform(lX) + XdotWce = Parallel(n_jobs=self.n_jobs)( + delayed(XdotM)(lX[lang], self.lWce[lang], sif=True) for lang in self.langs) + lWce = {l: XdotWce[i] for i, l in enumerate(self.langs)} + lWce = _normalize(lWce, l2=True) + return lWce + + def fit_transform(self, lX, ly): + return self.fit(lX, ly).transform(lX) + + +class RecurrentGen(ViewGen): + """ + View Generator (G): generates document embedding by means of a Gated Recurrent Units. The model can be + initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,). + Output dimension is (n_docs, 512). The training will happen end-to-end. At inference time, the model returns + the network internal state at the second feed-forward layer level. Training metrics are logged via TensorBoard. + """ + def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, nepochs=50, + gpus=0, n_jobs=-1, patience=20, stored_path=None): + """ + Init RecurrentGen. + :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents + indexed by language code. + :param pretrained_embeddings: dict {lang: tensor of embeddings}, it contains the pretrained embeddings to use + as embedding layer. + :param wce: Bool, whether to deploy Word-Class Embeddings (as proposed by A. Moreo). If True, supervised + embeddings are concatenated to the deployed supervised embeddings. WCE dimensionality is equal to + the number of target classes. + :param batch_size: int, number of samples in a batch. + :param nepochs: int, number of max epochs to train the model. + :param gpus: int, specifies how many GPUs to use per node. If False computation will take place on cpu. + :param n_jobs: int, number of concurrent workers (i.e., parallelizing data loading). + :param patience: int, number of epochs with no improvements in val-macroF1 before early stopping. + :param stored_path: str, path to a pretrained model. If None the model will be trained from scratch. 
+ """ + super().__init__() + self.multilingualIndex = multilingualIndex + self.langs = multilingualIndex.langs + self.batch_size = batch_size + self.gpus = gpus + self.n_jobs = n_jobs + self.stored_path = stored_path + self.nepochs = nepochs + self.patience = patience + + # EMBEDDINGS to be deployed + self.pretrained = pretrained_embeddings + self.wce = wce + + self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1) + self.multilingualIndex.embedding_matrices(self.pretrained, supervised=self.wce) + self.model = self._init_model() + self.logger = TensorBoardLogger(save_dir='../tb_logs', name='rnn', default_hp_metric=False) + self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00, + patience=self.patience, verbose=False, mode='max') + self.lr_monitor = LearningRateMonitor(logging_interval='epoch') + + def _init_model(self): + if self.stored_path: + lpretrained = self.multilingualIndex.l_embeddings() + return RecurrentModel.load_from_checkpoint(self.stored_path, lPretrained=lpretrained) + else: + lpretrained = self.multilingualIndex.l_embeddings() + langs = self.multilingualIndex.langs + output_size = self.multilingualIndex.get_target_dim() + hidden_size = 512 + lvocab_size = self.multilingualIndex.l_vocabsize() + learnable_length = 0 + return RecurrentModel( + lPretrained=lpretrained, + langs=langs, + output_size=output_size, + hidden_size=hidden_size, + lVocab_size=lvocab_size, + learnable_length=learnable_length, + drop_embedding_range=self.multilingualIndex.sup_range, + drop_embedding_prop=0.5, + gpus=self.gpus + ) + + def fit(self, lX, ly): + """ + Train the Neural Network end-to-end. + lX and ly are not directly used. We rather get them from the multilingual index used in the instantiation + of the Dataset object (RecurrentDataset) in the GfunDataModule class. + :param lX: dict {lang: indexed documents} + :param ly: dict {lang: target vectors} + :return: self. + """ + print('# Fitting RecurrentGen (G)...') + create_if_not_exist(self.logger.save_dir) + recurrentDataModule = RecurrentDataModule(self.multilingualIndex, batchsize=self.batch_size, n_jobs=self.n_jobs) + trainer = Trainer(gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger, max_epochs=self.nepochs, + callbacks=[self.early_stop_callback, self.lr_monitor], checkpoint_callback=False) + + # vanilla_torch_model = torch.load( + # '../_old_checkpoint/gru_viewgen_-rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle') + # self.model.linear0 = vanilla_torch_model.linear0 + # self.model.linear1 = vanilla_torch_model.linear1 + # self.model.linear2 = vanilla_torch_model.linear2 + # self.model.rnn = vanilla_torch_model.rnn + + trainer.fit(self.model, datamodule=recurrentDataModule) + trainer.test(self.model, datamodule=recurrentDataModule) + return self + + def transform(self, lX): + """ + Project documents to the common latent space. Output dimensionality is 512. + :param lX: dict {lang: indexed documents} + :return: documents projected to the common latent space. 
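The stored_path argument, not exercised in the training flow above, allows reusing an already-trained recurrent view generator; a sketch in which multilingualIndex is the MultilingualIndex described in the constructor, lMuse a MuseLoader instance, lX the usual {lang: documents} dict, and the checkpoint path a purely hypothetical example.

rnn_gen = RecurrentGen(multilingualIndex,
                       pretrained_embeddings=lMuse,
                       wce=True,
                       gpus=1,
                       stored_path='checkpoints/rnn_view_generator.ckpt')  # hypothetical path
lZ_rnn = rnn_gen.transform(lX)   # {lang: (n_docs, 512) internal states of the second feed-forward layer}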
+ """ + data = {} + for lang in lX.keys(): + indexed = index(data=lX[lang], + vocab=self.multilingualIndex.l_index[lang].word2index, + known_words=set(self.multilingualIndex.l_index[lang].word2index.keys()), + analyzer=self.multilingualIndex.l_vectorizer.get_analyzer(lang), + unk_index=self.multilingualIndex.l_index[lang].unk_index, + out_of_vocabulary=self.multilingualIndex.l_index[lang].out_of_vocabulary) + data[lang] = indexed + l_pad = self.multilingualIndex.l_pad() + self.model.to('cuda' if self.gpus else 'cpu') + self.model.eval() + l_embeds = self.model.encode(data, l_pad, batch_size=256) + return l_embeds + + def fit_transform(self, lX, ly): + return self.fit(lX, ly).transform(lX) + + +class BertGen(ViewGen): + """ + View Generator (b): generates document embedding via Bert model. The training happens end-to-end. + At inference time, the model returns the network internal state at the last original layer (i.e. 12th). Document + embeddings are the state associated with the "start" token. Training metrics are logged via TensorBoard. + """ + def __init__(self, multilingualIndex, batch_size=128, nepochs=50, gpus=0, n_jobs=-1, patience=5, stored_path=None): + """ + Init Bert model + :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents + indexed by language code. + :param batch_size: int, number of samples per batch. + :param nepochs: int, number of max epochs to train the model. + :param gpus: int, specifies how many GPUs to use per node. If False computation will take place on cpu. + :param patience: int, number of epochs with no improvements in val-macroF1 before early stopping. + :param n_jobs: int, number of concurrent workers. + :param stored_path: str, path to a pretrained model. If None the model will be trained from scratch. + """ + super().__init__() + self.multilingualIndex = multilingualIndex + self.nepochs = nepochs + self.gpus = gpus + self.batch_size = batch_size + self.n_jobs = n_jobs + self.stored_path = stored_path + self.model = self._init_model() + self.patience = patience + self.logger = TensorBoardLogger(save_dir='../tb_logs', name='bert', default_hp_metric=False) + self.early_stop_callback = EarlyStopping(monitor='val-macroF1', min_delta=0.00, + patience=self.patience, verbose=False, mode='max') + + def _init_model(self): + output_size = self.multilingualIndex.get_target_dim() + return BertModel(output_size=output_size, stored_path=self.stored_path, gpus=self.gpus) + + def fit(self, lX, ly): + """ + Train the Neural Network end-to-end. + lX and ly are not directly used. We rather get them from the multilingual index used in the instantiation + of the Dataset object (RecurrentDataset) in the GfunDataModule class. + :param lX: dict {lang: indexed documents} + :param ly: dict {lang: target vectors} + :return: self. + """ + print('# Fitting BertGen (M)...') + create_if_not_exist(self.logger.save_dir) + self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1) + bertDataModule = BertDataModule(self.multilingualIndex, batchsize=self.batch_size, max_len=512) + trainer = Trainer(gradient_clip_val=1e-1, max_epochs=self.nepochs, gpus=self.gpus, + logger=self.logger, callbacks=[self.early_stop_callback], checkpoint_callback=False) + trainer.fit(self.model, datamodule=bertDataModule) + trainer.test(self.model, datamodule=bertDataModule) + return self + + def transform(self, lX): + """ + Project documents to the common latent space. Output dimensionality is 768. 
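Completing the transform() description above with a hedged end-to-end sketch (same assumed multilingualIndex, lX and ly as before); the projection returned per language is the 768-dimensional state of the first token at the last layer.

bert_gen = BertGen(multilingualIndex, batch_size=4, nepochs=10, gpus=1)
bert_gen.fit(lX, ly)               # lX/ly are not read directly here: training data comes from multilingualIndex
lZ_bert = bert_gen.transform(lX)   # {lang: (n_docs, 768) document embeddings}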
+        :param lX: dict {lang: indexed documents}
+        :return: documents projected to the common latent space.
+        """
+        data = tokenize(lX, max_len=512)
+        self.model.to('cuda' if self.gpus else 'cpu')
+        self.model.eval()
+        l_embeds = self.model.encode(data, batch_size=64)
+        return l_embeds
+
+    def fit_transform(self, lX, ly):
+        # fit() is called first, so the data has already been tokenized for transform()
+        return self.fit(lX, ly).transform(lX)
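Since every generator only has to honour the ViewGen contract (fit, transform and fit_transform over {lang: ...} dictionaries), adding a new view is straightforward; a deliberately trivial, hypothetical subclass as illustration.

import numpy as np

from src.view_generators import ViewGen


class DocLengthGen(ViewGen):
    """Toy view generator: represents each document by its character length only."""

    def fit(self, lX, ly):
        return self

    def transform(self, lX):
        # one scalar feature per document, per language
        return {lang: np.array([[float(len(doc))] for doc in docs])
                for lang, docs in lX.items()}

    def fit_transform(self, lX, ly):
        return self.fit(lX, ly).transform(lX)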