From d1fdad5f6e0017db6ca9dddc56b6c17f44d130b2 Mon Sep 17 00:00:00 2001 From: andrea Date: Mon, 27 Jul 2020 11:56:09 +0200 Subject: [PATCH] baseline multilingual Bert --- src/embeddings/embeddings.py | 252 +----- src/embeddings/pretrained.py | 181 +++-- src/embeddings/supervised.py | 27 - src/experiment_scripts/10run_dl_jrc.sh | 11 + src/experiment_scripts/10run_dl_rcv.sh | 11 + src/experiment_scripts/10run_jrc.sh | 12 + .../10run_jrc_combinations.sh | 16 + src/experiment_scripts/10run_rcv.sh | 15 + .../10run_rcv_combinations.sh | 16 + .../run_combinations_jrc.sh | 34 + .../run_combinations_rcv.sh | 31 + src/experiment_scripts/run_dl_jrc.sh | 31 + src/experiment_scripts/run_dl_rcv.sh | 30 + src/experiment_scripts/run_fulljrc_dl.sh | 16 + src/experiment_scripts/run_fullrcv_dl.sh | 20 + src/experiment_scripts/run_traditional_jrc.sh | 45 ++ src/experiment_scripts/run_traditional_rcv.sh | 45 ++ src/experiment_scripts/time_comparison.sh | 6 + src/learning/learners.py | 717 ++++-------------- src/learning/transformers.py | 43 +- src/main_deep.py | 92 --- src/main_deep_learning.py | 53 +- src/main_majorityvoting_cls.py | 65 +- src/main_multimodal_cls.py | 78 +- src/models/lstm_class.py | 4 +- src/new_mbert.py | 355 +++++++++ src/results/results_manager.py | 12 +- src/run_mbert_rcv.sh | 11 + src/util/SIF_embed.py | 2 +- src/util/common.py | 61 +- src/util/early_stop.py | 9 +- src/util/results.py | 23 +- .../StandardizeTransformer.py | 0 .../__init__.py | 0 .../clesa.py | 0 .../dci.py | 0 .../riboc.py | 0 37 files changed, 1212 insertions(+), 1112 deletions(-) create mode 100644 src/experiment_scripts/10run_dl_jrc.sh create mode 100644 src/experiment_scripts/10run_dl_rcv.sh create mode 100644 src/experiment_scripts/10run_jrc.sh create mode 100644 src/experiment_scripts/10run_jrc_combinations.sh create mode 100644 src/experiment_scripts/10run_rcv.sh create mode 100644 src/experiment_scripts/10run_rcv_combinations.sh create mode 100644 src/experiment_scripts/run_combinations_jrc.sh create mode 100644 src/experiment_scripts/run_combinations_rcv.sh create mode 100644 src/experiment_scripts/run_dl_jrc.sh create mode 100644 src/experiment_scripts/run_dl_rcv.sh create mode 100644 src/experiment_scripts/run_fulljrc_dl.sh create mode 100644 src/experiment_scripts/run_fullrcv_dl.sh create mode 100644 src/experiment_scripts/run_traditional_jrc.sh create mode 100644 src/experiment_scripts/run_traditional_rcv.sh create mode 100644 src/experiment_scripts/time_comparison.sh delete mode 100644 src/main_deep.py create mode 100644 src/new_mbert.py create mode 100644 src/run_mbert_rcv.sh rename src/{transformers => util_transformers}/StandardizeTransformer.py (100%) rename src/{transformers => util_transformers}/__init__.py (100%) rename src/{transformers => util_transformers}/clesa.py (100%) rename src/{transformers => util_transformers}/dci.py (100%) rename src/{transformers => util_transformers}/riboc.py (100%) diff --git a/src/embeddings/embeddings.py b/src/embeddings/embeddings.py index e4fdbb3..59a87a1 100644 --- a/src/embeddings/embeddings.py +++ b/src/embeddings/embeddings.py @@ -1,10 +1,7 @@ import os -import pickle from torchtext.vocab import Vectors import torch from abc import ABC, abstractmethod -from embeddings.supervised import get_supervised_embeddings -from util.decompositions import * from util.SIF_embed import * @@ -35,122 +32,10 @@ class PretrainedEmbeddings(ABC): return source_idx, target_idx -class WordEmbeddings: - - def __init__(self, lang, we, worddim): - self.lang = lang - self.we = we - 
self.worddim = worddim - self.dimword = {v:k for k,v in self.worddim.items()} - - @classmethod - def load(cls, basedir, lang, word_preprocessor=None, dopickle=True): - filename = 'wiki.multi.{}.vec'.format(lang) - we_path = os.path.join(basedir, filename) - - if dopickle and os.path.exists(we_path + '.pkl'): - print('loading pkl in {}'.format(we_path + '.pkl')) - (worddim, we) = pickle.load(open(we_path + '.pkl', 'rb')) - else: - word_registry = set() - lines = open(we_path).readlines() - nwords, dims = [int(x) for x in lines[0].split()] - print('reading we of {} dimensions'.format(dims)) - we = np.zeros((nwords, dims), dtype=float) - worddim = {} - index = 0 - for i, line in enumerate(lines[1:]): - if (i + 1) % 100 == 0: - print('\r{}/{}'.format(i + 1, len(lines)), end='') - word, *vals = line.split() - wordp = word_preprocessor(word) if word_preprocessor is not None else word - if wordp: - wordp = wordp[0] - if wordp in word_registry: - print('warning: word <{}> generates a duplicate <{}> after preprocessing'.format(word,wordp)) - elif len(vals) == dims: - worddim[wordp] = index - we[index, :] = np.array(vals).astype(float) - index += 1 - # else: - # print('warning: word <{}> generates an empty string after preprocessing'.format(word)) - we = we[:index] - print('load {} words'.format(index)) - if dopickle: - print('saving...') - pickle.dump((worddim, we), open(we_path + '.pkl', 'wb'), pickle.HIGHEST_PROTOCOL) - - return WordEmbeddings(lang, we, worddim) - - def vocabulary(self): - return set(self.worddim.keys()) - - def __getitem__(self, key): - return self.we[self.worddim[key]] - - def dim(self): - return self.we.shape[1] - - def __contains__(self, key): - return key in self.worddim - - def most_similar(self, word_vect, k): - if word_vect.ndim == 1: - word_vect = word_vect.reshape(1,-1) - assert word_vect.shape[1] == self.dim(), 'inconsistent dimensions' - - sim = np.dot(word_vect,self.we.T) - order = np.argsort(-1*sim, axis=1)[:,:k] - - similar_words = [[self.dimword[order[vi,ki]] for ki in range(k)] for vi in range(word_vect.shape[0])] - sim_scores = sim[:,order] - return similar_words, sim_scores - - def get_vectors(self, wordlist): - indexes = np.array([self.worddim[w] for w in wordlist]) - return self.we[indexes] - - def restrict(self, vocabulary): - # vocabulary is a set of terms to be kept - active_vocabulary = sorted([w for w in vocabulary if w in self.worddim]) - lost = len(vocabulary)-len(active_vocabulary) - if lost > 0: # some terms are missing, so it will be replaced by UNK - print('warning: missing {} terms for lang {}'.format(lost, self.lang)) - self.we = self.get_vectors(active_vocabulary) - assert self.we.shape[0] == len(active_vocabulary) - self.dimword={i:w for i,w in enumerate(active_vocabulary)} - self.worddim={w:i for i,w in enumerate(active_vocabulary)} - return self - - @classmethod - def load_poly(cls, basedir, langs, lang_vocabularies=None, word_preprocessor=None): - if lang_vocabularies is None: - return cls.merge([cls.load(basedir,lang, word_preprocessor) for lang in langs]) - else: - # assert all([l in lang_vocabularies for l in langs]), 'missing vocabulary for some languages' - return cls.merge([cls.load(basedir, lang, word_preprocessor).restrict(lang_vocabularies[lang]) for lang in langs]) - - @classmethod - def merge(cls, we_list): - assert all([isinstance(we, WordEmbeddings) for we in we_list]), \ - 'instances of {} expected'.format(WordEmbeddings.__name__) - - polywe = [] - worddim = {} - offset = 0 - for we in we_list: - polywe.append(we.we) - 
worddim.update({'{}::{}'.format(we.lang, w):d+offset for w,d in we.worddim.items()}) - offset = len(worddim) - polywe = np.vstack(polywe) - - return WordEmbeddings(lang='poly', we=polywe, worddim=worddim) - - class FastTextWikiNews(Vectors): url_base = 'Cant auto-download MUSE embeddings' - path = '/storage/andrea/FUNNELING/embeddings/wiki.multi.{}.vec' + path = '../embeddings/wiki.multi.{}.vec' _name = '/wiki.multi.{}.vec' def __init__(self, cache, language="en", **kwargs): @@ -159,42 +44,13 @@ class FastTextWikiNews(Vectors): super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs) - -class EmbeddingsAligned(Vectors): - - def __init__(self, type, path, lang, voc): - # todo - rewrite as relative path - self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec' - self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT' - self.path = path + self.name.format(lang) - assert os.path.exists(path), f'pre-trained vectors not found in {path}' - super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path) - self.vectors = self.extract(voc) - - def vocabulary(self): - return set(self.stoi.keys()) - - def extract(self, words): - source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi) - extraction = torch.zeros((len(words), self.dim)) - extraction[source_idx] = self.vectors[target_idx] - return extraction - - def reduce(self, dim): - pca = PCA(n_components=dim) - self.vectors = pca.fit_transform(self.vectors) - return - - class FastTextMUSE(PretrainedEmbeddings): - def __init__(self, path, lang, limit=None): super().__init__() print(f'Loading fastText pretrained vectors for language {lang} from {path}') assert os.path.exists(path), print(f'pre-trained vectors not found in {path}') self.embed = FastTextWikiNews(path, lang, max_vectors=limit) - def vocabulary(self): return set(self.embed.stoi.keys()) @@ -204,114 +60,8 @@ class FastTextMUSE(PretrainedEmbeddings): def extract(self, words): source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi) extraction = torch.zeros((len(words), self.dim())) - # extraction = torch.empty(len(words), self.dim()).normal_(0, 1) extraction[source_idx] = self.embed.vectors[target_idx] return extraction -class StorageEmbeddings: - def __init__(self, path): - self.path = path - self.lang_U = dict() - self.lang_S = dict() - def _add_embeddings_unsupervised(self, type, docs, vocs, max_label_space=300): - for lang in docs.keys(): - print(f'# [unsupervised-matrix {type}] for {lang}') - voc = np.asarray(list(zip(*sorted(vocs[lang].items(), key=lambda x: x[1])))[0]) - self.lang_U[lang] = EmbeddingsAligned(type, self.path, lang, voc).vectors - print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n') - nC = self.lang_U[lang].shape[1] - if max_label_space == 0: - print(f'Computing optimal number of PCA components along matrices U') - optimal_n = get_optimal_dim(self.lang_U, 'U') - self.lang_U = run_pca(optimal_n, self.lang_U) - elif max_label_space < nC: - print(f'Applying PCA to unsupervised matrix U') - self.lang_U = run_pca(max_label_space, self.lang_U) - - return - - def _add_embeddings_supervised(self, docs, labels, reduction, max_label_space, voc): - only_well_represented_C = False # TODO testing - if only_well_represented_C: - labels = labels.copy() - min_prevalence = 0 - print(f'# REDUCING LABELS TO min_prevalence = {min_prevalence} in order to compute WCE Matrix ...') - langs 
= list(docs.keys()) - well_repr_cats = np.logical_and.reduce([labels[lang].sum(axis=0)>min_prevalence for lang in langs]) - for lang in langs: - labels[lang] = labels[lang][:, well_repr_cats] - print(f'Target number reduced to: {labels[lang].shape[1]}\n') - - for lang in docs.keys(): # compute supervised matrices S - then apply PCA - print(f'# [supervised-matrix] for {lang}') - self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], - reduction, max_label_space, voc[lang], lang) - nC = self.lang_S[lang].shape[1] - print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n') - - if max_label_space == 0: # looking for best n_components analyzing explained_variance_ratio - print(f'Computing optimal number of PCA components along matrices S') - optimal_n = get_optimal_dim(self.lang_S, 'S') - print(f'Applying PCA(n_components={optimal_n})') - self.lang_S = run_pca(optimal_n, self.lang_S) - elif max_label_space == -1: # applying pca to the verticals stacked matrix of WCE embeddings - print(f'Computing PCA on vertical stacked WCE embeddings') - languages = self.lang_S.keys() - _temp_stack = np.vstack([self.lang_S[lang] for lang in languages]) # stacking WCE vertically - stacked_pca = PCA(n_components=_temp_stack.shape[1]) - stacked_pca.fit(_temp_stack) - best_n = None - _r = stacked_pca.explained_variance_ratio_ - _r = np.cumsum(_r) - plt.plot(_r, label='Stacked Supervised') - for i in range(len(_r) - 1, 1, -1): - delta = _r[i] - _r[i - 1] - if delta > 0: - best_n = i - break - plt.show() - stacked_pca = PCA(n_components=best_n) - stacked_pca.fit(_temp_stack) - print(f'Applying PCA(n_components={i}') - for lang in languages: - self.lang_S[lang] = stacked_pca.transform(self.lang_S[lang]) - elif max_label_space <= nC: # less or equal in order to reduce it to the same initial dimension - print(f'Computing PCA on Supervised Matrix PCA(n_components:{max_label_space})') - self.lang_S = run_pca(max_label_space, self.lang_S) - - return - - def SIF_embeddings(self): - print('todo') # TODO - - def _concatenate_embeddings(self, docs): - _r = dict() - for lang in self.lang_U.keys(): - _r[lang] = np.hstack((docs[lang].dot(self.lang_U[lang]), docs[lang].dot(self.lang_S[lang]))) - return _r - - def fit(self, config, docs, vocs, labels): - if config['unsupervised']: - self._add_embeddings_unsupervised(config['we_type'], docs, vocs, config['dim_reduction_unsupervised']) - if config['supervised']: - self._add_embeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs) - return self - - def predict(self, config, docs): - if config['supervised'] and config['unsupervised']: - return self._concatenate_embeddings(docs) - # todo testing applying pca to hstack muse + wce - # _reduced = self._concatenate_embeddings(docs) - # return run_pca(300, _reduced) - elif config['supervised']: - _r = dict() - for lang in docs.keys(): - _r[lang] = docs[lang].dot(self.lang_S[lang]) - else: - _r = dict() - for lang in docs.keys(): - _r[lang] = docs[lang].dot(self.lang_U[lang]) - - return _r diff --git a/src/embeddings/pretrained.py b/src/embeddings/pretrained.py index def5be0..026823e 100644 --- a/src/embeddings/pretrained.py +++ b/src/embeddings/pretrained.py @@ -1,103 +1,102 @@ from abc import ABC, abstractmethod import torch, torchtext -import gensim -import os +# import gensim +# import os import numpy as np -class KeyedVectors: - - def __init__(self, word2index, weights): - assert len(word2index)==weights.shape[0], 'wrong number of dimensions' - index2word = {i:w for w,i in 
word2index.items()} - assert len([i for i in range(len(index2word)) if i not in index2word])==0, 'gaps in indexing not allowed' - self.word2index = word2index - self.index2word = index2word - self.weights = weights - - def extract(self, words): - dim = self.weights.shape[1] - v_size = len(words) - - source_idx, target_idx = [], [] - for i,word in enumerate(words): - if word not in self.word2index: continue - j = self.word2index[word] - source_idx.append(i) - target_idx.append(j) - - extraction = np.zeros((v_size, dim)) - extraction[np.asarray(source_idx)] = self.weights[np.asarray(target_idx)] - - return extraction +# class KeyedVectors: +# +# def __init__(self, word2index, weights): +# assert len(word2index)==weights.shape[0], 'wrong number of dimensions' +# index2word = {i:w for w,i in word2index.items()} +# assert len([i for i in range(len(index2word)) if i not in index2word])==0, 'gaps in indexing not allowed' +# self.word2index = word2index +# self.index2word = index2word +# self.weights = weights +# +# def extract(self, words): +# dim = self.weights.shape[1] +# v_size = len(words) +# +# source_idx, target_idx = [], [] +# for i,word in enumerate(words): +# if word not in self.word2index: continue +# j = self.word2index[word] +# source_idx.append(i) +# target_idx.append(j) +# +# extraction = np.zeros((v_size, dim)) +# extraction[np.asarray(source_idx)] = self.weights[np.asarray(target_idx)] +# +# return extraction - -class PretrainedEmbeddings(ABC): - - def __init__(self): - super().__init__() - - @abstractmethod - def vocabulary(self): pass - - @abstractmethod - def dim(self): pass - - @classmethod - def reindex(cls, words, word2index): - source_idx, target_idx = [], [] - for i, word in enumerate(words): - if word not in word2index: continue - j = word2index[word] - source_idx.append(i) - target_idx.append(j) - source_idx = np.asarray(source_idx) - target_idx = np.asarray(target_idx) - return source_idx, target_idx +# class PretrainedEmbeddings(ABC): +# +# def __init__(self): +# super().__init__() +# +# @abstractmethod +# def vocabulary(self): pass +# +# @abstractmethod +# def dim(self): pass +# +# @classmethod +# def reindex(cls, words, word2index): +# source_idx, target_idx = [], [] +# for i, word in enumerate(words): +# if word not in word2index: continue +# j = word2index[word] +# source_idx.append(i) +# target_idx.append(j) +# source_idx = np.asarray(source_idx) +# target_idx = np.asarray(target_idx) +# return source_idx, target_idx -class GloVe(PretrainedEmbeddings): - - def __init__(self, setname='840B', path='./vectors_cache', max_vectors=None): - super().__init__() - print(f'Loading GloVe pretrained vectors from torchtext') - self.embed = torchtext.vocab.GloVe(setname, cache=path, max_vectors=max_vectors) - print('Done') - - def vocabulary(self): - return set(self.embed.stoi.keys()) - - def dim(self): - return self.embed.dim - - def extract(self, words): - source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi) - extraction = torch.zeros((len(words), self.dim())) - extraction[source_idx] = self.embed.vectors[target_idx] - return extraction +# class GloVe(PretrainedEmbeddings): +# +# def __init__(self, setname='840B', path='./vectors_cache', max_vectors=None): +# super().__init__() +# print(f'Loading GloVe pretrained vectors from torchtext') +# self.embed = torchtext.vocab.GloVe(setname, cache=path, max_vectors=max_vectors) +# print('Done') +# +# def vocabulary(self): +# return set(self.embed.stoi.keys()) +# +# def dim(self): +# return self.embed.dim +# 
+# def extract(self, words): +# source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi) +# extraction = torch.zeros((len(words), self.dim())) +# extraction[source_idx] = self.embed.vectors[target_idx] +# return extraction -class Word2Vec(PretrainedEmbeddings): - - def __init__(self, path, limit=None): - super().__init__() - print(f'Loading word2vec pretrained vectors from {path}') - assert os.path.exists(path), print(f'pre-trained keyed vectors not found in {path}') - self.embed = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True, limit=limit) - self.word2index={w:i for i,w in enumerate(self.embed.index2word)} - print('Done') - - def vocabulary(self): - return set(self.word2index.keys()) - - def dim(self): - return self.embed.vector_size - - def extract(self, words): - source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.word2index) - extraction = np.zeros((len(words), self.dim())) - extraction[source_idx] = self.embed.vectors[target_idx] - extraction = torch.from_numpy(extraction).float() - return extraction +# class Word2Vec(PretrainedEmbeddings): +# +# def __init__(self, path, limit=None): +# super().__init__() +# print(f'Loading word2vec pretrained vectors from {path}') +# assert os.path.exists(path), print(f'pre-trained keyed vectors not found in {path}') +# self.embed = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True, limit=limit) +# self.word2index={w:i for i,w in enumerate(self.embed.index2word)} +# print('Done') +# +# def vocabulary(self): +# return set(self.word2index.keys()) +# +# def dim(self): +# return self.embed.vector_size +# +# def extract(self, words): +# source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.word2index) +# extraction = np.zeros((len(words), self.dim())) +# extraction[source_idx] = self.embed.vectors[target_idx] +# extraction = torch.from_numpy(extraction).float() +# return extraction diff --git a/src/embeddings/supervised.py b/src/embeddings/supervised.py index b1faa2d..f84793e 100755 --- a/src/embeddings/supervised.py +++ b/src/embeddings/supervised.py @@ -1,7 +1,5 @@ from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square import numpy as np -# from sklearn.decomposition import PCA -# from sklearn.manifold import TSNE def zscores(x, axis=0): #scipy.stats.zscores does not avoid division by 0, which can indeed occur @@ -69,31 +67,6 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, la return F - # if nC >= max_label_space: - # if reduction == 'PCA': - # if max_label_space == 0: - # pca = PCA(n_components=Y.shape[1]) - # pca = pca.fit(F) - # return pca.explained_variance_ratio_ - # - # print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' - # f'Applying PCA(n_components={max_label_space})') - # pca = PCA(n_components=max_label_space) - # F = pca.fit_transform(F) - # elif reduction == 'TSNE': - # print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' - # f'Applying t-SNE(n_components={max_label_space})') - # tsne = TSNE(n_components=max_label_space) - # F = tsne.fit_transform(F) - # elif reduction == 'tSVD': - # print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. 
' - # f'Applying truncatedSVD(n_components={max_label_space})') - # tSVD = TruncatedSVD(n_components=max_label_space) - # F = tSVD.fit_transform(F) - # - # return F - - diff --git a/src/experiment_scripts/10run_dl_jrc.sh b/src/experiment_scripts/10run_dl_jrc.sh new file mode 100644 index 0000000..ce04aa8 --- /dev/null +++ b/src/experiment_scripts/10run_dl_jrc.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run +logfile=../log/log10run_dl_jrc.csv + +runs='0 1 2 3 4 5 6 7 8 9' +for run in $runs +do + dataset=$dataset_path$run.pickle + python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20 +done \ No newline at end of file diff --git a/src/experiment_scripts/10run_dl_rcv.sh b/src/experiment_scripts/10run_dl_rcv.sh new file mode 100644 index 0000000..51ca64b --- /dev/null +++ b/src/experiment_scripts/10run_dl_rcv.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run +logfile=../log/log10run_dl_rcv.csv + +runs='0 1 2 3 4 5 6 7 8 9' +for run in $runs +do + dataset=$dataset_path$run.pickle + python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20 +done diff --git a/src/experiment_scripts/10run_jrc.sh b/src/experiment_scripts/10run_jrc.sh new file mode 100644 index 0000000..37e3333 --- /dev/null +++ b/src/experiment_scripts/10run_jrc.sh @@ -0,0 +1,12 @@ +dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle +logfile=./results/10run_jrc_final_results.csv + +runs='0 1 2 3 4 5 6 7 8 9' +for run in $runs +do + dataset=$dataset_path$run.pickle + python main_multimodal_cls.py $dataset -o $logfile -P -z -c --l2 + python main_multimodal_cls.py $dataset -o $logfile -S -z -c --l2 + python main_multimodal_cls.py $dataset -o $logfile -U -z -c --l2 + +done diff --git a/src/experiment_scripts/10run_jrc_combinations.sh b/src/experiment_scripts/10run_jrc_combinations.sh new file mode 100644 index 0000000..904e7e9 --- /dev/null +++ b/src/experiment_scripts/10run_jrc_combinations.sh @@ -0,0 +1,16 @@ +dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run +logfile=./results/funnelling_10run_jrc_CIKM.csv + +runs='6 7 8 9' #0 1 2 3 4 5 +for run in $runs +do + dataset=$dataset_path$run.pickle + #python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 Pr(views) concatenated (done up to run5) + python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 views concatenated + #python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -a -z --l2 --allprob + #python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -a -z --l2 --allprob + #python main_multimodal_cls.py $dataset -o $logfile -P -S -c -r -z --l2 --allprob + #python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -z --l2 --allprob + #python main_multimodal_cls.py $dataset -o $logfile -c -P -U -r -z --l2 + #python main_multimodal_cls.py $dataset -o $logfile -c -P -U -S -r -z --l2 +done \ No newline at end of file diff --git a/src/experiment_scripts/10run_rcv.sh b/src/experiment_scripts/10run_rcv.sh new file mode 100644 index 0000000..9d49f94 --- /dev/null +++ b/src/experiment_scripts/10run_rcv.sh @@ 
-0,0 +1,15 @@ +dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run +logfile=./results/10run_rcv_final_results.csv + +runs='0 1 2 3 4 5 6 7 8 9' + +for run in $runs +do + dataset=$dataset_path$run.pickle + python main_multimodal_cls.py $dataset -o $logfile -P -z -c --l2 + python main_multimodal_cls.py $dataset -o $logfile -S -z -c --l2 + python main_multimodal_cls.py $dataset -o $logfile -U -z -c --l2 + +done + + diff --git a/src/experiment_scripts/10run_rcv_combinations.sh b/src/experiment_scripts/10run_rcv_combinations.sh new file mode 100644 index 0000000..e993327 --- /dev/null +++ b/src/experiment_scripts/10run_rcv_combinations.sh @@ -0,0 +1,16 @@ +dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run +logfile=./results/funnelling_10run_rcv_CIKM_allprob_concatenated.csv + +runs='0 1 2 3 4 5 6 7 8 9' +for run in $runs +do + dataset=$dataset_path$run.pickle + #python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 Pr(views) concatenated + python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 views concatenated + #python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -a -z --l2 --allprob + #python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -a -z --l2 --allprob + #python main_multimodal_cls.py $dataset -o $logfile -P -S -c -r -z --l2 --allprob + #python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -z --l2 --allprob + #python main_multimodal_cls.py $dataset -o $logfile -c -P -U -r -z --l2 + #python main_multimodal_cls.py $dataset -o $logfile -c -P -U -S -r -z --l2 +done \ No newline at end of file diff --git a/src/experiment_scripts/run_combinations_jrc.sh b/src/experiment_scripts/run_combinations_jrc.sh new file mode 100644 index 0000000..fa7f0d1 --- /dev/null +++ b/src/experiment_scripts/run_combinations_jrc.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle +logfile=./results/final_combinations_jrc.csv +#A.2: ensembling feature sets (combinations of posteriors, wce, muse): +# - exploring different ways of putting different feature sets together: concatenation, FeatureSetToPosteriors, averaging, voting, etc... +# (no one seems to improve over standard funnelling [the improved version after A.1] with posteriors probabilities...) 
+ +# aggregation=concatenation +#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2 +#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2 +#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2 +#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2 +# + +##FeatureSetToPosteriors (aggregation mean) +python main_multimodal_cls.py $dataset -o $logfile -P -U -r -a -z --l2 --allprob +python main_multimodal_cls.py $dataset -o $logfile -P -S -r -a -z --l2 --allprob +python main_multimodal_cls.py $dataset -o $logfile -U -S -r -a -z --l2 --allprob +python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -a -z --l2 --allprob + +##FeatureSetToPosteriors +#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2 --allprob +#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2 --allprob +#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2 --allprob +#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2 --allprob + +#MajorityVoting +#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -r +#python main_majorityvoting_cls.py $dataset -o $logfile -P -S -r +#python main_majorityvoting_cls.py $dataset -o $logfile -U -S -r +#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -S -r + + diff --git a/src/experiment_scripts/run_combinations_rcv.sh b/src/experiment_scripts/run_combinations_rcv.sh new file mode 100644 index 0000000..1d48f9c --- /dev/null +++ b/src/experiment_scripts/run_combinations_rcv.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle +logfile=./results/final_combinations_rcv.csv +#A.2: ensembling feature sets (combinations of posteriors, wce, muse): +# - exploring different ways of putting different feature sets together: concatenation, FeatureSetToPosteriors, averaging, voting, etc... +# (no one seems to improve over standard funnelling [the improved version after A.1] with posteriors probabilities...) 
+ +# aggregation=concatenation +#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2 +#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2 +#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2 +#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2 +# +##FeatureSetToPosteriors (aggregation mean) +python main_multimodal_cls.py $dataset -o $logfile -P -U -r -a -z --l2 --allprob +python main_multimodal_cls.py $dataset -o $logfile -P -S -r -a -z --l2 --allprob +python main_multimodal_cls.py $dataset -o $logfile -U -S -r -a -z --l2 --allprob +python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -a -z --l2 --allprob + +##FeatureSetToPosteriors +#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2 --allprob +#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2 --allprob +#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2 --allprob +#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2 --allprob + +#MajorityVoting +#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -r +#python main_majorityvoting_cls.py $dataset -o $logfile -P -S -r +#python main_majorityvoting_cls.py $dataset -o $logfile -U -S -r +#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -S -r \ No newline at end of file diff --git a/src/experiment_scripts/run_dl_jrc.sh b/src/experiment_scripts/run_dl_jrc.sh new file mode 100644 index 0000000..1d28e83 --- /dev/null +++ b/src/experiment_scripts/run_dl_jrc.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +logfile=../log/log_pre_jrc.csv +dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle +python main_deep_learning.py $dataset --log-file $logfile --pretrained --plotmode --test-each 20 +python main_deep_learning.py $dataset --log-file $logfile --pretrained --tunable --plotmode --test-each 20 +python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 128 --plotmode --test-each 20 +python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 128 --tunable --plotmode --test-each 20 +python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 256 --plotmode --test-each 20 +python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 256 --tunable --plotmode --test-each 20 + +python main_deep_learning.py $dataset --log-file $logfile --supervised --plotmode --test-each 20 +python main_deep_learning.py $dataset --log-file $logfile --supervised --tunable --plotmode --test-each 20 +python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 128 --plotmode --test-each 20 +python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 128 --tunable --plotmode --test-each 20 +python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 256 --plotmode --test-each 20 +python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 256 --tunable --plotmode --test-each 20 + +python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --plotmode --test-each 20 +python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --tunable --plotmode --test-each 20 +python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 128 --plotmode --test-each 20 +python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 128 --tunable --plotmode 
--test-each 20 +python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 256 --plotmode --test-each 20 +python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 256 --tunable --plotmode --test-each 20 + +python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --plotmode --test-each 20 +python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20 +python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 128 --plotmode --test-each 20 +python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 128 --tunable --plotmode --test-each 20 +python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 256 --plotmode --test-each 20 +python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 256 --tunable --plotmode --test-each 20 \ No newline at end of file diff --git a/src/experiment_scripts/run_dl_rcv.sh b/src/experiment_scripts/run_dl_rcv.sh new file mode 100644 index 0000000..4782887 --- /dev/null +++ b/src/experiment_scripts/run_dl_rcv.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle +python main_deep_learning.py $dataset --pretrained --plotmode --test-each 20 +python main_deep_learning.py $dataset --pretrained --tunable --plotmode --test-each 20 +python main_deep_learning.py $dataset --pretrained --hidden 128 --plotmode --test-each 20 +python main_deep_learning.py $dataset --pretrained --hidden 128 --tunable --plotmode --test-each 20 +python main_deep_learning.py $dataset --pretrained --hidden 256 --plotmode --test-each 20 +python main_deep_learning.py $dataset --pretrained --hidden 256 --tunable --plotmode --test-each 20 + +python main_deep_learning.py $dataset --supervised --plotmode --test-each 20 +python main_deep_learning.py $dataset --supervised --tunable --plotmode --test-each 20 +python main_deep_learning.py $dataset --supervised --hidden 128 --plotmode --test-each 20 +python main_deep_learning.py $dataset --supervised --hidden 128 --tunable --plotmode --test-each 20 +python main_deep_learning.py $dataset --supervised --hidden 256 --plotmode --test-each 20 +python main_deep_learning.py $dataset --supervised --hidden 256 --tunable --plotmode --test-each 20 + +python main_deep_learning.py $dataset --pretrained --supervised --plotmode --test-each 20 +python main_deep_learning.py $dataset --pretrained --supervised --tunable --plotmode --test-each 20 +python main_deep_learning.py $dataset --pretrained --supervised --hidden 128 --plotmode --test-each 20 +python main_deep_learning.py $dataset --pretrained --supervised --hidden 128 --tunable --plotmode --test-each 20 +python main_deep_learning.py $dataset --pretrained --supervised --hidden 256 --plotmode --test-each 20 +python main_deep_learning.py $dataset --pretrained --supervised --hidden 256 --tunable --plotmode --test-each 20 + +python main_deep_learning.py $dataset --pretrained --supervised --posteriors --plotmode --test-each 20 +python main_deep_learning.py $dataset --pretrained --supervised --posteriors --tunable --plotmode --test-each 20 +python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 128 --plotmode --test-each 20 +python main_deep_learning.py 
$dataset --pretrained --supervised --posteriors --hidden 128 --tunable --plotmode --test-each 20 +python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 256 --plotmode --test-each 20 +python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 256 --tunable --plotmode --test-each 20 \ No newline at end of file diff --git a/src/experiment_scripts/run_fulljrc_dl.sh b/src/experiment_scripts/run_fulljrc_dl.sh new file mode 100644 index 0000000..4d5eeaa --- /dev/null +++ b/src/experiment_scripts/run_fulljrc_dl.sh @@ -0,0 +1,16 @@ +dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle +seeds='5' #2 3 4 5 6 7 8 9 10' +for seed in $seeds +do + #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce.csv --supervised --seed $seed + #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_trainable.csv --supervised --tunable --seed $seed + python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_post_wce_muse_static.csv --posteriors --supervised --pretrained --seed $seed --force + + #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_muse.csv --pretrained --seed $seed + #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_muse_trainable.csv --pretrained --tunable --seed $seed + + #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_muse.csv --supervised --pretrained --seed $seed + #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_muse_trainable40000.csv --supervised --pretrained --tunable --seed $seed + #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_post_wce_muse_trainable.csv --posteriors --supervised --pretrained --tunable --seed $seed --force + +done \ No newline at end of file diff --git a/src/experiment_scripts/run_fullrcv_dl.sh b/src/experiment_scripts/run_fullrcv_dl.sh new file mode 100644 index 0000000..5894aef --- /dev/null +++ b/src/experiment_scripts/run_fullrcv_dl.sh @@ -0,0 +1,20 @@ +dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle +seeds='1 ' #2 3 4 5' # 6 7 8 9 10' +for seed in $seeds +do + #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce.csv --supervised --seed $seed + #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_trainable.csv --supervised --tunable --seed $seed + python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_static_plotmode.csv --posteriors --supervised --pretrained --seed $seed --plotmode --test-each 200 + + + + #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_muse.csv --pretrained --seed $seed + #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_muse_trainable.csv --pretrained --tunable --seed $seed + + #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_muse.csv --supervised --pretrained --seed $seed + #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_muse_trainable.csv --supervised --pretrained --tunable --seed $seed + +# python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_static.csv --posteriors --supervised --pretrained --seed $seed +# python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_trainable_plotmode.csv --posteriors --supervised --pretrained --tunable --seed $seed --plotmode --test-each 200 + #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_trainable.csv 
--posteriors --supervised --pretrained --tunable --seed $seed +done \ No newline at end of file diff --git a/src/experiment_scripts/run_traditional_jrc.sh b/src/experiment_scripts/run_traditional_jrc.sh new file mode 100644 index 0000000..460c9e8 --- /dev/null +++ b/src/experiment_scripts/run_traditional_jrc.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash + +dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle + +######################################## POSTERIORS + # Posteriors +python main_multimodal_cls.py $dataset -P # + zscore +python main_multimodal_cls.py $dataset -P -z # +l2norm +python main_multimodal_cls.py $dataset -P -z --l2 # +feature weight + + +######################################### WCE + #WCE supervised +python main_multimodal_cls.py $dataset -S # + zscore +python main_multimodal_cls.py $dataset -S -z # +l2norm +python main_multimodal_cls.py $dataset -S -z --l2 # +feature weight +python main_multimodal_cls.py $dataset -S -z -r --l2 # + SIF - PCA + +python main_multimodal_cls.py $dataset -S -z -p 250 --l2 # +feature weight + pca +python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 # + SIF + +python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight ig # -feature weight +python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight ig +python main_multimodal_cls.py $dataset -S -z -p 250 --l2 --feat-weight ig # + pca +python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 --feat-weight ig + + +python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight pmi +python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight pmi +python main_multimodal_cls.py $dataset -S -z -p 250 --l2 --feat-weight pmi +python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 --feat-weight pmi + +################################# MUSE + + # MUSE unsupervised +python main_multimodal_cls.py $dataset -U # + zscore +python main_multimodal_cls.py $dataset -U -z # +l2norm +python main_multimodal_cls.py $dataset -U -z --l2 # +feature weight +python main_multimodal_cls.py $dataset -U -z -r --l2 # + SIF - PCA + +python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight ig # -feature weight + pca +python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight ig + +python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight pmi +python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight pmi diff --git a/src/experiment_scripts/run_traditional_rcv.sh b/src/experiment_scripts/run_traditional_rcv.sh new file mode 100644 index 0000000..0dcfa2c --- /dev/null +++ b/src/experiment_scripts/run_traditional_rcv.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash + +dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle + +######################################## POSTERIORS + # Posteriors +python main_multimodal_cls.py $dataset -P # + zscore +python main_multimodal_cls.py $dataset -P -z # +l2norm +python main_multimodal_cls.py $dataset -P -z --l2 # +feature weight + + +######################################### WCE + #WCE supervised +python main_multimodal_cls.py $dataset -S # + zscore +python main_multimodal_cls.py $dataset -S -z # +l2norm +python main_multimodal_cls.py $dataset -S -z --l2 # +feature weight +python main_multimodal_cls.py $dataset -S -z -r --l2 # + SIF - PCA + +python main_multimodal_cls.py $dataset -S -z -p 50 --l2 # +feature weight + pca +python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 # + SIF + +python main_multimodal_cls.py $dataset -S -z --l2 
--feat-weight ig # -feature weight +python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight ig +python main_multimodal_cls.py $dataset -S -z -p 50 --l2 --feat-weight ig # + pca +python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 --feat-weight ig + + +python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight pmi +python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight pmi +python main_multimodal_cls.py $dataset -S -z -p 50 --l2 --feat-weight pmi +python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 --feat-weight pmi + +################################# MUSE + + # MUSE unsupervised +python main_multimodal_cls.py $dataset -U # + zscore +python main_multimodal_cls.py $dataset -U -z # +l2norm +python main_multimodal_cls.py $dataset -U -z --l2 # +feature weight +python main_multimodal_cls.py $dataset -U -z -r --l2 # + SIF - PCA + +python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight ig # -feature weight + pca +python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight ig + +python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight pmi +python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight pmi diff --git a/src/experiment_scripts/time_comparison.sh b/src/experiment_scripts/time_comparison.sh new file mode 100644 index 0000000..60e1c25 --- /dev/null +++ b/src/experiment_scripts/time_comparison.sh @@ -0,0 +1,6 @@ +dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle +seeds='1 2 3 4 5 6 7 8 9 10' +for seed in $seeds +do + python main_deep_learning.py $dataset --log-file ../log/time_GRU.csv --supervised --nepochs 50 --seed $seed + done \ No newline at end of file diff --git a/src/learning/learners.py b/src/learning/learners.py index 95f8c2b..0559416 100644 --- a/src/learning/learners.py +++ b/src/learning/learners.py @@ -1,15 +1,15 @@ import numpy as np import time -from embeddings.embeddings import WordEmbeddings, StorageEmbeddings +# from embeddings.embeddings import WordEmbeddings, StorageEmbeddings from scipy.sparse import issparse from sklearn.multiclass import OneVsRestClassifier from sklearn.model_selection import GridSearchCV -from sklearn.model_selection import KFold +# from sklearn.model_selection import KFold from joblib import Parallel, delayed -from sklearn.feature_extraction.text import TfidfVectorizer -from transformers.StandardizeTransformer import StandardizeTransformer -from sklearn.decomposition import PCA -from models.cnn_class_bu import CNN_pdr +# from sklearn.feature_extraction.text import TfidfVectorizer +# from util_transformers.StandardizeTransformer import StandardizeTransformer +# from sklearn.decomposition import PCA +# from models.cnn_class_bu import CNN_pdr def _sort_if_sparse(X): @@ -40,154 +40,154 @@ class TrivialRejector: def best_params(self): return {} -class FunnellingPolylingualClassifier: - """ - This classifier projects each document d into a language-independent feature space where each dimension fi is the - decision score phi_l(d,ci) of an auxiliary classifier phi_l trained on category ci for documents in language l; - then trains one single classifier for all documents in this space, irrespective of their originary language - """ - def __init__(self, first_tier_learner, meta_learner, first_tier_parameters=None, meta_parameters=None, folded_projections=1, - calmode='cal', n_jobs=-1): - """ - :param first_tier_learner: the learner used in the first-tier level - :param meta_learner: the learner used in the second-tier level - :param first_tier_parameters: parameters for 
the learner in the doc_projector - :param meta_parameters: parameters for the learner in the z-space - :param folded_projections: if 1 then the model trains the auxiliar classifiers with all training data and - :param calmode: 'cal' to calibrate the base classifiers, 'nocal' to use the decision_function instead, or - :param n_jobs: number of parallel threads - 'sigmoid' to use the sigmoid of the decision_function - projects the data before training the final classifier; if greater than one, the training set is split in as - many folds as indicated, and the projected space is composed by concatenating each fold prediction based on - models trained on the remaining folds. This should increase the generality of the space to unseen data. - """ - assert folded_projections>0, "positive number of folds expected" - assert calmode in ['cal','nocal','sigmoid'], 'unknown calmode' - assert calmode!='cal' or first_tier_learner.probability, 'calmode=cal requires the learner to have probability=True' - - self.fist_tier_learner = first_tier_learner - self.meta_learner = meta_learner - self.fist_tier_parameters=first_tier_parameters - self.meta_parameters = meta_parameters - self.doc_projector = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs) - self.doc_projector_bu = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs) - self.folded_projections = folded_projections - self.n_jobs = n_jobs - self.calmode = calmode - - def _projection(self, doc_projector, lX): - """ - Decides the projection function to be applied; predict_proba if the base classifiers are calibrated or - decision_function if otherwise - :param doc_projector: the document projector (a NaivePolylingualClassifier) - :param lX: {lang:matrix} to train - :return: the projection, applied with predict_proba or decision_function - """ - if self.calmode=='cal': - return doc_projector.predict_proba(lX) - else: - l_decision_scores = doc_projector.decision_function(lX) - if self.calmode=='sigmoid': - def sigmoid(x): return 1 / (1 + np.exp(-x)) - for lang in l_decision_scores.keys(): - l_decision_scores[lang] = sigmoid(l_decision_scores[lang]) - return l_decision_scores - - def _get_zspace(self, lXtr, lYtr, lXproj=None, lYproj=None): - """ - Produces the vector space of posterior probabilities (in case the first-tier is calibrated) or of - decision scores (if otherwise). This space is here named zspace. - :param lXtr: {lang:matrix} to train - :param lYtr: {lang:labels} to train - :param lXproj: {lang:matrix} to project (if None, then projects the lXtr) - :param lYproj: {lang:labels} to stack in the same order (if None, then lYtr will be stacked) - :return: the projection of lXproj documents into the Z-space defined by the confidence scores of language-specific - models trained on lXtr, and the lYproj labels stacked consistently - """ - repair_empty_folds = True - if lXproj is None and lYproj is None: - lXproj, lYproj = lXtr, lYtr - repair_empty_folds = False - - print('fitting the projectors... 
{}'.format(lXtr.keys())) - self.doc_projector.fit(lXtr, lYtr) - - print('projecting the documents') - langs = list(lXtr.keys()) - lZ = self._projection(self.doc_projector, lXproj) - - # if repair_empty_folds: #empty folds are replaced by the posterior probabilities generated by the non-folded version - empty_categories = self.doc_projector.empty_categories - lZ_bu = self._projection(self.doc_projector_bu, lXproj) - - for lang in langs: - repair = empty_categories[lang] - lZ[lang][:,repair] = lZ_bu[lang][:,repair] - - Z = np.vstack([lZ[lang] for lang in langs]) # Z is the language independent space - zy = np.vstack([lYproj[lang] for lang in langs]) - return Z, zy - - def _get_zspace_folds(self, lX, ly): - self.doc_projector_bu.fit(lX, ly) - - print('split of {} folds'.format(self.folded_projections)) - skf = KFold(n_splits=self.folded_projections, shuffle=True) - - Z, zy = [], [] - lfold = {lang: list(skf.split(lX[lang], ly[lang])) for lang in lX.keys()} - for fold in range(self.folded_projections): - print('fitting the projectors ({}/{})...'.format(fold + 1, self.folded_projections)) - lfoldXtr, lfoldYtr = {}, {} - lfoldXte, lfoldYte = {}, {} - for lang in lX.keys(): - train, test = lfold[lang][fold] - lfoldXtr[lang] = lX[lang][train] - lfoldYtr[lang] = ly[lang][train] - lfoldXte[lang] = lX[lang][test] - lfoldYte[lang] = ly[lang][test] - Zfold, zYfold = self._get_zspace(lfoldXtr, lfoldYtr, lfoldXte, lfoldYte) - Z.append(Zfold) - zy.append(zYfold) - # compose the Z-space as the union of all folded predictions - Z = np.vstack(Z) - zy = np.vstack(zy) - # refit the document projector with all examples to have a more reliable projector for test data - self.doc_projector = self.doc_projector_bu - return Z, zy - - def fit(self, lX, ly, lZ=None, lzy=None): - tinit = time.time() - Z, zy = self._get_zspace(lX, ly) if self.folded_projections == 1 else self._get_zspace_folds(lX, ly) - - #experimental: adds the posterior probabilities (computed outside) to the meta-classifier - if lZ is not None and lzy is not None: - zlangs = list(lZ.keys()) - Z = np.vstack((Z, *[lZ[l] for l in zlangs])) - zy = np.vstack((zy, *[lzy[l] for l in zlangs])) - - print('fitting the Z-space of shape={}'.format(Z.shape)) - self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters, n_jobs=self.n_jobs) - self.model.fit(Z, zy) - self.time = time.time() - tinit - - return self - - def predict(self, lX, lZ=None): - """ - :param lX: a dictionary {language_label: X csr-matrix} - :param lZ: a dictionary {language_label: Z matrix}; if specified, concats this representation - :return: a dictionary of predictions - """ - lZ_ = self._projection(self.doc_projector, lX) - if lZ is not None: - lZ_ = {**lZ_, **lZ} - return _joblib_transform_multiling(self.model.predict, lZ_, n_jobs=self.n_jobs) - - def best_params(self): - params = self.doc_projector.best_params() - params['meta'] = self.model.best_params() - return params +# class FunnellingPolylingualClassifier: +# """ +# This classifier projects each document d into a language-independent feature space where each dimension fi is the +# decision score phi_l(d,ci) of an auxiliary classifier phi_l trained on category ci for documents in language l; +# then trains one single classifier for all documents in this space, irrespective of their originary language +# """ +# def __init__(self, first_tier_learner, meta_learner, first_tier_parameters=None, meta_parameters=None, folded_projections=1, +# calmode='cal', n_jobs=-1): +# """ +# :param 
first_tier_learner: the learner used in the first-tier level +# :param meta_learner: the learner used in the second-tier level +# :param first_tier_parameters: parameters for the learner in the doc_projector +# :param meta_parameters: parameters for the learner in the z-space +# :param folded_projections: if 1 then the model trains the auxiliar classifiers with all training data and +# :param calmode: 'cal' to calibrate the base classifiers, 'nocal' to use the decision_function instead, or +# :param n_jobs: number of parallel threads +# 'sigmoid' to use the sigmoid of the decision_function +# projects the data before training the final classifier; if greater than one, the training set is split in as +# many folds as indicated, and the projected space is composed by concatenating each fold prediction based on +# models trained on the remaining folds. This should increase the generality of the space to unseen data. +# """ +# assert folded_projections>0, "positive number of folds expected" +# assert calmode in ['cal','nocal','sigmoid'], 'unknown calmode' +# assert calmode!='cal' or first_tier_learner.probability, 'calmode=cal requires the learner to have probability=True' +# +# self.fist_tier_learner = first_tier_learner +# self.meta_learner = meta_learner +# self.fist_tier_parameters=first_tier_parameters +# self.meta_parameters = meta_parameters +# self.doc_projector = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs) +# self.doc_projector_bu = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs) +# self.folded_projections = folded_projections +# self.n_jobs = n_jobs +# self.calmode = calmode +# +# def _projection(self, doc_projector, lX): +# """ +# Decides the projection function to be applied; predict_proba if the base classifiers are calibrated or +# decision_function if otherwise +# :param doc_projector: the document projector (a NaivePolylingualClassifier) +# :param lX: {lang:matrix} to train +# :return: the projection, applied with predict_proba or decision_function +# """ +# if self.calmode=='cal': +# return doc_projector.predict_proba(lX) +# else: +# l_decision_scores = doc_projector.decision_function(lX) +# if self.calmode=='sigmoid': +# def sigmoid(x): return 1 / (1 + np.exp(-x)) +# for lang in l_decision_scores.keys(): +# l_decision_scores[lang] = sigmoid(l_decision_scores[lang]) +# return l_decision_scores +# +# def _get_zspace(self, lXtr, lYtr, lXproj=None, lYproj=None): +# """ +# Produces the vector space of posterior probabilities (in case the first-tier is calibrated) or of +# decision scores (if otherwise). This space is here named zspace. +# :param lXtr: {lang:matrix} to train +# :param lYtr: {lang:labels} to train +# :param lXproj: {lang:matrix} to project (if None, then projects the lXtr) +# :param lYproj: {lang:labels} to stack in the same order (if None, then lYtr will be stacked) +# :return: the projection of lXproj documents into the Z-space defined by the confidence scores of language-specific +# models trained on lXtr, and the lYproj labels stacked consistently +# """ +# repair_empty_folds = True +# if lXproj is None and lYproj is None: +# lXproj, lYproj = lXtr, lYtr +# repair_empty_folds = False +# +# print('fitting the projectors... 
{}'.format(lXtr.keys())) +# self.doc_projector.fit(lXtr, lYtr) +# +# print('projecting the documents') +# langs = list(lXtr.keys()) +# lZ = self._projection(self.doc_projector, lXproj) +# +# # if repair_empty_folds: #empty folds are replaced by the posterior probabilities generated by the non-folded version +# empty_categories = self.doc_projector.empty_categories +# lZ_bu = self._projection(self.doc_projector_bu, lXproj) +# +# for lang in langs: +# repair = empty_categories[lang] +# lZ[lang][:,repair] = lZ_bu[lang][:,repair] +# +# Z = np.vstack([lZ[lang] for lang in langs]) # Z is the language independent space +# zy = np.vstack([lYproj[lang] for lang in langs]) +# return Z, zy +# +# def _get_zspace_folds(self, lX, ly): +# self.doc_projector_bu.fit(lX, ly) +# +# print('split of {} folds'.format(self.folded_projections)) +# skf = KFold(n_splits=self.folded_projections, shuffle=True) +# +# Z, zy = [], [] +# lfold = {lang: list(skf.split(lX[lang], ly[lang])) for lang in lX.keys()} +# for fold in range(self.folded_projections): +# print('fitting the projectors ({}/{})...'.format(fold + 1, self.folded_projections)) +# lfoldXtr, lfoldYtr = {}, {} +# lfoldXte, lfoldYte = {}, {} +# for lang in lX.keys(): +# train, test = lfold[lang][fold] +# lfoldXtr[lang] = lX[lang][train] +# lfoldYtr[lang] = ly[lang][train] +# lfoldXte[lang] = lX[lang][test] +# lfoldYte[lang] = ly[lang][test] +# Zfold, zYfold = self._get_zspace(lfoldXtr, lfoldYtr, lfoldXte, lfoldYte) +# Z.append(Zfold) +# zy.append(zYfold) +# # compose the Z-space as the union of all folded predictions +# Z = np.vstack(Z) +# zy = np.vstack(zy) +# # refit the document projector with all examples to have a more reliable projector for test data +# self.doc_projector = self.doc_projector_bu +# return Z, zy +# +# def fit(self, lX, ly, lZ=None, lzy=None): +# tinit = time.time() +# Z, zy = self._get_zspace(lX, ly) if self.folded_projections == 1 else self._get_zspace_folds(lX, ly) +# +# #experimental: adds the posterior probabilities (computed outside) to the meta-classifier +# if lZ is not None and lzy is not None: +# zlangs = list(lZ.keys()) +# Z = np.vstack((Z, *[lZ[l] for l in zlangs])) +# zy = np.vstack((zy, *[lzy[l] for l in zlangs])) +# +# print('fitting the Z-space of shape={}'.format(Z.shape)) +# self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters, n_jobs=self.n_jobs) +# self.model.fit(Z, zy) +# self.time = time.time() - tinit +# +# return self +# +# def predict(self, lX, lZ=None): +# """ +# :param lX: a dictionary {language_label: X csr-matrix} +# :param lZ: a dictionary {language_label: Z matrix}; if specified, concats this representation +# :return: a dictionary of predictions +# """ +# lZ_ = self._projection(self.doc_projector, lX) +# if lZ is not None: +# lZ_ = {**lZ_, **lZ} +# return _joblib_transform_multiling(self.model.predict, lZ_, n_jobs=self.n_jobs) +# +# def best_params(self): +# params = self.doc_projector.best_params() +# params['meta'] = self.model.best_params() +# return params class NaivePolylingualClassifier: @@ -322,411 +322,4 @@ class MonolingualClassifier: return self.model.predict(X) def best_params(self): - return self.best_params_ - - -class FunnellingMultimodal(FunnellingPolylingualClassifier): - def __init__(self, - we_path, - config, - first_tier_learner, - meta_learner, - first_tier_parameters=None, - meta_parameters=None, - folded_projections=1, - calmode='cal', - n_jobs=-1): - - super().__init__(first_tier_learner, - meta_learner, - first_tier_parameters, - 
meta_parameters, - folded_projections, - calmode, - n_jobs) - - self.pca_independent_space = PCA(n_components=50) - self.we_path = we_path - self.config = config - self.lang_word2idx = dict() - self.languages = [] - self.lang_tfidf = {} - self.embedding_space = None - self.model = None - self.time = None - self.best_components = 'not set' # if auto optimize pca, it will store the optimal number of components - - def vectorize(self, lX, prediction=False): - langs = list(lX.keys()) - print(f'# tfidf-vectorizing docs') - if prediction: - - for lang in langs: - assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language' - tfidf_vectorizer = self.lang_tfidf[lang] - lX[lang] = tfidf_vectorizer.transform(lX[lang]) - return self - - for lang in langs: - tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True) - self.languages.append(lang) - tfidf_vectorizer.fit(lX[lang]) - lX[lang] = tfidf_vectorizer.transform(lX[lang]) - self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_ - self.lang_tfidf[lang] = tfidf_vectorizer - return self - - def _get_zspace(self, lXtr, lYtr): - print('\nfitting the projectors... {}'.format(list(lXtr.keys()))) - self.doc_projector.fit(lXtr, lYtr) - - print('\nprojecting the documents') - lZ = self._projection(self.doc_projector, lXtr) - - return lZ, lYtr - - def fit(self, lX, ly): - tinit = time.time() - print('Vectorizing documents...') - self.vectorize(lX) - - for lang in self.languages: - print(f'{lang}->{lX[lang].shape}') - - Z, zy = self._get_zspace(lX, ly) - - if self.config['supervised'] or self.config['unsupervised']: - self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly) - _embedding_space = self.embedding_space.transform(self.config, lX) - if self.config['max_label_space'] == 0: - _cum_dimension = _embedding_space[list(_embedding_space.keys())[0]].shape[1] - if _cum_dimension - 300 > 0: - _temp = _cum_dimension - 300 - else: - _temp = _cum_dimension - self.best_components = _temp - # h_stacking posterior probabilities with (U) and/or (S) matrices - for lang in self.languages: - Z[lang] = np.hstack((Z[lang], _embedding_space[lang])) - - # stacking Z space vertically - _vertical_Z = np.vstack([Z[lang] for lang in self.languages]) - _vertical_Zy = np.vstack([zy[lang] for lang in self.languages]) - - self.standardizer = StandardizeTransformer() - _vertical_Z = self.standardizer.fit_transform(_vertical_Z) - - # todo testing ... - # if self.config['post_pca']: - # print(f'Applying PCA({"dim ?? 
TODO"}) to Z-space ...') - # self.pca_independent_space.fit(_vertical_Z) - # _vertical_Z = self.pca_independent_space.transform(_vertical_Z) - - print('fitting the Z-space of shape={}'.format(_vertical_Z.shape)) - self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters, - n_jobs=self.n_jobs) - self.model.fit(_vertical_Z, _vertical_Zy) - self.time = time.time() - tinit - print(f'\nTotal training time elapsed: {round((self.time/60), 2)} min') - - def predict(self, lX, ly): - print('Vectorizing documents') - self.vectorize(lX, prediction=True) - lZ = self._projection(self.doc_projector, lX) - - if self.config['supervised'] or self.config['unsupervised']: - _embedding_space = self.embedding_space.transform(self.config, lX) - - for lang in lX.keys(): - lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang])) - - for lang in lZ.keys(): - print(lZ[lang].shape) - # todo testing - lZ[lang] = self.standardizer.transform(lZ[lang]) - # if self.config['post_pca']: - # print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...') - # lZ[lang] = self.pca_independent_space.transform(lZ[lang]) - - return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs) - - -class PolylingualEmbeddingsClassifier: - """ - This classifier creates document embeddings by a tfidf weighted average of polylingual embeddings from the article - @article{conneau2017word, - title={Word translation without parallel data}, - author={Conneau, Alexis and Lample, Guillaume and Ranzato, Marc'Aurelio and Denoyer, Ludovic and J{\'e}gou, Herv{\'e}}, - journal={arXiv preprint arXiv:1710.04087}, - year={2017} - } - url: https://github.com/facebookresearch/MUSE - """ - def __init__(self, wordembeddings_path, config, learner, c_parameters=None, n_jobs=-1): - """ - :param wordembeddings_path: the path to the directory containing the polylingual embeddings - :param learner: the learner - :param c_parameters: parameters for learner - :param n_jobs: the number of concurrent threads - """ - self.wordembeddings_path = wordembeddings_path - self.config = config - self.learner = learner - self.c_parameters=c_parameters - self.n_jobs = n_jobs - self.lang_tfidf = {} - self.model = None - self.languages = [] - self.lang_word2idx = dict() - self.embedding_space = None - - def fit_vectorizers(self, lX): - for lang in lX.keys(): - if lang not in self.lang_tfidf: - tfidf = TfidfVectorizer(sublinear_tf=True, use_idf=True) # text is already processed - docs = lX[lang] - tfidf.fit(docs) - self.lang_tfidf[lang] = tfidf - - - def vectorize(self, lX, prediction=False): - langs = list(lX.keys()) - print(f'# tfidf-vectorizing docs') - if prediction: - - for lang in langs: - assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language' - tfidf_vectorizer = self.lang_tfidf[lang] - lX[lang] = tfidf_vectorizer.transform(lX[lang]) - return self - - for lang in langs: - tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True) - self.languages.append(lang) - tfidf_vectorizer.fit(lX[lang]) - lX[lang] = tfidf_vectorizer.transform(lX[lang]) - self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_ - self.lang_tfidf[lang] = tfidf_vectorizer - return self - - def embed(self, docs, lang): - assert lang in self.lang_tfidf, 'unknown language' - tfidf_vectorizer = self.lang_tfidf[lang] - V = tfidf_vectorizer.vocabulary_ - Xweights = tfidf_vectorizer.transform(docs) - - print('loading word embeddings for ' + lang) - we = WordEmbeddings.load(self.wordembeddings_path, lang) - - nD = len(docs) - doc_vecs = 
np.zeros((nD, we.dim())) - - for i, doc in enumerate(docs): - print('\r\tcomplete {:.3f}%'.format(100 * (i + 1) / nD), end='') - # averaging with tfidf (summing each word only once, since the frequency is already controlled) - for w in set(doc.split()): - if w in we and w in V: - doc_vecs[i] += (we[w] * Xweights[i, V[w]]) - # works much worse with idf; works much worse with document l2-normalization - print() - - return doc_vecs - - def fit(self, lX, ly): - """ - :param lX: a dictionary {language_label: [list of preprocessed documents]} - :param ly: a dictionary {language_label: ndarray of shape (ndocs, ncats) binary labels} - :return: self - """ - tinit = time.time() - langs = list(lX.keys()) - WEtr, Ytr = [], [] - # self.fit_vectorizers(lX) # if already fit, does nothing - self.vectorize(lX) - # config = {'unsupervised' : False, 'supervised': True} - self.embedding_space = StorageEmbeddings(self.wordembeddings_path).fit(self.config, lX, self.lang_word2idx, ly) - WEtr = self.embedding_space.transform(self.config, lX) - # for lang in langs: - # WEtr.append(self.embed(lX[lang], lang)) # todo embed with other matrices - # Ytr.append(ly[lang]) - - WEtr = np.vstack([WEtr[lang] for lang in langs]) - Ytr = np.vstack([ly[lang] for lang in langs]) - self.embed_time = time.time() - tinit - - print('fitting the WE-space of shape={}'.format(WEtr.shape)) - self.model = MonolingualClassifier(base_learner=self.learner, parameters=self.c_parameters, n_jobs=self.n_jobs) - self.model.fit(WEtr, Ytr) - self.time = time.time() - tinit - return self - - def predict(self, lX, lY): - """ - :param lX: a dictionary {language_label: [list of preprocessed documents]} - """ - assert self.model is not None, 'predict called before fit' - self.vectorize(lX, prediction=True) - langs = list(lX.keys()) - lWEte = self.embedding_space.transform(self.config, lX) - # lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory - return _joblib_transform_multiling(self.model.transform, lWEte, n_jobs=self.n_jobs) - - def predict_proba(self, lX): - """ - :param lX: a dictionary {language_label: [list of preprocessed documents]} - """ - assert self.model is not None, 'predict called before fit' - langs = list(lX.keys()) - lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory - return _joblib_transform_multiling(self.model.predict_proba, lWEte, n_jobs=self.n_jobs) - - def best_params(self): - return self.model.best_params() - - -class MonolingualNetSvm: - """ - testing: funnelling with NN managing word embeddings compositionality. An ensemble of n-SVMs (n equals to the - number of training languages) is first fit on the data, generating the documents projection in the Z-space. Next, - the projection are fed to a single NN with their respective document embeddings. The documents are projected into - the embedding space while preserving their dimensionality (output dim is 300). These projection are horizonatally - concatenated with the respective projection and passed through a fC layer with sigmoid act and output dim equal - to the number of target classes. 
- # TODO ATM testing with only 1 language - """ - def __init__(self, lX, ly, first_tier_learner, first_tier_parameters, n_jobs): - self.lX = lX - self.ly = ly - # SVM Attributes - self.doc_projector = NaivePolylingualClassifier(first_tier_learner, first_tier_parameters, - n_jobs=n_jobs) - self.calmode = 'cal' - self.languages = [] - self.lang_word2idx = dict() - self.lang_tfidf = {} - self.base_learner = 'TODO' - self.parameters = 'TODO' - # NN Attributes - self.NN = 'TODO' - - - def load_preprocessed(self): - """ - in order to speed up the process, documents are first tokenized in the "main". Here, tokenized docs, word_index, and - targets are loaded. - :return: dict[lang] = (word_index, tokenized_docs, targets) - """ - import pickle - with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'rb') as f: - return pickle.load(f) - - def _build_embedding_matrix(self, lang, word_index): - """ - build embedding matrix by filtering out OOV embeddings - :param lang: - :param word_index: - :return: filtered embedding matrix - """ - from embeddings.embeddings import EmbeddingsAligned - type = 'MUSE' - path = '/home/andreapdr/CLESA/' - MUSE = EmbeddingsAligned(type, path, lang, word_index.keys()) - return MUSE - - def get_data_and_embed(self, data_dict): - from keras.preprocessing.sequence import pad_sequences - - langs = data_dict.keys() - lang_embedding_matrix = dict() - nn_lXtr = dict() - nn_lytr = dict() - - for lang in langs: - lang_embedding_matrix[lang] = self._build_embedding_matrix(lang, data_dict[lang][0]) - nn_lXtr[lang] = pad_sequences(data_dict[lang][1], 100, padding='post') - nn_lytr[lang] = [data_dict[lang][2]] - - return nn_lXtr, nn_lytr, lang_embedding_matrix - - def svm_vectorize(self, lX, prediction=False): - langs = list(lX.keys()) - print(f'# tfidf-vectorizing docs') - if prediction: - for lang in langs: - assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language' - tfidf_vectorizer = self.lang_tfidf[lang] - lX[lang] = tfidf_vectorizer.transform(lX[lang]) - return self - for lang in langs: - tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True) - self.languages.append(lang) - tfidf_vectorizer.fit(lX[lang]) - lX[lang] = tfidf_vectorizer.transform(lX[lang]) - self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_ - self.lang_tfidf[lang] = tfidf_vectorizer - return lX - - def _get_zspace(self, lXtr, lYtr): - print('\nfitting the projectors... {}'.format(list(lXtr.keys()))) - self.doc_projector.fit(lXtr, lYtr) - - print('\nprojecting the documents') - lZ = self._projection(self.doc_projector, lXtr) - - return lZ, lYtr - - def _projection(self, doc_projector, lX): - """ - Decides the projection function to be applied; predict_proba if the base classifiers are calibrated or - decision_function if otherwise - :param doc_projector: the document projector (a NaivePolylingualClassifier) - :param lX: {lang:matrix} to train - :return: the projection, applied with predict_proba or decision_function - """ - if self.calmode=='cal': - return doc_projector.predict_proba(lX) - else: - l_decision_scores = doc_projector.decision_function(lX) - if self.calmode=='sigmoid': - def sigmoid(x): return 1 / (1 + np.exp(-x)) - for lang in l_decision_scores.keys(): - l_decision_scores[lang] = sigmoid(l_decision_scores[lang]) - return l_decision_scores - - def fit(self): - """ - # 1. Fit SVM to generate posterior probabilities: - # 1.1 Gather documents and vectorize them as in other SVM classifiers - # 2. 
Fit NN - # 2.1 Gather documents and build NN dataset by indexing wrt embedding matrix - # 2.2 Fit NN first-layer to generate compositional doc embedding - # 2.3 H-stack doc-embed and posterior P - # 2.4 Feed stacked vector to output layer (sigmoid act): output Nc - # 2.5 Train it... - """ - - # load pre-processed data - data_dict = self.load_preprocessed() - # build embedding matrices and neural network document training set - nn_lXtr, nn_lytr, lang_embedding_matrix = self.get_data_and_embed(data_dict) - # TF-IDF vectorzing documents for SVM classifier - svm_lX = self.svm_vectorize(self.lX) - - # just testing on a smaller subset of data - test_svm_lX = dict() - test_svm_ly = dict() - test_svm_lX['it'] = svm_lX['it'][:10, :] - test_svm_ly['it'] = self.ly['it'][:10, :] - test_nn_data = nn_lXtr['it'][:10] - - # projecting document into Z space by SVM - svm_Z, _ = self._get_zspace(test_svm_lX, test_svm_ly) - - # initializing net and forward pass - net = CNN_pdr(73, 1, 300, len(lang_embedding_matrix['it'].vectors), 300, lang_embedding_matrix['it'].vectors) - out = net.forward(test_nn_data, svm_Z['it']) - - print('TODO') - - def net(self): - pass \ No newline at end of file + return self.best_params_ \ No newline at end of file diff --git a/src/learning/transformers.py b/src/learning/transformers.py index 190c32d..29d35c8 100644 --- a/src/learning/transformers.py +++ b/src/learning/transformers.py @@ -10,7 +10,7 @@ import time from sklearn.decomposition import PCA from joblib import Parallel, delayed from scipy.sparse import issparse, vstack, hstack -from transformers.StandardizeTransformer import StandardizeTransformer +from util_transformers.StandardizeTransformer import StandardizeTransformer from util.SIF_embed import remove_pc from sklearn.preprocessing import normalize from sklearn.svm import SVC @@ -127,22 +127,26 @@ class PosteriorProbabilitiesEmbedder: print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} the documents') return self.doc_projector.predict_proba(lX) + def _get_output_dim(self): + return len(self.doc_projector.model['da'].model.classes_) + class MuseEmbedder: - def __init__(self, path, lV=None, l2=True, n_jobs=-1, featureweight=FeatureWeight()): + def __init__(self, path, lV=None, l2=True, n_jobs=-1, featureweight=FeatureWeight(), sif=False): self.path=path self.lV = lV self.l2 = l2 self.n_jobs = n_jobs self.featureweight = featureweight + self.sif = sif def fit(self, lX, ly, lV=None): assert lV is not None or self.lV is not None, 'lV not specified' self.langs = sorted(lX.keys()) self.MUSE = load_muse_embeddings(self.path, self.langs, self.n_jobs) lWordList = {l:self._get_wordlist_from_word2index(lV[l]) for l in self.langs} - self.MUSE = {l:Muse.extract(lWordList[l]).numpy() for l,Muse in self.MUSE} + self.MUSE = {l:Muse.extract(lWordList[l]).numpy() for l,Muse in self.MUSE.items()} self.featureweight.fit(lX, ly) return self @@ -150,7 +154,7 @@ class MuseEmbedder: MUSE = self.MUSE lX = self.featureweight.transform(lX) XdotMUSE = Parallel(n_jobs=self.n_jobs)( - delayed(XdotM)(lX[lang], MUSE[lang]) for lang in self.langs + delayed(XdotM)(lX[lang], MUSE[lang], self.sif) for lang in self.langs ) lMuse = {l: XdotMUSE[i] for i, l in enumerate(self.langs)} lMuse = _normalize(lMuse, self.l2) @@ -162,14 +166,18 @@ class MuseEmbedder: def _get_wordlist_from_word2index(self, word2index): return list(zip(*sorted(word2index.items(), key=lambda x: x[1])))[0] + def _get_output_dim(self): + return self.MUSE['da'].shape[1] + class WordClassEmbedder: - def 
__init__(self, l2=True, n_jobs=-1, max_label_space=300, featureweight=FeatureWeight()): + def __init__(self, l2=True, n_jobs=-1, max_label_space=300, featureweight=FeatureWeight(), sif=False): self.n_jobs = n_jobs self.l2 = l2 self.max_label_space=max_label_space self.featureweight = featureweight + self.sif = sif def fit(self, lX, ly, lV=None): self.langs = sorted(lX.keys()) @@ -184,7 +192,7 @@ class WordClassEmbedder: lWCE = self.lWCE lX = self.featureweight.transform(lX) XdotWCE = Parallel(n_jobs=self.n_jobs)( - delayed(XdotM)(lX[lang], lWCE[lang])for lang in self.langs + delayed(XdotM)(lX[lang], lWCE[lang], self.sif)for lang in self.langs ) lwce = {l: XdotWCE[i] for i, l in enumerate(self.langs)} lwce = _normalize(lwce, self.l2) @@ -193,6 +201,9 @@ class WordClassEmbedder: def fit_transform(self, lX, ly, lV=None): return self.fit(lX, ly).transform(lX) + def _get_output_dim(self): + return 73 + class DocEmbedderList: @@ -201,6 +212,7 @@ class DocEmbedderList: if len(embedder_list)==0: embedder_list=[] self.embedders = embedder_list self.aggregation = aggregation + print(f'Aggregation mode: {self.aggregation}') def fit(self, lX, ly, lV=None): for transformer in self.embedders: @@ -238,16 +250,25 @@ class DocEmbedderList: langs = sorted(lX.keys()) lZparts = {l: None for l in langs} + # min_dim = min([transformer._get_output_dim() for transformer in self.embedders]) + min_dim = 300 for transformer in self.embedders: lZ = transformer.transform(lX) + nC = min([lZ[lang].shape[1] for lang in langs]) for l in langs: Z = lZ[l] + if Z.shape[1] > min_dim: + print(f'Space Z matrix has more dimensions ({Z.shape[1]}) than the smallest representation {min_dim}.' + f'Applying PCA(n_components={min_dim})') + pca = PCA(n_components=min_dim) + Z = pca.fit(Z).transform(Z) if lZparts[l] is None: lZparts[l] = Z else: lZparts[l] += Z n_transformers = len(self.embedders) + nC = min([lZparts[lang].shape[1] for lang in langs]) return {l:lZparts[l] / n_transformers for l in langs} @@ -266,7 +287,7 @@ class FeatureSet2Posteriors: self.transformer = transformer self.l2=l2 self.n_jobs = n_jobs - self.prob_classifier = MetaClassifier(SVC(kernel='rbf', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs) + self.prob_classifier = MetaClassifier(SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs) def fit(self, lX, ly, lV=None): if lV is None and hasattr(self.transformer, 'lV'): @@ -412,11 +433,13 @@ def word_class_embedding_matrix(X, Y, max_label_space=300): return WCE -def XdotM(X,M): +def XdotM(X,M, sif): # return X.dot(M) - # print(f'X={X.shape}, M={M.shape}') + print(f'X={X.shape}, M={M.shape}') E = X.dot(M) - E = remove_pc(E, npc=1) + if sif: + print("removing pc...") + E = remove_pc(E, npc=1) return E diff --git a/src/main_deep.py b/src/main_deep.py deleted file mode 100644 index 156d726..0000000 --- a/src/main_deep.py +++ /dev/null @@ -1,92 +0,0 @@ -from optparse import OptionParser -from util.results import PolylingualClassificationResults -from dataset_builder import MultilingualDataset -from keras.preprocessing.text import Tokenizer -from learning.learners import MonolingualNetSvm -from sklearn.svm import SVC -import pickle - -parser = OptionParser() - -parser.add_option("-d", "--dataset", dest="dataset", - help="Path to the multilingual dataset processed and stored in .pickle format", - default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle") - -parser.add_option("-c", "--optimc", dest="optimc", 
action='store_true', - help="Optimize hyperparameters", default=False) - -parser.add_option("-s", "--set_c", dest="set_c",type=float, - help="Set the C parameter", default=1) - -(op, args) = parser.parse_args() - - -################################################################################################################### - -def get_learner(calibrate=False, kernel='linear'): - return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto') - - -def get_params(dense=False): - if not op.optimc: - return None - c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1] - kernel = 'rbf' if dense else 'linear' - return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}] - - -# PREPROCESS TEXT AND SAVE IT ... both for SVM and NN -def preprocess_data(lXtr, lXte, lytr, lyte): - tokenized_tr = dict() - tokenized_te = dict() - for lang in lXtr.keys(): - alltexts = ' '.join(lXtr[lang]) - tokenizer = Tokenizer() - tokenizer.fit_on_texts(alltexts.split(' ')) - tokenizer.oov_token = len(tokenizer.word_index)+1 - # dumping train set - sequences_tr = tokenizer.texts_to_sequences(lXtr[lang]) - tokenized_tr[lang] = (tokenizer.word_index, sequences_tr, lytr[lang]) - # dumping test set - sequences_te = tokenizer.texts_to_sequences(lXte[lang]) - tokenized_te[lang] = (tokenizer.word_index, sequences_te, lyte[lang]) - - with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'wb') as f: - pickle.dump(tokenized_tr, f) - - with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_test.pickle', 'wb') as f: - pickle.dump(tokenized_tr, f) - - print('Successfully dumped data') - -# def load_preprocessed(): -# with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'rb') as f: -# return pickle.load(f) -# -# def build_embedding_matrix(lang, word_index): -# type = 'MUSE' -# path = '/home/andreapdr/CLESA/' -# MUSE = EmbeddingsAligned(type, path, lang, word_index.keys()) -# return MUSE - - -########## MAIN ################################################################################################# - -if __name__ == '__main__': - results = PolylingualClassificationResults('./results/NN_FPEC_results.csv') - data = MultilingualDataset.load(op.dataset) - lXtr, lytr = data.training() - lXte, lyte = data.test() - - if op.set_c != -1: - meta_parameters = None - else: - meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}] - - test_architecture = MonolingualNetSvm(lXtr, - lytr, - first_tier_learner=get_learner(calibrate=True), - first_tier_parameters=None, - n_jobs=1) - - test_architecture.fit() diff --git a/src/main_deep_learning.py b/src/main_deep_learning.py index 5fc2a94..d330b04 100755 --- a/src/main_deep_learning.py +++ b/src/main_deep_learning.py @@ -1,6 +1,6 @@ import argparse import torch.nn as nn -from torch.optim.lr_scheduler import StepLR +from torch.optim.lr_scheduler import StepLR, MultiStepLR from dataset_builder import MultilingualDataset from learning.transformers import load_muse_embeddings from models.lstm_class import RNNMultilingualClassifier @@ -9,8 +9,6 @@ from util.early_stop import EarlyStopping from util.common import * from util.file import create_if_not_exist from time import time -from embeddings.pretrained import * -from os.path import join from tqdm import tqdm from util.evaluation import evaluate from util.file import get_file_name @@ -100,7 +98,7 @@ def main(): # Loading the dataset data = MultilingualDataset.load(opt.dataset) - # data.set_view(languages=['de', 'fr', 'sv', 'da', 'es', 
'it']) + data.set_view(languages=['de', 'fr']) #, 'it', 'en']) # 'sv', 'da', 'es', 'it']) data.show_dimensions() langs = data.langs() l_devel_raw, l_devel_target = data.training(target_as_csr=True) @@ -108,6 +106,7 @@ def main(): # Loading the MUSE pretrained embeddings (only if requested) lpretrained, lpretrained_vocabulary = load_pretrained_embeddings(opt.we_path, langs) + # lpretrained_vocabulary = none_dict(langs) # do not keep track of words known in pretrained embeddings vocabulary that are also present in test set # Data preparation: indexing / splitting / embedding matrices (pretrained + supervised) / posterior probs multilingual_index = MultilingualIndex() @@ -115,10 +114,26 @@ def main(): multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=opt.seed) multilingual_index.embedding_matrices(lpretrained, opt.supervised) if opt.posteriors: - lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=opt.svm_max_docs) + lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=5000, store_posteriors=True) #stored_post=True) #opt.svm_max_docs) else: lPtr, lPva, lPte = None, None, None + # just_test = False + # if just_test: + # + # model = torch.load( + # '../checkpoint/rnn(H512)-Muse-WCE-Posteriors-(trainable)-jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle') + # criterion = torch.nn.BCEWithLogitsLoss().cuda() + # + # # batcher_train = Batch(opt.batch_size, batches_per_epoch=10, languages=langs, lpad=multilingual_index.l_pad()) + # + # batcher_eval = Batch(opt.batch_size, batches_per_epoch=-1, languages=langs, lpad=multilingual_index.l_pad()) + # l_test_index = multilingual_index.l_test_index() + # epoch = 1 + # tinit = time() + # test(model, batcher_eval, l_test_index, lPte, l_test_target, tinit, epoch, logfile, criterion, 'te') + # exit('Loaded') + # Model initialization model = init_Net(data.num_categories(), multilingual_index) @@ -130,7 +145,7 @@ def main(): tinit = time() create_if_not_exist(opt.checkpoint_dir) - early_stop = EarlyStopping(model, patience=opt.patience, checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}') + early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience, checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}') l_train_index, l_train_target = multilingual_index.l_train() l_val_index, l_val_target = multilingual_index.l_val() @@ -155,7 +170,6 @@ def main(): break # training is over - # restores the best model according to the Mf1 of the validation set (only when plotmode==False) # stoptime = early_stop.stop_time - tinit # stopepoch = early_stop.best_epoch @@ -164,6 +178,8 @@ def main(): if opt.plotmode==False: print('-' * 80) print('Training over. 
Performing final evaluation') + + # torch.cuda.empty_cache() model = early_stop.restore_checkpoint() if opt.val_epochs>0: @@ -183,10 +199,14 @@ def get_lr(optimizer): def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile, criterion, optim, epoch, method_name): + _dataset_path = opt.dataset.split('/')[-1].split('_') + dataset_id = _dataset_path[0] + _dataset_path[-1] + loss_history = [] model.train() for idx, (batch, post, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, lytr)): optim.zero_grad() + _out = model(batch,post, lang) loss = criterion(model(batch, post, lang), target) loss.backward() clip_gradient(model) @@ -195,7 +215,7 @@ def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile, if idx % opt.log_interval == 0: interval_loss = np.mean(loss_history[-opt.log_interval:]) - print(f'{opt.dataset} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}') + print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}') mean_loss = np.mean(interval_loss) logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit) @@ -203,6 +223,8 @@ def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile, def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logfile, criterion, measure_prefix): + + loss_history = [] model.eval() langs = sorted(ltest_index.keys()) predictions = {l:[] for l in langs} @@ -214,6 +236,7 @@ def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logf prediction = predict(logits) predictions[lang].append(prediction) yte_stacked[lang].append(target.detach().cpu().numpy()) + loss_history.append(loss) ly = {l:np.vstack(yte_stacked[l]) for l in langs} ly_ = {l:np.vstack(predictions[l]) for l in langs} @@ -224,17 +247,15 @@ def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logf metrics.append([macrof1, microf1, macrok, microk]) if measure_prefix=='te': print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') - # results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'], - # (config['max_label_space'], classifier.best_components), - # config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time, - # lang, macrof1, microf1, macrok, microk, '') Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0) print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]') - # logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=tend) - # logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mf1, timelapse=tend) - # logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-accuracy', value=acc, timelapse=tend) - # logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=loss, timelapse=tend) + mean_loss = np.mean(loss_history) + logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit) + logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit) + logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit) + logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit) + logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - 
tinit) return Mf1 diff --git a/src/main_majorityvoting_cls.py b/src/main_majorityvoting_cls.py index 607c409..0ae4f1e 100644 --- a/src/main_majorityvoting_cls.py +++ b/src/main_majorityvoting_cls.py @@ -1,7 +1,7 @@ import os from dataset_builder import MultilingualDataset # from learning.learners import * -from learning.learners import FunnellingMultimodal +# from learning.learners import FunnellingMultimodal from learning.transformers import Funnelling, PosteriorProbabilitiesEmbedder, MetaClassifier, \ TfidfVectorizerMultilingual, DocEmbedderList, WordClassEmbedder, MuseEmbedder, FeatureSet2Posteriors, Voting from util.evaluation import * @@ -14,14 +14,14 @@ from sklearn.linear_model import LogisticRegression, LogisticRegressionCV parser = OptionParser() -parser.add_option("-d", "--dataset", dest="dataset", - help="Path to the multilingual dataset processed and stored in .pickle format", - default="../rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle") +# parser.add_option("-d", "--dataset", dest="dataset", +# help="Path to the multilingual dataset processed and stored in .pickle format", +# default="../rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle") parser.add_option("-o", "--output", dest="output", help="Result file", type=str, default='./results/results.csv') -parser.add_option("-P", "--probs", dest="probs", action='store_true', +parser.add_option("-P", "--probs", dest="posteriors", action='store_true', help="Add posterior probabilities to the document embedding representation", default=False) parser.add_option("-S", "--supervised", dest="supervised", action='store_true', @@ -46,6 +46,9 @@ parser.add_option("-p", "--pca", dest="max_labels_S", type=int, help="If smaller than number of target classes, PCA will be applied to supervised matrix. ", default=300) +parser.add_option("-r", "--remove-pc", dest="sif", action='store_true', + help="Remove common component when computing dot product of word embedding matrices", default=False) + # parser.add_option("-u", "--upca", dest="max_labels_U", type=int, # help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix." # " If set to 0 it will automatically search for the best number of components", default=300) @@ -72,15 +75,18 @@ def get_params(dense=False): if __name__ == '__main__': (op, args) = parser.parse_args() - assert exists(op.dataset), 'Unable to find file '+str(op.dataset) - assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option' - assert op.probs or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed' + assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)' + dataset = args[0] - dataset_file = os.path.basename(op.dataset) + assert exists(dataset), 'Unable to find file '+str(dataset) + assert not (op.set_c != 1. 
and op.optimc), 'Parameter C cannot be defined along with optim_c option' + assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed' + + dataset_file = os.path.basename(dataset) results = PolylingualClassificationResults(op.output) - data = MultilingualDataset.load(op.dataset) + data = MultilingualDataset.load(dataset) data.show_dimensions() lXtr, lytr = data.training() @@ -88,8 +94,9 @@ if __name__ == '__main__': meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}] - result_id = f'{dataset_file}_Prob{op.probs}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}' - + # result_id = f'{dataset_file}_Prob{op.posteriors}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}' + result_id = f'{dataset_file}_ProbPost={op.posteriors}_WCE={op.supervised}(PCA={op.max_labels_S})_' \ + f'MUSE={op.pretrained}_weight={"todo"}_l2={"todo"}_zscore={"todo"}{"_optimC" if op.optimc else ""}' print(f'{result_id}') # text preprocessing @@ -100,7 +107,7 @@ if __name__ == '__main__': lV = tfidfvectorizer.vocabulary() classifiers = [] - if op.probs: + if op.posteriors: classifiers.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None)) if op.supervised: classifiers.append(FeatureSet2Posteriors(WordClassEmbedder(max_label_space=op.max_labels_S))) @@ -115,13 +122,37 @@ if __name__ == '__main__': print('\n# Evaluating ...') l_eval = evaluate_method(classifier, lXte, lyte) + # renaming arguments to be printed on log + _id = '' + _id_conf = [op.posteriors, op.supervised, op.pretrained] + _id_name = ['+P', '+W', '+M'] + for i, conf in enumerate(_id_conf): + if conf: + _id += _id_name[i] + _id = _id.lstrip('+') + _dataset_path = dataset.split('/')[-1].split('_') + dataset_id = _dataset_path[0] + _dataset_path[-1] + metrics = [] for lang in lXte.keys(): macrof1, microf1, macrok, microk = l_eval[lang] metrics.append([macrof1, microf1, macrok, microk]) print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') - # results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'], - # (config['max_label_space'], classifier.best_components), - # config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time, - # lang, macrof1, microf1, macrok, microk, '') + results.add_row(method='Voting', + learner='svm', + optimp=op.optimc, + sif=op.sif, + zscore='todo', + l2='todo', + wescaler='todo', + pca=op.max_labels_S, + id=_id, + dataset=dataset_id, + time='todo', + lang=lang, + macrof1=macrof1, + microf1=microf1, + macrok=macrok, + microk=microk, + notes='') print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0)) diff --git a/src/main_multimodal_cls.py b/src/main_multimodal_cls.py index a5224ab..e5859b7 100644 --- a/src/main_multimodal_cls.py +++ b/src/main_multimodal_cls.py @@ -11,7 +11,7 @@ from sklearn.svm import SVC parser = OptionParser(usage="usage: %prog datapath [options]") parser.add_option("-o", "--output", dest="output", - help="Result file", type=str, default='./results/results.csv') + help="Result file", type=str, default='multiModal_log.csv') parser.add_option("-P", "--posteriors", dest="posteriors", action='store_true', help="Add posterior probabilities to the document embedding representation", default=False) @@ -22,8 +22,8 @@ parser.add_option("-S", "--supervised", dest="supervised", action='store_true', parser.add_option("-U", "--pretrained", 
dest="pretrained", action='store_true', help="Add pretrained MUSE embeddings to the document embedding representation", default=False) -parser.add_option("--nol2", dest="nol2", action='store_true', - help="Deactivates l2 normalization as a post-processing for the document embedding views", default=False) +parser.add_option("--l2", dest="l2", action='store_true', + help="Activates l2 normalization as a post-processing for the document embedding views", default=False) parser.add_option("--allprob", dest="allprob", action='store_true', help="All views are generated as posterior probabilities. This affects the supervised and pretrained " @@ -48,11 +48,28 @@ parser.add_option("-p", "--pca", dest="max_labels_S", type=int, help="If smaller than number of target classes, PCA will be applied to supervised matrix. ", default=300) +parser.add_option("-r", "--remove-pc", dest="sif", action='store_true', + help="Remove common component when computing dot product of word embedding matrices", default=False) + +parser.add_option("-z", "--zscore", dest="zscore", action='store_true', + help="Z-score normalize matrices (WCE and MUSE)", default=False) + +parser.add_option("-a", "--agg", dest="agg", action='store_true', + help="Set aggregation function of the common Z-space to average (Default: concatenation)", default=False) + def get_learner(calibrate=False, kernel='linear'): return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, gamma='auto') +def get_params(): + if not op.optimc: + return None + c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1] + kernel = 'rbf' + return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}] + + ####################################################################################################################### @@ -64,17 +81,23 @@ if __name__ == '__main__': assert exists(dataset), 'Unable to find file '+str(dataset) assert not (op.set_c != 1. 
and op.optimc), 'Parameter C cannot be defined along with optim_c option' assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed' - l2=(op.nol2==False) + l2=op.l2 dataset_file = os.path.basename(dataset) - results = PolylingualClassificationResults(op.output) + results = PolylingualClassificationResults('../log/' + op.output) allprob='Prob' if op.allprob else '' result_id = f'{dataset_file}_ProbPost={op.posteriors}_{allprob}WCE={op.supervised}(PCA={op.max_labels_S})_{allprob}' \ - f'MUSE={op.pretrained}_weight={op.feat_weight}_l2={l2}{"_optimC" if op.optimc else ""}' + f'MUSE={op.pretrained}_weight={op.feat_weight}_l2={l2}_zscore={op.zscore}{"_optimC" if op.optimc else ""}' print(f'{result_id}') + # set zscore range - is slice(0,0) mean will be equal to 0 and std to 1, thus normalization will have no effect + standardize_range = slice(0,0) + if op.zscore: + standardize_range = None + data = MultilingualDataset.load(dataset) + # data.set_view(languages=['fr', 'it']) data.show_dimensions() lXtr, lytr = data.training() lXte, lyte = data.test() @@ -86,23 +109,23 @@ if __name__ == '__main__': feat_weighting = FeatureWeight(op.feat_weight, agg='mean') # # document embedding modules - doc_embedder = DocEmbedderList(aggregation='concat') + doc_embedder = DocEmbedderList(aggregation='mean' if op.agg else 'concat') if op.posteriors: doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True, kernel='linear'), l2=l2)) if op.supervised: - wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting) + wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, sif=op.sif) if op.allprob: wce = FeatureSet2Posteriors(wce, l2=l2) doc_embedder.append(wce) if op.pretrained: - muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting) + muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif) if op.allprob: muse = FeatureSet2Posteriors(muse, l2=l2) doc_embedder.append(muse) # metaclassifier meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}] - meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'), meta_parameters=meta_parameters) + meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'), meta_parameters=get_params(), standardize_range=standardize_range) # ensembling the modules classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta) @@ -113,13 +136,40 @@ if __name__ == '__main__': print('\n# Evaluating ...') l_eval = evaluate_method(classifier, lXte, lyte) + # renaming arguments to be printed on log + _id = '' + _id_conf = [op.posteriors, op.supervised, op.pretrained] + _id_name = ['+P', '+W', '+M'] + for i, conf in enumerate(_id_conf): + if conf: + _id += _id_name[i] + _id = _id.lstrip('+') + _id = _id if not op.agg else _id + '_mean' + _id = _id if not op.allprob else _id + '_allprob' + + _dataset_path = dataset.split('/')[-1].split('_') + dataset_id = _dataset_path[0] + _dataset_path[-1] + metrics = [] for lang in lXte.keys(): macrof1, microf1, macrok, microk = l_eval[lang] metrics.append([macrof1, microf1, macrok, microk]) print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') - # results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'], - # (config['max_label_space'], classifier.best_components), - # config['dim_reduction_unsupervised'], op.optimc, dataset.split('/')[-1], 
classifier.time,
-        #                 lang, macrof1, microf1, macrok, microk, '')
+        results.add_row(method='MultiModal',
+                        learner='svm',
+                        optimp=op.optimc,
+                        sif=op.sif,
+                        zscore=op.zscore,
+                        l2=op.l2,
+                        wescaler=op.feat_weight,
+                        pca=op.max_labels_S,
+                        id=_id,
+                        dataset=dataset_id,
+                        time='todo',
+                        lang=lang,
+                        macrof1=macrof1,
+                        microf1=microf1,
+                        macrok=macrok,
+                        microk=microk,
+                        notes='')
     print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
diff --git a/src/models/lstm_class.py b/src/models/lstm_class.py
index 6d2e242..727f3ce 100755
--- a/src/models/lstm_class.py
+++ b/src/models/lstm_class.py
@@ -27,7 +27,7 @@ class RNNMultilingualClassifier(nn.Module):
         self.n_layers = 1
         self.n_directions = 1
-        self.dropout = nn.Dropout(0.2)
+        self.dropout = nn.Dropout(0.6)
         lstm_out = 256
         ff1 = 512
@@ -45,7 +45,7 @@ class RNNMultilingualClassifier(nn.Module):
                 llearnable_embeddings[l] = learnable_embeddings
         self.embedding_length = embedding_length
-        # self.rnn = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2))
+        # self.lstm = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2))
         self.rnn = nn.GRU(self.embedding_length, hidden_size)
         self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out)
         self.lpretrained_embeddings.update(lpretrained_embeddings)
diff --git a/src/new_mbert.py b/src/new_mbert.py
new file mode 100644
index 0000000..62b6dde
--- /dev/null
+++ b/src/new_mbert.py
@@ -0,0 +1,355 @@
+"""
+Test with a smaller subset of languages.
+
+1. Load the documents (RCV1/2)
+2. Tokenize the texts via BertTokenizer (dumps of these should already be available)
+3. Build better Dataloader/Dataset classes. NB: the language of each document has to be tracked only for
+the testing phase, but since that bookkeeping is needed anyway, it is cleaner to keep it in the training
+phase as well.
+4. ...
+5. Check whether the pooled hidden state of the last layer is much worse than its averaged version
+(in BertForSequenceClassification it is the pooled output that is passed through the output linear
+layer in order to get the prediction scores).
+6. At the same time, build an end-to-end model so that it can be fine-tuned. The previous step
+would be useful when deploying mBERT as a view generator. (Refactor the gFun code with view generators?)
+7. ...
+8.
Profits + +""" +from dataset_builder import MultilingualDataset +from transformers import BertTokenizer, BertForSequenceClassification, AdamW +from torch.utils.data import Dataset, DataLoader +import numpy as np +import torch +from util.common import clip_gradient, predict +from time import time +from util.csv_log import CSVLog +from util.evaluation import evaluate +from util.early_stop import EarlyStopping +from torch.optim.lr_scheduler import StepLR +from sklearn.model_selection import train_test_split +import argparse + + +def get_model(n_out): + print('# Initializing model ...') + model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=n_out) + return model + +def set_method_name(): + return 'mBERT' + +def init_optimizer(model, lr): + # return AdamW(model.parameters(), lr=lr, weight_decay=opt.weight_decay) + no_decay = ['bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in model.named_parameters() + if not any(nd in n for nd in no_decay)], + 'weight_decay': opt.weight_decay}, + {'params': [p for n, p in model.named_parameters() + if any(nd in n for nd in no_decay)], + 'weight_decay': opt.weight_decay} + ] + optimizer = AdamW(optimizer_grouped_parameters, lr=lr) + return optimizer + +def init_logfile(method_name, opt): + logfile = CSVLog(opt.log_file, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse']) + logfile.set_default('dataset', opt.dataset) + logfile.set_default('run', opt.seed) + logfile.set_default('method', method_name) + assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} and run {opt.seed} already calculated' + return logfile + +def get_lr(optimizer): + for param_group in optimizer.param_groups: + return param_group['lr'] + +def get_dataset_name(datapath): + possible_splits = [str(i) for i in range(10)] + splitted = datapath.split('_') + id_split = splitted[-1].split('.')[0][-1] + if id_split in possible_splits: + dataset_name = splitted[0].split('/')[-1] + return f'{dataset_name}_run{id_split}' + +def load_datasets(datapath): + data = MultilingualDataset.load(datapath) + data.set_view(languages=['nl']) # Testing with just two langs + data.show_dimensions() + + l_devel_raw, l_devel_target = data.training(target_as_csr=False) + l_test_raw, l_test_target = data.test(target_as_csr=False) + + return l_devel_raw, l_devel_target, l_test_raw, l_test_target + + +def do_tokenization(l_dataset, max_len=512): + print('# Starting Tokenization ...') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') + langs = l_dataset.keys() + l_tokenized = {} + for lang in langs: + l_tokenized[lang] = tokenizer(l_dataset[lang], + truncation=True, + max_length=max_len, + add_special_tokens=True, + padding='max_length') + return l_tokenized + + +class TrainingDataset(Dataset): + """ + data: dict of lang specific tokenized data + labels: dict of lang specific targets + """ + def __init__(self, data, labels): + self.langs = data.keys() + self.lang_ids = {lang:identifier for identifier, lang in enumerate(self.langs)} + + for i, lang in enumerate(self.langs): + # print(lang) + _data = data[lang]['input_ids'] + _data = np.array(_data) + _labels = labels[lang] + _lang_value = np.full(len(_data), self.lang_ids[lang]) + + if i == 0: + self.data = _data + self.labels = _labels + self.lang_index = _lang_value + else: + self.data = np.vstack((self.data, _data)) + self.labels = np.vstack((self.labels, _labels)) + self.lang_index = 
np.concatenate((self.lang_index, _lang_value)) + + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + x = self.data[idx] + y = self.labels[idx] + lang = self.lang_index[idx] + + return x, torch.tensor(y, dtype=torch.float), lang + # return x, y, lang + + def get_lang_ids(self): + return self.lang_ids + +def freeze_encoder(model): + for param in model.base_model.parameters(): + param.requires_grad = False + return model + +def check_param_grad_status(model): + print('#'*50) + print('Model paramater status') + for name, child in model.named_children(): + trainable = False + for param in child.parameters(): + if param.requires_grad: + trainable = True + if not trainable: + print(f'{name} is frozen') + else: + print(f'{name} is not frozen') + print('#'*50) + +def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile): + _dataset_path = opt.dataset.split('/')[-1].split('_') + # dataset_id = 'RCV1/2_run0_newBert' + dataset_id = _dataset_path[0] + _dataset_path[-1] + + loss_history = [] + model.train() + + for idx, (batch, target, lang_idx) in enumerate(train_dataloader): + # optim.zero_grad() + out = model(batch.cuda()) + loss = criterion(out[0], target.cuda()) + loss.backward() + clip_gradient(model) + optim.step() + loss_history.append(loss.item()) + + if idx % opt.log_interval == 0: + interval_loss = np.mean(loss_history[-opt.log_interval:]) + print( + f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}') + + mean_loss = np.mean(interval_loss) + logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit) + return mean_loss + +def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, measure_prefix): + print('# Validating model ...') + loss_history = [] + model.eval() + langs = lang_ids.keys() + id_2_lang = {v:k for k,v in lang_ids.items()} + predictions = {l: [] for l in langs} + yte_stacked = {l: [] for l in langs} + + for batch, target, lang_idx in test_dataloader: + out = model(batch.cuda()) + logits = out[0] + loss = criterion(logits, target.cuda()).item() + prediction = predict(logits) + loss_history.append(loss) + + # Assigning prediction to dict in predictionS and yte_stacked according to lang_idx + for i, pred in enumerate(prediction): + lang_pred = id_2_lang[lang_idx.numpy()[i]] + predictions[lang_pred].append(pred) + yte_stacked[lang_pred].append(target[i].detach().cpu().numpy()) + + ly = {l: np.vstack(yte_stacked[l]) for l in langs} + ly_ = {l: np.vstack(predictions[l]) for l in langs} + l_eval = evaluate(ly, ly_) + metrics = [] + for lang in langs: + macrof1, microf1, macrok, microk = l_eval[lang] + metrics.append([macrof1, microf1, macrok, microk]) + if measure_prefix == 'te': + print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') + Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0) + print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]') + + mean_loss = np.mean(loss_history) + logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit) + logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit) + logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit) + logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit) + logfile.add_row(epoch=epoch, 
measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit) + + return Mf1 + +def get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop, max_val, seed): + l_split_va = l_tokenized_tr + l_split_val_target = {l: [] for l in l_tokenized_tr.keys()} + l_split_tr = l_tokenized_tr + l_split_tr_target = {l: [] for l in l_tokenized_tr.keys()} + + for lang in l_tokenized_tr.keys(): + val_size = int(min(len(l_tokenized_tr[lang]['input_ids']) * val_prop, max_val)) + + l_split_tr[lang]['input_ids'], l_split_va[lang]['input_ids'], l_split_tr_target[lang], l_split_val_target[lang] = \ + train_test_split(l_tokenized_tr[lang]['input_ids'], l_devel_target[lang], test_size=val_size, random_state=seed, shuffle=True) + + return l_split_tr, l_split_tr_target, l_split_va, l_split_val_target + +def main(): + print('Running main ...') + + DATAPATH = opt.dataset + method_name = set_method_name() + logfile = init_logfile(method_name, opt) + + l_devel_raw, l_devel_target, l_test_raw, l_test_target = load_datasets(DATAPATH) + l_tokenized_tr = do_tokenization(l_devel_raw, max_len=512) + + l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop=0.2, max_val=2000, seed=opt.seed) + + l_tokenized_te = do_tokenization(l_test_raw, max_len=512) + + tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target) + va_dataset = TrainingDataset(l_split_va, l_split_val_target) + te_dataset = TrainingDataset(l_tokenized_te, l_test_target) + + tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True) + va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=False) + te_dataloader = DataLoader(te_dataset, batch_size=2, shuffle=False) + + # Initializing model + model = get_model(73) + model = model.cuda() + criterion = torch.nn.BCEWithLogitsLoss().cuda() + optim = init_optimizer(model, lr=opt.lr) + # lr_scheduler = StepLR(optim, step_size=25, gamma=0.5) + early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience, + checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_dataset_name(opt.dataset)}') + # lr_scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optim, num_warmup_steps= , num_training_steps=) + # print(model) + + # Freezing encoder + # model = freeze_encoder(model) + check_param_grad_status(model) + + # Training loop + tinit = time() + lang_ids = va_dataset.lang_ids + for epoch in range(1, opt.nepochs+1): + print('# Start Training ...') + train(model, tr_dataloader, epoch, criterion, optim, 'TestingBert', tinit, logfile) + # lr_scheduler.step(epoch=None) # reduces the learning rate + + # validation + macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va') + early_stop(macrof1, epoch) + if opt.test_each>0: + if (opt.plotmode and (epoch==1 or epoch%opt.test_each==0)) or (not opt.plotmode and epoch%opt.test_each==0 and epoch0: + print(f'running last {opt.val_epochs} training epochs on the validation set') + for val_epoch in range(1, opt.val_epochs + 1): + train(model, va_dataloader, epoch+val_epoch, criterion, optim, 'TestingBert', tinit, logfile) + + # final test + print('Training complete: testing') + test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te') + + exit('Code Executed!') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Neural text classification with Word-Class Embeddings - mBert model') + + parser.add_argument('--dataset', type=str, 
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Neural text classification with Word-Class Embeddings - mBert model')
+
+    parser.add_argument('--dataset', type=str,
+                        default='/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle',
+                        metavar='datasetpath', help='path to the pickled dataset')
+    parser.add_argument('--nepochs', type=int, default=200, metavar='int',
+                        help='number of epochs (default: 200)')
+    parser.add_argument('--lr', type=float, default=2e-5, metavar='float',
+                        help='learning rate (default: 2e-5)')
+    parser.add_argument('--weight_decay', type=float, default=0, metavar='float',
+                        help='weight decay (default: 0)')
+    parser.add_argument('--patience', type=int, default=10, metavar='int',
+                        help='patience for early-stop (default: 10)')
+    parser.add_argument('--log-interval', type=int, default=20, metavar='int',
+                        help='how many batches to wait before printing training status')
+    parser.add_argument('--log-file', type=str, default='../log/log_mBert.csv', metavar='str',
+                        help='path to the log csv file')
+    parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)')
+    parser.add_argument('--force', action='store_true', default=False,
+                        help='do not check if this experiment has already been run')
+    parser.add_argument('--checkpoint-dir', type=str, default='../checkpoint', metavar='str',
+                        help='path to the directory containing checkpoints')
+    parser.add_argument('--plotmode', action='store_true', default=False,
+                        help='in plot mode, executes a long run in order to generate enough data to produce trend '
+                             'plots (test-each should be >0); this mode does not perform a final evaluation on the '
+                             'test set')
+    parser.add_argument('--test-each', type=int, default=0, metavar='int',
+                        help='how many epochs to wait before invoking test (default: 0, only at the end)')
+    parser.add_argument('--val-epochs', type=int, default=1, metavar='int',
+                        help='number of training epochs to perform on the validation set once training is over (default 1)')
+    opt = parser.parse_args()
+
+    # Testing different parameters ...
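+    # NOTE: the two assignments below override the weight-decay and patience values parsed from the command line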
+ opt.weight_decay = 0.01 + opt.patience = 5 + + main() + # TODO: refactor .cuda() -> .to(device) in order to check if the process is faster on CPU given the bigger batch size \ No newline at end of file diff --git a/src/results/results_manager.py b/src/results/results_manager.py index fdee8d8..1fe57dd 100644 --- a/src/results/results_manager.py +++ b/src/results/results_manager.py @@ -1,7 +1,11 @@ import pandas as pd import numpy as np -df = pd.read_csv("/home/andreapdr/funneling_pdr/src/results/results.csv", delimiter='\t') -pivot = pd.pivot_table(df, values=['time', 'macrof1', 'microf1', 'macrok', 'microk'], index=['method', 'embed'], aggfunc=[np.mean, np.std]) -print(pivot) -print('Finished ...') \ No newline at end of file +# df = pd.read_csv("/home/andreapdr/funneling_pdr/src/results/final_results.csv", delimiter='\t') +df = pd.read_csv("10run_rcv_final_results.csv", delimiter='\t') +pivot = pd.pivot_table(df, values=['macrof1', 'microf1', 'macrok', 'microk'], index=['method', 'id', 'optimp', 'zscore', 'l2', 'wescaler', 'pca', 'sif'], aggfunc=[np.mean, np.std]) +with pd.option_context('display.max_rows', None): + print(pivot.round(3)) +print('Finished ...') + + diff --git a/src/run_mbert_rcv.sh b/src/run_mbert_rcv.sh new file mode 100644 index 0000000..810ce46 --- /dev/null +++ b/src/run_mbert_rcv.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run +logfile=../log/log_Mbert_rcv.csv + +runs='0 1 2 3 4 5 6 7 8 9' +for run in $runs +do + dataset=$dataset_path$run.pickle + python new_mbert.py --dataset $dataset --log-file $logfile --test-each 20 +done diff --git a/src/util/SIF_embed.py b/src/util/SIF_embed.py index 05e2ff7..cfe096e 100644 --- a/src/util/SIF_embed.py +++ b/src/util/SIF_embed.py @@ -17,7 +17,7 @@ def get_weighted_average(We, x, w): def compute_pc(X,npc=1): """ - Compute the principal components. DO NOT MAKE THE DATA ZERO MEAN! + Compute the principal components. 
:param X: X[i,:] is a data point :param npc: number of principal components to remove :return: component_[i,:] is the i-th pc diff --git a/src/util/common.py b/src/util/common.py index 3bf1386..8a9a880 100755 --- a/src/util/common.py +++ b/src/util/common.py @@ -1,4 +1,5 @@ import warnings +import time from sklearn.svm import SVC from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.model_selection import train_test_split @@ -143,6 +144,15 @@ class Index: embedding_parts.append(F) + make_dumps = False + if make_dumps: + print(f'Dumping Embedding Matrices ...') + import pickle + with open(f'../dumps/dump_{self.lang}_rcv.pkl', 'wb') as outfile: + pickle.dump((self.lang, embedding_parts, self.word2index), outfile) + with open(f'../dumps/corpus_{self.lang}_rcv.pkl', 'wb') as outfile2: + pickle.dump((self.lang, self.devel_raw, self.devel_target), outfile2) + self.embedding_matrix = torch.cat(embedding_parts, dim=1) print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]') @@ -155,6 +165,7 @@ class MultilingualIndex: def __init__(self): #, add_language_trace=False): self.l_index = {} self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) + # self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True, max_features=25000) # self.add_language_trace=add_language_trace def index(self, l_devel_raw, l_devel_target, l_test_raw, l_pretrained_vocabulary): @@ -189,30 +200,42 @@ class MultilingualIndex: # pretrained_embeddings = torch.cat([pretrained_embeddings, lang_trace], dim=1) - def posterior_probabilities(self, max_training_docs_by_lang=5000): + def posterior_probabilities(self, max_training_docs_by_lang=5000, store_posteriors=False, stored_post=False): # choose a maximum of "max_training_docs_by_lang" for training the calibrated SVMs + timeit = time.time() lXtr = {l:Xtr for l,Xtr in self.get_lXtr().items()} lYtr = {l:Ytr for l,Ytr in self.l_train_target().items()} - for l in self.langs: - n_elements = lXtr[l].shape[0] - if n_elements > max_training_docs_by_lang: - choice = np.random.permutation(n_elements)[:max_training_docs_by_lang] - lXtr[l] = lXtr[l][choice] - lYtr[l] = lYtr[l][choice] + if not stored_post: + for l in self.langs: + n_elements = lXtr[l].shape[0] + if n_elements > max_training_docs_by_lang: + choice = np.random.permutation(n_elements)[:max_training_docs_by_lang] + lXtr[l] = lXtr[l][choice] + lYtr[l] = lYtr[l][choice] - # train the posterior probabilities embedder - print('[posteriors] training a calibrated SVM') - learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto') - prob_embedder = PosteriorProbabilitiesEmbedder(learner, l2=False) - prob_embedder.fit(lXtr, lYtr) + # train the posterior probabilities embedder + print('[posteriors] training a calibrated SVM') + learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto') + prob_embedder = PosteriorProbabilitiesEmbedder(learner, l2=False) + prob_embedder.fit(lXtr, lYtr) - # transforms the training, validation, and test sets into posterior probabilities - print('[posteriors] generating posterior probabilities') - lPtr = prob_embedder.transform(self.get_lXtr()) - lPva = prob_embedder.transform(self.get_lXva()) - lPte = prob_embedder.transform(self.get_lXte()) - - print('[posteriors] done') + # transforms the training, validation, and test sets into posterior probabilities + print('[posteriors] generating posterior probabilities') + lPtr = 
prob_embedder.transform(self.get_lXtr()) + lPva = prob_embedder.transform(self.get_lXva()) + lPte = prob_embedder.transform(self.get_lXte()) + # NB: Check splits indices ! + if store_posteriors: + import pickle + with open('../dumps/posteriors_fulljrc.pkl', 'wb') as outfile: + pickle.dump([lPtr, lPva, lPte], outfile) + print(f'Successfully dumped posteriors!') + else: + import pickle + with open('../dumps/posteriors_fulljrc.pkl', 'rb') as infile: + lPtr, lPva, lPte = pickle.load(infile) + print(f'Successfully loaded stored posteriors!') + print(f'[posteriors] done in {time.time() - timeit}') return lPtr, lPva, lPte def get_lXtr(self): diff --git a/src/util/early_stop.py b/src/util/early_stop.py index 93544be..d534554 100755 --- a/src/util/early_stop.py +++ b/src/util/early_stop.py @@ -6,7 +6,7 @@ from util.file import create_if_not_exist class EarlyStopping: - def __init__(self, model, patience=20, verbose=True, checkpoint='./checkpoint.pt'): + def __init__(self, model, optimizer, patience=20, verbose=True, checkpoint='./checkpoint.pt'): # set patience to 0 or -1 to avoid stopping, but still keeping track of the best value and model parameters self.patience_limit = patience self.patience = patience @@ -16,9 +16,10 @@ class EarlyStopping: self.stop_time = None self.checkpoint = checkpoint self.model = model + self.optimizer = optimizer self.STOP = False - def __call__(self, watch_score, epoch): + def __call__(self, watch_score, epoch): #model if self.STOP: return #done @@ -29,6 +30,9 @@ class EarlyStopping: if self.checkpoint: self.print(f'[early-stop] improved, saving model in {self.checkpoint}') torch.save(self.model, self.checkpoint) + # with open(self.checkpoint) + # torch.save({'state_dict': self.model.state_dict(), + # 'optimizer_state_dict': self.optimizer.state_dict()}, self.checkpoint) else: self.print(f'[early-stop] improved') self.patience = self.patience_limit @@ -46,6 +50,7 @@ class EarlyStopping: self.patience=self.patience_limit def restore_checkpoint(self): + print(f'restoring best model from epoch {self.best_epoch}...') return torch.load(self.checkpoint) def print(self, msg): diff --git a/src/util/results.py b/src/util/results.py index a889e6d..ec66fc1 100644 --- a/src/util/results.py +++ b/src/util/results.py @@ -5,8 +5,23 @@ import numpy as np class PolylingualClassificationResults: def __init__(self, file, autoflush=True, verbose=False): self.file = file - self.columns = ['method', 'learner', 'embed', 'embed_type', 'pca_s', 'pca_u', 'optimp', 'dataset', 'time', - 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes'] + self.columns = ['method', + 'learner', + 'optimp', + 'sif', + 'zscore', + 'l2', + 'wescaler', + 'pca', + 'id', + 'dataset', + 'time', + 'lang', + 'macrof1', + 'microf1', + 'macrok', + 'microk', + 'notes'] self.autoflush = autoflush self.verbose = verbose if os.path.exists(file): @@ -21,8 +36,8 @@ class PolylingualClassificationResults: def already_calculated(self, id): return (self.df['id'] == id).any() - def add_row(self, method, learner, embed, embed_type, pca_s, pca_u, optimp, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''): - s = pd.Series([method, learner, embed, embed_type, pca_s, pca_u, optimp, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns) + def add_row(self, method, learner, optimp, sif, zscore, l2, wescaler, pca, id, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''): + s = pd.Series([method, learner, optimp,sif, zscore, l2, wescaler, pca, id, 
dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns) self.df = self.df.append(s, ignore_index=True) if self.autoflush: self.flush() self.tell(s.to_string()) diff --git a/src/transformers/StandardizeTransformer.py b/src/util_transformers/StandardizeTransformer.py similarity index 100% rename from src/transformers/StandardizeTransformer.py rename to src/util_transformers/StandardizeTransformer.py diff --git a/src/transformers/__init__.py b/src/util_transformers/__init__.py similarity index 100% rename from src/transformers/__init__.py rename to src/util_transformers/__init__.py diff --git a/src/transformers/clesa.py b/src/util_transformers/clesa.py similarity index 100% rename from src/transformers/clesa.py rename to src/util_transformers/clesa.py diff --git a/src/transformers/dci.py b/src/util_transformers/dci.py similarity index 100% rename from src/transformers/dci.py rename to src/util_transformers/dci.py diff --git a/src/transformers/riboc.py b/src/util_transformers/riboc.py similarity index 100% rename from src/transformers/riboc.py rename to src/util_transformers/riboc.py