baseline multilingual BERT

This commit is contained in:
andrea 2020-07-27 11:56:09 +02:00
parent 22b7ea7e66
commit d1fdad5f6e
37 changed files with 1212 additions and 1112 deletions

View File

@@ -1,10 +1,7 @@
import os
import pickle
from torchtext.vocab import Vectors
import torch
from abc import ABC, abstractmethod
from embeddings.supervised import get_supervised_embeddings
from util.decompositions import *
from util.SIF_embed import *
@@ -35,122 +32,10 @@ class PretrainedEmbeddings(ABC):
        return source_idx, target_idx
class WordEmbeddings:
def __init__(self, lang, we, worddim):
self.lang = lang
self.we = we
self.worddim = worddim
self.dimword = {v:k for k,v in self.worddim.items()}
@classmethod
def load(cls, basedir, lang, word_preprocessor=None, dopickle=True):
filename = 'wiki.multi.{}.vec'.format(lang)
we_path = os.path.join(basedir, filename)
if dopickle and os.path.exists(we_path + '.pkl'):
print('loading pkl in {}'.format(we_path + '.pkl'))
(worddim, we) = pickle.load(open(we_path + '.pkl', 'rb'))
else:
word_registry = set()
lines = open(we_path).readlines()
nwords, dims = [int(x) for x in lines[0].split()]
print('reading we of {} dimensions'.format(dims))
we = np.zeros((nwords, dims), dtype=float)
worddim = {}
index = 0
for i, line in enumerate(lines[1:]):
if (i + 1) % 100 == 0:
print('\r{}/{}'.format(i + 1, len(lines)), end='')
word, *vals = line.split()
wordp = word_preprocessor(word) if word_preprocessor is not None else word
if wordp:
wordp = wordp[0]
if wordp in word_registry:
print('warning: word <{}> generates a duplicate <{}> after preprocessing'.format(word,wordp))
elif len(vals) == dims:
worddim[wordp] = index
we[index, :] = np.array(vals).astype(float)
index += 1
# else:
# print('warning: word <{}> generates an empty string after preprocessing'.format(word))
we = we[:index]
print('load {} words'.format(index))
if dopickle:
print('saving...')
pickle.dump((worddim, we), open(we_path + '.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
return WordEmbeddings(lang, we, worddim)
def vocabulary(self):
return set(self.worddim.keys())
def __getitem__(self, key):
return self.we[self.worddim[key]]
def dim(self):
return self.we.shape[1]
def __contains__(self, key):
return key in self.worddim
def most_similar(self, word_vect, k):
if word_vect.ndim == 1:
word_vect = word_vect.reshape(1,-1)
assert word_vect.shape[1] == self.dim(), 'inconsistent dimensions'
sim = np.dot(word_vect,self.we.T)
order = np.argsort(-1*sim, axis=1)[:,:k]
similar_words = [[self.dimword[order[vi,ki]] for ki in range(k)] for vi in range(word_vect.shape[0])]
sim_scores = sim[:,order]
return similar_words, sim_scores
def get_vectors(self, wordlist):
indexes = np.array([self.worddim[w] for w in wordlist])
return self.we[indexes]
def restrict(self, vocabulary):
# vocabulary is a set of terms to be kept
active_vocabulary = sorted([w for w in vocabulary if w in self.worddim])
lost = len(vocabulary)-len(active_vocabulary)
if lost > 0: # some terms are missing, so they will be replaced by UNK
print('warning: missing {} terms for lang {}'.format(lost, self.lang))
self.we = self.get_vectors(active_vocabulary)
assert self.we.shape[0] == len(active_vocabulary)
self.dimword={i:w for i,w in enumerate(active_vocabulary)}
self.worddim={w:i for i,w in enumerate(active_vocabulary)}
return self
@classmethod
def load_poly(cls, basedir, langs, lang_vocabularies=None, word_preprocessor=None):
if lang_vocabularies is None:
return cls.merge([cls.load(basedir,lang, word_preprocessor) for lang in langs])
else:
# assert all([l in lang_vocabularies for l in langs]), 'missing vocabulary for some languages'
return cls.merge([cls.load(basedir, lang, word_preprocessor).restrict(lang_vocabularies[lang]) for lang in langs])
@classmethod
def merge(cls, we_list):
assert all([isinstance(we, WordEmbeddings) for we in we_list]), \
'instances of {} expected'.format(WordEmbeddings.__name__)
polywe = []
worddim = {}
offset = 0
for we in we_list:
polywe.append(we.we)
worddim.update({'{}::{}'.format(we.lang, w):d+offset for w,d in we.worddim.items()})
offset = len(worddim)
polywe = np.vstack(polywe)
return WordEmbeddings(lang='poly', we=polywe, worddim=worddim)
class FastTextWikiNews(Vectors):

    url_base = 'Cant auto-download MUSE embeddings'
    path = '/storage/andrea/FUNNELING/embeddings/wiki.multi.{}.vec'
    path = '../embeddings/wiki.multi.{}.vec'
    _name = '/wiki.multi.{}.vec'

    def __init__(self, cache, language="en", **kwargs):
@@ -159,42 +44,13 @@ class FastTextWikiNews(Vectors):
        super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)
class EmbeddingsAligned(Vectors):
def __init__(self, type, path, lang, voc):
# todo - rewrite as relative path
self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec'
self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT'
self.path = path + self.name.format(lang)
assert os.path.exists(path), f'pre-trained vectors not found in {path}'
super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path)
self.vectors = self.extract(voc)
def vocabulary(self):
return set(self.stoi.keys())
def extract(self, words):
source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi)
extraction = torch.zeros((len(words), self.dim))
extraction[source_idx] = self.vectors[target_idx]
return extraction
def reduce(self, dim):
pca = PCA(n_components=dim)
self.vectors = pca.fit_transform(self.vectors)
return
class FastTextMUSE(PretrainedEmbeddings):

    def __init__(self, path, lang, limit=None):
        super().__init__()
        print(f'Loading fastText pretrained vectors for language {lang} from {path}')
        assert os.path.exists(path), print(f'pre-trained vectors not found in {path}')
        self.embed = FastTextWikiNews(path, lang, max_vectors=limit)

    def vocabulary(self):
        return set(self.embed.stoi.keys())

@@ -204,114 +60,8 @@ class FastTextMUSE(PretrainedEmbeddings):
    def extract(self, words):
        source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
        extraction = torch.zeros((len(words), self.dim()))
        # extraction = torch.empty(len(words), self.dim()).normal_(0, 1)
        extraction[source_idx] = self.embed.vectors[target_idx]
        return extraction
class StorageEmbeddings:
def __init__(self, path):
self.path = path
self.lang_U = dict()
self.lang_S = dict()
def _add_embeddings_unsupervised(self, type, docs, vocs, max_label_space=300):
for lang in docs.keys():
print(f'# [unsupervised-matrix {type}] for {lang}')
voc = np.asarray(list(zip(*sorted(vocs[lang].items(), key=lambda x: x[1])))[0])
self.lang_U[lang] = EmbeddingsAligned(type, self.path, lang, voc).vectors
print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n')
nC = self.lang_U[lang].shape[1]
if max_label_space == 0:
print(f'Computing optimal number of PCA components along matrices U')
optimal_n = get_optimal_dim(self.lang_U, 'U')
self.lang_U = run_pca(optimal_n, self.lang_U)
elif max_label_space < nC:
print(f'Applying PCA to unsupervised matrix U')
self.lang_U = run_pca(max_label_space, self.lang_U)
return
def _add_embeddings_supervised(self, docs, labels, reduction, max_label_space, voc):
only_well_represented_C = False # TODO testing
if only_well_represented_C:
labels = labels.copy()
min_prevalence = 0
print(f'# REDUCING LABELS TO min_prevalence = {min_prevalence} in order to compute WCE Matrix ...')
langs = list(docs.keys())
well_repr_cats = np.logical_and.reduce([labels[lang].sum(axis=0)>min_prevalence for lang in langs])
for lang in langs:
labels[lang] = labels[lang][:, well_repr_cats]
print(f'Target number reduced to: {labels[lang].shape[1]}\n')
for lang in docs.keys(): # compute supervised matrices S - then apply PCA
print(f'# [supervised-matrix] for {lang}')
self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang],
reduction, max_label_space, voc[lang], lang)
nC = self.lang_S[lang].shape[1]
print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')
if max_label_space == 0: # looking for best n_components analyzing explained_variance_ratio
print(f'Computing optimal number of PCA components along matrices S')
optimal_n = get_optimal_dim(self.lang_S, 'S')
print(f'Applying PCA(n_components={optimal_n})')
self.lang_S = run_pca(optimal_n, self.lang_S)
elif max_label_space == -1: # applying pca to the verticals stacked matrix of WCE embeddings
print(f'Computing PCA on vertical stacked WCE embeddings')
languages = self.lang_S.keys()
_temp_stack = np.vstack([self.lang_S[lang] for lang in languages]) # stacking WCE vertically
stacked_pca = PCA(n_components=_temp_stack.shape[1])
stacked_pca.fit(_temp_stack)
best_n = None
_r = stacked_pca.explained_variance_ratio_
_r = np.cumsum(_r)
plt.plot(_r, label='Stacked Supervised')
for i in range(len(_r) - 1, 1, -1):
delta = _r[i] - _r[i - 1]
if delta > 0:
best_n = i
break
plt.show()
stacked_pca = PCA(n_components=best_n)
stacked_pca.fit(_temp_stack)
print(f'Applying PCA(n_components={best_n})')
for lang in languages:
self.lang_S[lang] = stacked_pca.transform(self.lang_S[lang])
elif max_label_space <= nC: # less or equal in order to reduce it to the same initial dimension
print(f'Computing PCA on Supervised Matrix PCA(n_components:{max_label_space})')
self.lang_S = run_pca(max_label_space, self.lang_S)
return
def SIF_embeddings(self):
print('todo') # TODO
def _concatenate_embeddings(self, docs):
_r = dict()
for lang in self.lang_U.keys():
_r[lang] = np.hstack((docs[lang].dot(self.lang_U[lang]), docs[lang].dot(self.lang_S[lang])))
return _r
def fit(self, config, docs, vocs, labels):
if config['unsupervised']:
self._add_embeddings_unsupervised(config['we_type'], docs, vocs, config['dim_reduction_unsupervised'])
if config['supervised']:
self._add_embeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs)
return self
def predict(self, config, docs):
if config['supervised'] and config['unsupervised']:
return self._concatenate_embeddings(docs)
# todo testing applying pca to hstack muse + wce
# _reduced = self._concatenate_embeddings(docs)
# return run_pca(300, _reduced)
elif config['supervised']:
_r = dict()
for lang in docs.keys():
_r[lang] = docs[lang].dot(self.lang_S[lang])
else:
_r = dict()
for lang in docs.keys():
_r[lang] = docs[lang].dot(self.lang_U[lang])
return _r
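The unsupervised matrix U built above is consumed as a simple linear map: each document becomes the tf-idf-weighted sum of its aligned MUSE word vectors, as in docs[lang].dot(self.lang_U[lang]) inside _concatenate_embeddings and predict. A minimal sketch of that single step follows; it is not part of this commit and the names are illustrative.

# Illustrative sketch (not part of the commit): documents as the tf-idf-weighted
# sum of row-aligned word embeddings, i.e. a doc-term / embedding-matrix product.
import numpy as np
from scipy.sparse import csr_matrix

def doc_embeddings(X_tfidf: csr_matrix, U: np.ndarray) -> np.ndarray:
    """X_tfidf: (n_docs, |V|) tf-idf doc-term matrix; U: (|V|, dim) word embeddings aligned to the vocabulary."""
    # each output row is the weighted sum of the document's word vectors
    return np.asarray(X_tfidf.dot(U))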

View File

@@ -1,103 +1,102 @@
from abc import ABC, abstractmethod
import torch, torchtext
# import gensim
# import os
import numpy as np


# class KeyedVectors:
#
#     def __init__(self, word2index, weights):
#         assert len(word2index)==weights.shape[0], 'wrong number of dimensions'
#         index2word = {i:w for w,i in word2index.items()}
#         assert len([i for i in range(len(index2word)) if i not in index2word])==0, 'gaps in indexing not allowed'
#         self.word2index = word2index
#         self.index2word = index2word
#         self.weights = weights
#
#     def extract(self, words):
#         dim = self.weights.shape[1]
#         v_size = len(words)
#
#         source_idx, target_idx = [], []
#         for i,word in enumerate(words):
#             if word not in self.word2index: continue
#             j = self.word2index[word]
#             source_idx.append(i)
#             target_idx.append(j)
#
#         extraction = np.zeros((v_size, dim))
#         extraction[np.asarray(source_idx)] = self.weights[np.asarray(target_idx)]
#
#         return extraction


# class PretrainedEmbeddings(ABC):
#
#     def __init__(self):
#         super().__init__()
#
#     @abstractmethod
#     def vocabulary(self): pass
#
#     @abstractmethod
#     def dim(self): pass
#
#     @classmethod
#     def reindex(cls, words, word2index):
#         source_idx, target_idx = [], []
#         for i, word in enumerate(words):
#             if word not in word2index: continue
#             j = word2index[word]
#             source_idx.append(i)
#             target_idx.append(j)
#         source_idx = np.asarray(source_idx)
#         target_idx = np.asarray(target_idx)
#         return source_idx, target_idx


# class GloVe(PretrainedEmbeddings):
#
#     def __init__(self, setname='840B', path='./vectors_cache', max_vectors=None):
#         super().__init__()
#         print(f'Loading GloVe pretrained vectors from torchtext')
#         self.embed = torchtext.vocab.GloVe(setname, cache=path, max_vectors=max_vectors)
#         print('Done')
#
#     def vocabulary(self):
#         return set(self.embed.stoi.keys())
#
#     def dim(self):
#         return self.embed.dim
#
#     def extract(self, words):
#         source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
#         extraction = torch.zeros((len(words), self.dim()))
#         extraction[source_idx] = self.embed.vectors[target_idx]
#         return extraction


# class Word2Vec(PretrainedEmbeddings):
#
#     def __init__(self, path, limit=None):
#         super().__init__()
#         print(f'Loading word2vec pretrained vectors from {path}')
#         assert os.path.exists(path), print(f'pre-trained keyed vectors not found in {path}')
#         self.embed = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True, limit=limit)
#         self.word2index={w:i for i,w in enumerate(self.embed.index2word)}
#         print('Done')
#
#     def vocabulary(self):
#         return set(self.word2index.keys())
#
#     def dim(self):
#         return self.embed.vector_size
#
#     def extract(self, words):
#         source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.word2index)
#         extraction = np.zeros((len(words), self.dim()))
#         extraction[source_idx] = self.embed.vectors[target_idx]
#         extraction = torch.from_numpy(extraction).float()
#         return extraction

View File

@@ -1,7 +1,5 @@
from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square
import numpy as np
# from sklearn.decomposition import PCA
# from sklearn.manifold import TSNE


def zscores(x, axis=0): #scipy.stats.zscores does not avoid division by 0, which can indeed occur
@@ -69,31 +67,6 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, la
    return F
# if nC >= max_label_space:
# if reduction == 'PCA':
# if max_label_space == 0:
# pca = PCA(n_components=Y.shape[1])
# pca = pca.fit(F)
# return pca.explained_variance_ratio_
#
# print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
# f'Applying PCA(n_components={max_label_space})')
# pca = PCA(n_components=max_label_space)
# F = pca.fit_transform(F)
# elif reduction == 'TSNE':
# print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
# f'Applying t-SNE(n_components={max_label_space})')
# tsne = TSNE(n_components=max_label_space)
# F = tsne.fit_transform(F)
# elif reduction == 'tSVD':
# print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
# f'Applying truncatedSVD(n_components={max_label_space})')
# tSVD = TruncatedSVD(n_components=max_label_space)
# F = tSVD.fit_transform(F)
#
# return F
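get_supervised_embeddings builds the word-class (WCE) matrix that later scripts select with -S. A minimal sketch of the underlying idea follows, assuming a simple conditional-probability association score rather than the repository's TSR functions (information gain, chi-square); the function and variable names are illustrative.

# Illustrative sketch (not the repo's get_supervised_embeddings): word-class embeddings
# as a |V| x |C| matrix of word/category association scores computed on the training set.
import numpy as np
from scipy.sparse import csr_matrix

def word_class_embeddings(X: csr_matrix, Y: np.ndarray) -> np.ndarray:
    """X: (n_docs, |V|) doc-term matrix; Y: (n_docs, |C|) binary label matrix."""
    Xb = (X > 0).astype(float)                          # word presence per document
    counts = np.asarray(Xb.T.dot(Y))                    # (|V|, |C|) word-category co-occurrences
    df = np.asarray(Xb.sum(axis=0)).ravel()             # document frequency per word
    cond = counts / np.maximum(df, 1)[:, None]          # ~ P(c | w)
    prior = Y.sum(axis=0, keepdims=True) / Y.shape[0]   # P(c)
    return cond - prior                                 # each row is a word's class-association vector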

View File

@@ -0,0 +1,11 @@
#!/usr/bin/env bash
dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
logfile=../log/log10run_dl_jrc.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
done

View File

@@ -0,0 +1,11 @@
#!/usr/bin/env bash
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
logfile=../log/log10run_dl_rcv.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
done

View File

@@ -0,0 +1,12 @@
dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
logfile=./results/10run_jrc_final_results.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
python main_multimodal_cls.py $dataset -o $logfile -P -z -c --l2
python main_multimodal_cls.py $dataset -o $logfile -S -z -c --l2
python main_multimodal_cls.py $dataset -o $logfile -U -z -c --l2
done

View File

@@ -0,0 +1,16 @@
dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
logfile=./results/funnelling_10run_jrc_CIKM.csv
runs='6 7 8 9' #0 1 2 3 4 5
for run in $runs
do
dataset=$dataset_path$run.pickle
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 Pr(views) concatenated (done up to run5)
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 views concatenated
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -a -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -a -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -S -c -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -c -P -U -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -c -P -U -S -r -z --l2
done
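In the runs above, -P, -U, and -S select the posterior, MUSE, and WCE views respectively (cf. the POSTERIORS / WCE supervised / MUSE unsupervised comments in the later scripts), and with --allprob each view is mapped to posterior probabilities before being combined. A minimal sketch of the concatenation step only, not main_multimodal_cls.py's actual code; names are illustrative.

# Illustrative sketch (assumption, not the script's code): concatenate the three
# per-language views into one joint representation for the meta-classifier.
import numpy as np

def concatenate_views(lP: dict, lU: dict, lS: dict) -> dict:
    """lP/lU/lS: {lang: (n_docs_lang, dim_view) ndarray} holding the three views per language."""
    return {lang: np.hstack([lP[lang], lU[lang], lS[lang]]) for lang in lP}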

View File

@@ -0,0 +1,15 @@
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
logfile=./results/10run_rcv_final_results.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
python main_multimodal_cls.py $dataset -o $logfile -P -z -c --l2
python main_multimodal_cls.py $dataset -o $logfile -S -z -c --l2
python main_multimodal_cls.py $dataset -o $logfile -U -z -c --l2
done

View File

@@ -0,0 +1,16 @@
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
logfile=./results/funnelling_10run_rcv_CIKM_allprob_concatenated.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 Pr(views) concatenated
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 views concatenated
#python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -a -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -a -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -S -c -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -c -P -U -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -c -P -U -S -r -z --l2
done

View File

@@ -0,0 +1,34 @@
#!/usr/bin/env bash
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
logfile=./results/final_combinations_jrc.csv
#A.2: ensembling feature sets (combinations of posteriors, wce, muse):
# - exploring different ways of putting different feature sets together: concatenation, FeatureSetToPosteriors, averaging, voting, etc...
# (none of them seems to improve over standard funnelling [the improved version after A.1] with posterior probabilities...)
# aggregation=concatenation
#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2
#
##FeatureSetToPosteriors (aggregation mean)
python main_multimodal_cls.py $dataset -o $logfile -P -U -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -P -S -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -U -S -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -a -z --l2 --allprob
##FeatureSetToPosteriors
#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2 --allprob
#MajorityVoting
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -r
#python main_majorityvoting_cls.py $dataset -o $logfile -P -S -r
#python main_majorityvoting_cls.py $dataset -o $logfile -U -S -r
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -S -r

View File

@@ -0,0 +1,31 @@
#!/usr/bin/env bash
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
logfile=./results/final_combinations_rcv.csv
#A.2: ensembling feature sets (combinations of posteriors, wce, muse):
# - exploring different ways of putting different feature sets together: concatenation, FeatureSetToPosteriors, averaging, voting, etc...
# (none of them seems to improve over standard funnelling [the improved version after A.1] with posterior probabilities...)
# aggregation=concatenation
#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2
#
##FeatureSetToPosteriors (aggregation mean)
python main_multimodal_cls.py $dataset -o $logfile -P -U -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -P -S -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -U -S -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -a -z --l2 --allprob
##FeatureSetToPosteriors
#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2 --allprob
#MajorityVoting
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -r
#python main_majorityvoting_cls.py $dataset -o $logfile -P -S -r
#python main_majorityvoting_cls.py $dataset -o $logfile -U -S -r
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -S -r

View File

@@ -0,0 +1,31 @@
#!/usr/bin/env bash
logfile=../log/log_pre_jrc.csv
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
python main_deep_learning.py $dataset --log-file $logfile --pretrained --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 256 --tunable --plotmode --test-each 20

View File

@@ -0,0 +1,30 @@
#!/usr/bin/env bash
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
python main_deep_learning.py $dataset --pretrained --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 256 --tunable --plotmode --test-each 20

View File

@@ -0,0 +1,16 @@
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle
seeds='5' #2 3 4 5 6 7 8 9 10'
for seed in $seeds
do
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce.csv --supervised --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_trainable.csv --supervised --tunable --seed $seed
python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_post_wce_muse_static.csv --posteriors --supervised --pretrained --seed $seed --force
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_muse.csv --pretrained --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_muse_trainable.csv --pretrained --tunable --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_muse.csv --supervised --pretrained --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_muse_trainable40000.csv --supervised --pretrained --tunable --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_post_wce_muse_trainable.csv --posteriors --supervised --pretrained --tunable --seed $seed --force
done

View File

@@ -0,0 +1,20 @@
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
seeds='1 ' #2 3 4 5' # 6 7 8 9 10'
for seed in $seeds
do
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce.csv --supervised --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_trainable.csv --supervised --tunable --seed $seed
python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_static_plotmode.csv --posteriors --supervised --pretrained --seed $seed --plotmode --test-each 200
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_muse.csv --pretrained --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_muse_trainable.csv --pretrained --tunable --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_muse.csv --supervised --pretrained --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_muse_trainable.csv --supervised --pretrained --tunable --seed $seed
# python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_static.csv --posteriors --supervised --pretrained --seed $seed
# python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_trainable_plotmode.csv --posteriors --supervised --pretrained --tunable --seed $seed --plotmode --test-each 200
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_trainable.csv --posteriors --supervised --pretrained --tunable --seed $seed
done

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env bash
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
######################################## POSTERIORS
# Posteriors
python main_multimodal_cls.py $dataset -P # + zscore
python main_multimodal_cls.py $dataset -P -z # +l2norm
python main_multimodal_cls.py $dataset -P -z --l2 # +feature weight
######################################### WCE
#WCE supervised
python main_multimodal_cls.py $dataset -S # + zscore
python main_multimodal_cls.py $dataset -S -z # +l2norm
python main_multimodal_cls.py $dataset -S -z --l2 # +feature weight
python main_multimodal_cls.py $dataset -S -z -r --l2 # + SIF - PCA
python main_multimodal_cls.py $dataset -S -z -p 250 --l2 # +feature weight + pca
python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 # + SIF
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight ig # -feature weight
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -S -z -p 250 --l2 --feat-weight ig # + pca
python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -p 250 --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 --feat-weight pmi
################################# MUSE
# MUSE unsupervised
python main_multimodal_cls.py $dataset -U # + zscore
python main_multimodal_cls.py $dataset -U -z # +l2norm
python main_multimodal_cls.py $dataset -U -z --l2 # +feature weight
python main_multimodal_cls.py $dataset -U -z -r --l2 # + SIF - PCA
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight ig # -feature weight + pca
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight pmi

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env bash
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
######################################## POSTERIORS
# Posteriors
python main_multimodal_cls.py $dataset -P # + zscore
python main_multimodal_cls.py $dataset -P -z # +l2norm
python main_multimodal_cls.py $dataset -P -z --l2 # +feature weight
######################################### WCE
#WCE supervised
python main_multimodal_cls.py $dataset -S # + zscore
python main_multimodal_cls.py $dataset -S -z # +l2norm
python main_multimodal_cls.py $dataset -S -z --l2 # +feature weight
python main_multimodal_cls.py $dataset -S -z -r --l2 # + SIF - PCA
python main_multimodal_cls.py $dataset -S -z -p 50 --l2 # +feature weight + pca
python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 # + SIF
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight ig # -feature weight
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -S -z -p 50 --l2 --feat-weight ig # + pca
python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -p 50 --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 --feat-weight pmi
################################# MUSE
# MUSE unsupervised
python main_multimodal_cls.py $dataset -U # + zscore
python main_multimodal_cls.py $dataset -U -z # +l2norm
python main_multimodal_cls.py $dataset -U -z --l2 # +feature weight
python main_multimodal_cls.py $dataset -U -z -r --l2 # + SIF - PCA
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight ig # -feature weight + pca
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight pmi

View File

@@ -0,0 +1,6 @@
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
seeds='1 2 3 4 5 6 7 8 9 10'
for seed in $seeds
do
python main_deep_learning.py $dataset --log-file ../log/time_GRU.csv --supervised --nepochs 50 --seed $seed
done

View File

@@ -1,15 +1,15 @@
import numpy as np
import time
# from embeddings.embeddings import WordEmbeddings, StorageEmbeddings
from scipy.sparse import issparse
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import KFold
from joblib import Parallel, delayed
# from sklearn.feature_extraction.text import TfidfVectorizer
# from util_transformers.StandardizeTransformer import StandardizeTransformer
# from sklearn.decomposition import PCA
# from models.cnn_class_bu import CNN_pdr


def _sort_if_sparse(X):
@@ -40,154 +40,154 @@ class TrivialRejector:
    def best_params(self): return {}
# class FunnellingPolylingualClassifier:
#     """
#     This classifier projects each document d into a language-independent feature space where each dimension fi is the
#     decision score phi_l(d,ci) of an auxiliary classifier phi_l trained on category ci for documents in language l;
#     then trains one single classifier for all documents in this space, irrespective of their originary language
#     """
#     def __init__(self, first_tier_learner, meta_learner, first_tier_parameters=None, meta_parameters=None, folded_projections=1,
#                  calmode='cal', n_jobs=-1):
#         """
#         :param first_tier_learner: the learner used in the first-tier level
#         :param meta_learner: the learner used in the second-tier level
#         :param first_tier_parameters: parameters for the learner in the doc_projector
#         :param meta_parameters: parameters for the learner in the z-space
#         :param folded_projections: if 1 then the model trains the auxiliar classifiers with all training data and
#         :param calmode: 'cal' to calibrate the base classifiers, 'nocal' to use the decision_function instead, or
#         :param n_jobs: number of parallel threads
#         'sigmoid' to use the sigmoid of the decision_function
#         projects the data before training the final classifier; if greater than one, the training set is split in as
#         many folds as indicated, and the projected space is composed by concatenating each fold prediction based on
#         models trained on the remaining folds. This should increase the generality of the space to unseen data.
#         """
#         assert folded_projections>0, "positive number of folds expected"
#         assert calmode in ['cal','nocal','sigmoid'], 'unknown calmode'
#         assert calmode!='cal' or first_tier_learner.probability, 'calmode=cal requires the learner to have probability=True'
#
#         self.fist_tier_learner = first_tier_learner
#         self.meta_learner = meta_learner
#         self.fist_tier_parameters=first_tier_parameters
#         self.meta_parameters = meta_parameters
#         self.doc_projector = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs)
#         self.doc_projector_bu = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs)
#         self.folded_projections = folded_projections
#         self.n_jobs = n_jobs
#         self.calmode = calmode
#
#     def _projection(self, doc_projector, lX):
#         """
#         Decides the projection function to be applied; predict_proba if the base classifiers are calibrated or
#         decision_function if otherwise
#         :param doc_projector: the document projector (a NaivePolylingualClassifier)
#         :param lX: {lang:matrix} to train
#         :return: the projection, applied with predict_proba or decision_function
#         """
#         if self.calmode=='cal':
#             return doc_projector.predict_proba(lX)
#         else:
#             l_decision_scores = doc_projector.decision_function(lX)
#             if self.calmode=='sigmoid':
#                 def sigmoid(x): return 1 / (1 + np.exp(-x))
#                 for lang in l_decision_scores.keys():
#                     l_decision_scores[lang] = sigmoid(l_decision_scores[lang])
#             return l_decision_scores
#
#     def _get_zspace(self, lXtr, lYtr, lXproj=None, lYproj=None):
#         """
#         Produces the vector space of posterior probabilities (in case the first-tier is calibrated) or of
#         decision scores (if otherwise). This space is here named zspace.
#         :param lXtr: {lang:matrix} to train
#         :param lYtr: {lang:labels} to train
#         :param lXproj: {lang:matrix} to project (if None, then projects the lXtr)
#         :param lYproj: {lang:labels} to stack in the same order (if None, then lYtr will be stacked)
#         :return: the projection of lXproj documents into the Z-space defined by the confidence scores of language-specific
#         models trained on lXtr, and the lYproj labels stacked consistently
#         """
#         repair_empty_folds = True
#         if lXproj is None and lYproj is None:
#             lXproj, lYproj = lXtr, lYtr
#             repair_empty_folds = False
#
#         print('fitting the projectors... {}'.format(lXtr.keys()))
#         self.doc_projector.fit(lXtr, lYtr)
#
#         print('projecting the documents')
#         langs = list(lXtr.keys())
#         lZ = self._projection(self.doc_projector, lXproj)
#
#         # if repair_empty_folds: #empty folds are replaced by the posterior probabilities generated by the non-folded version
#         empty_categories = self.doc_projector.empty_categories
#         lZ_bu = self._projection(self.doc_projector_bu, lXproj)
#
#         for lang in langs:
#             repair = empty_categories[lang]
#             lZ[lang][:,repair] = lZ_bu[lang][:,repair]
#
#         Z = np.vstack([lZ[lang] for lang in langs])  # Z is the language independent space
#         zy = np.vstack([lYproj[lang] for lang in langs])
#         return Z, zy
#
#     def _get_zspace_folds(self, lX, ly):
#         self.doc_projector_bu.fit(lX, ly)
#
#         print('split of {} folds'.format(self.folded_projections))
#         skf = KFold(n_splits=self.folded_projections, shuffle=True)
#
#         Z, zy = [], []
#         lfold = {lang: list(skf.split(lX[lang], ly[lang])) for lang in lX.keys()}
#         for fold in range(self.folded_projections):
#             print('fitting the projectors ({}/{})...'.format(fold + 1, self.folded_projections))
#             lfoldXtr, lfoldYtr = {}, {}
#             lfoldXte, lfoldYte = {}, {}
#             for lang in lX.keys():
#                 train, test = lfold[lang][fold]
#                 lfoldXtr[lang] = lX[lang][train]
#                 lfoldYtr[lang] = ly[lang][train]
#                 lfoldXte[lang] = lX[lang][test]
#                 lfoldYte[lang] = ly[lang][test]
#             Zfold, zYfold = self._get_zspace(lfoldXtr, lfoldYtr, lfoldXte, lfoldYte)
#             Z.append(Zfold)
#             zy.append(zYfold)
#         # compose the Z-space as the union of all folded predictions
#         Z = np.vstack(Z)
#         zy = np.vstack(zy)
#         # refit the document projector with all examples to have a more reliable projector for test data
#         self.doc_projector = self.doc_projector_bu
#         return Z, zy
#
#     def fit(self, lX, ly, lZ=None, lzy=None):
#         tinit = time.time()
#         Z, zy = self._get_zspace(lX, ly) if self.folded_projections == 1 else self._get_zspace_folds(lX, ly)
#
#         #experimental: adds the posterior probabilities (computed outside) to the meta-classifier
#         if lZ is not None and lzy is not None:
#             zlangs = list(lZ.keys())
#             Z = np.vstack((Z, *[lZ[l] for l in zlangs]))
#             zy = np.vstack((zy, *[lzy[l] for l in zlangs]))
#
#         print('fitting the Z-space of shape={}'.format(Z.shape))
#         self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters, n_jobs=self.n_jobs)
#         self.model.fit(Z, zy)
#         self.time = time.time() - tinit
#
#         return self
#
#     def predict(self, lX, lZ=None):
#         """
#         :param lX: a dictionary {language_label: X csr-matrix}
#         :param lZ: a dictionary {language_label: Z matrix}; if specified, concats this representation
#         :return: a dictionary of predictions
#         """
#         lZ_ = self._projection(self.doc_projector, lX)
#         if lZ is not None:
#             lZ_ = {**lZ_, **lZ}
#         return _joblib_transform_multiling(self.model.predict, lZ_, n_jobs=self.n_jobs)
#
#     def best_params(self):
#         params = self.doc_projector.best_params()
#         params['meta'] = self.model.best_params()
#         return params
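The commented-out class above is the funnelling architecture described in its docstring: first-tier, per-language classifiers map documents onto per-category posterior probabilities (or decision scores), and a single meta-classifier is trained on the stacked, language-independent Z-space. A minimal sketch of that flow follows, assuming plain scikit-learn one-vs-rest SVMs instead of the repository's NaivePolylingualClassifier/MonolingualClassifier wrappers; names are illustrative.

# Illustrative sketch (assumption, not this repo's implementation) of the funnelling flow.
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

def funnelling_fit(lX: dict, lY: dict):
    """lX: {lang: feature matrix}; lY: {lang: (n_docs, n_cats) binary label matrix}."""
    # first tier: one probabilistic classifier per language, producing per-category posteriors
    first_tier = {lang: OneVsRestClassifier(SVC(probability=True)).fit(lX[lang], lY[lang]) for lang in lX}
    # Z-space: stack every language's posteriors into one language-independent training set
    Z = np.vstack([first_tier[lang].predict_proba(lX[lang]) for lang in lX])
    zy = np.vstack([lY[lang] for lang in lX])
    meta = OneVsRestClassifier(SVC(probability=True)).fit(Z, zy)  # second tier (meta-classifier)
    return first_tier, meta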
class NaivePolylingualClassifier:
@@ -322,411 +322,4 @@ class MonolingualClassifier:
        return self.model.predict(X)

    def best_params(self):
        return self.best_params_
class FunnellingMultimodal(FunnellingPolylingualClassifier):
def __init__(self,
we_path,
config,
first_tier_learner,
meta_learner,
first_tier_parameters=None,
meta_parameters=None,
folded_projections=1,
calmode='cal',
n_jobs=-1):
super().__init__(first_tier_learner,
meta_learner,
first_tier_parameters,
meta_parameters,
folded_projections,
calmode,
n_jobs)
self.pca_independent_space = PCA(n_components=50)
self.we_path = we_path
self.config = config
self.lang_word2idx = dict()
self.languages = []
self.lang_tfidf = {}
self.embedding_space = None
self.model = None
self.time = None
self.best_components = 'not set' # if auto optimize pca, it will store the optimal number of components
def vectorize(self, lX, prediction=False):
langs = list(lX.keys())
print(f'# tfidf-vectorizing docs')
if prediction:
for lang in langs:
assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
tfidf_vectorizer = self.lang_tfidf[lang]
lX[lang] = tfidf_vectorizer.transform(lX[lang])
return self
for lang in langs:
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
self.languages.append(lang)
tfidf_vectorizer.fit(lX[lang])
lX[lang] = tfidf_vectorizer.transform(lX[lang])
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
self.lang_tfidf[lang] = tfidf_vectorizer
return self
def _get_zspace(self, lXtr, lYtr):
print('\nfitting the projectors... {}'.format(list(lXtr.keys())))
self.doc_projector.fit(lXtr, lYtr)
print('\nprojecting the documents')
lZ = self._projection(self.doc_projector, lXtr)
return lZ, lYtr
def fit(self, lX, ly):
tinit = time.time()
print('Vectorizing documents...')
self.vectorize(lX)
for lang in self.languages:
print(f'{lang}->{lX[lang].shape}')
Z, zy = self._get_zspace(lX, ly)
if self.config['supervised'] or self.config['unsupervised']:
self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly)
_embedding_space = self.embedding_space.transform(self.config, lX)
if self.config['max_label_space'] == 0:
_cum_dimension = _embedding_space[list(_embedding_space.keys())[0]].shape[1]
if _cum_dimension - 300 > 0:
_temp = _cum_dimension - 300
else:
_temp = _cum_dimension
self.best_components = _temp
# h_stacking posterior probabilities with (U) and/or (S) matrices
for lang in self.languages:
Z[lang] = np.hstack((Z[lang], _embedding_space[lang]))
# stacking Z space vertically
_vertical_Z = np.vstack([Z[lang] for lang in self.languages])
_vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
self.standardizer = StandardizeTransformer()
_vertical_Z = self.standardizer.fit_transform(_vertical_Z)
# todo testing ...
# if self.config['post_pca']:
# print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
# self.pca_independent_space.fit(_vertical_Z)
# _vertical_Z = self.pca_independent_space.transform(_vertical_Z)
print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
n_jobs=self.n_jobs)
self.model.fit(_vertical_Z, _vertical_Zy)
self.time = time.time() - tinit
print(f'\nTotal training time elapsed: {round((self.time/60), 2)} min')
def predict(self, lX, ly):
print('Vectorizing documents')
self.vectorize(lX, prediction=True)
lZ = self._projection(self.doc_projector, lX)
if self.config['supervised'] or self.config['unsupervised']:
_embedding_space = self.embedding_space.transform(self.config, lX)
for lang in lX.keys():
lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang]))
for lang in lZ.keys():
print(lZ[lang].shape)
# todo testing
lZ[lang] = self.standardizer.transform(lZ[lang])
# if self.config['post_pca']:
# print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
# lZ[lang] = self.pca_independent_space.transform(lZ[lang])
return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
class PolylingualEmbeddingsClassifier:
"""
This classifier creates document embeddings as a tfidf-weighted average of the polylingual embeddings from the article:
@article{conneau2017word,
title={Word translation without parallel data},
author={Conneau, Alexis and Lample, Guillaume and Ranzato, Marc'Aurelio and Denoyer, Ludovic and J{\'e}gou, Herv{\'e}},
journal={arXiv preprint arXiv:1710.04087},
year={2017}
}
url: https://github.com/facebookresearch/MUSE
"""
def __init__(self, wordembeddings_path, config, learner, c_parameters=None, n_jobs=-1):
"""
:param wordembeddings_path: the path to the directory containing the polylingual embeddings
:param config: dictionary of configuration flags (e.g., 'supervised', 'unsupervised') selecting which embedding matrices are used
:param learner: the learner
:param c_parameters: parameters for learner
:param n_jobs: the number of concurrent threads
"""
self.wordembeddings_path = wordembeddings_path
self.config = config
self.learner = learner
self.c_parameters=c_parameters
self.n_jobs = n_jobs
self.lang_tfidf = {}
self.model = None
self.languages = []
self.lang_word2idx = dict()
self.embedding_space = None
def fit_vectorizers(self, lX):
for lang in lX.keys():
if lang not in self.lang_tfidf:
tfidf = TfidfVectorizer(sublinear_tf=True, use_idf=True) # text is already processed
docs = lX[lang]
tfidf.fit(docs)
self.lang_tfidf[lang] = tfidf
def vectorize(self, lX, prediction=False):
langs = list(lX.keys())
print(f'# tfidf-vectorizing docs')
if prediction:
for lang in langs:
assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
tfidf_vectorizer = self.lang_tfidf[lang]
lX[lang] = tfidf_vectorizer.transform(lX[lang])
return self
for lang in langs:
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
self.languages.append(lang)
tfidf_vectorizer.fit(lX[lang])
lX[lang] = tfidf_vectorizer.transform(lX[lang])
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
self.lang_tfidf[lang] = tfidf_vectorizer
return self
def embed(self, docs, lang):
assert lang in self.lang_tfidf, 'unknown language'
tfidf_vectorizer = self.lang_tfidf[lang]
V = tfidf_vectorizer.vocabulary_
Xweights = tfidf_vectorizer.transform(docs)
print('loading word embeddings for ' + lang)
we = WordEmbeddings.load(self.wordembeddings_path, lang)
nD = len(docs)
doc_vecs = np.zeros((nD, we.dim()))
for i, doc in enumerate(docs):
print('\r\tcomplete {:.3f}%'.format(100 * (i + 1) / nD), end='')
# averaging with tfidf (summing each word only once, since the frequency is already controlled)
for w in set(doc.split()):
if w in we and w in V:
doc_vecs[i] += (we[w] * Xweights[i, V[w]])
# works much worse with idf; works much worse with document l2-normalization
print()
return doc_vecs
def fit(self, lX, ly):
"""
:param lX: a dictionary {language_label: [list of preprocessed documents]}
:param ly: a dictionary {language_label: ndarray of shape (ndocs, ncats) binary labels}
:return: self
"""
tinit = time.time()
langs = list(lX.keys())
WEtr, Ytr = [], []
# self.fit_vectorizers(lX) # if already fit, does nothing
self.vectorize(lX)
# config = {'unsupervised' : False, 'supervised': True}
self.embedding_space = StorageEmbeddings(self.wordembeddings_path).fit(self.config, lX, self.lang_word2idx, ly)
WEtr = self.embedding_space.transform(self.config, lX)
# for lang in langs:
# WEtr.append(self.embed(lX[lang], lang)) # todo embed with other matrices
# Ytr.append(ly[lang])
WEtr = np.vstack([WEtr[lang] for lang in langs])
Ytr = np.vstack([ly[lang] for lang in langs])
self.embed_time = time.time() - tinit
print('fitting the WE-space of shape={}'.format(WEtr.shape))
self.model = MonolingualClassifier(base_learner=self.learner, parameters=self.c_parameters, n_jobs=self.n_jobs)
self.model.fit(WEtr, Ytr)
self.time = time.time() - tinit
return self
def predict(self, lX, lY):
"""
:param lX: a dictionary {language_label: [list of preprocessed documents]}
"""
assert self.model is not None, 'predict called before fit'
self.vectorize(lX, prediction=True)
langs = list(lX.keys())
lWEte = self.embedding_space.transform(self.config, lX)
# lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory
return _joblib_transform_multiling(self.model.predict, lWEte, n_jobs=self.n_jobs)
def predict_proba(self, lX):
"""
:param lX: a dictionary {language_label: [list of preprocessed documents]}
"""
assert self.model is not None, 'predict_proba called before fit'
langs = list(lX.keys())
lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory
return _joblib_transform_multiling(self.model.predict_proba, lWEte, n_jobs=self.n_jobs)
def best_params(self):
return self.model.best_params()
class MonolingualNetSvm:
"""
Testing: funnelling with a NN handling word-embedding compositionality. An ensemble of n SVMs (n equal to the
number of training languages) is first fit on the data, generating the documents' projections into the Z-space.
Next, these projections are fed to a single NN together with their respective document embeddings. The documents
are projected into the embedding space while preserving its dimensionality (output dim is 300). These document
embeddings are horizontally concatenated with the respective posterior projections and passed through a fully
connected layer with sigmoid activation and output dim equal to the number of target classes (a hedged sketch of
this output head follows the class).
# TODO ATM testing with only 1 language
"""
def __init__(self, lX, ly, first_tier_learner, first_tier_parameters, n_jobs):
self.lX = lX
self.ly = ly
# SVM Attributes
self.doc_projector = NaivePolylingualClassifier(first_tier_learner, first_tier_parameters,
n_jobs=n_jobs)
self.calmode = 'cal'
self.languages = []
self.lang_word2idx = dict()
self.lang_tfidf = {}
self.base_learner = 'TODO'
self.parameters = 'TODO'
# NN Attributes
self.NN = 'TODO'
def load_preprocessed(self):
"""
In order to speed up the process, documents are first tokenized in the "main" script. Here, the tokenized docs,
word_index, and targets are loaded.
:return: dict[lang] = (word_index, tokenized_docs, targets)
"""
import pickle
with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'rb') as f:
return pickle.load(f)
def _build_embedding_matrix(self, lang, word_index):
"""
build embedding matrix by filtering out OOV embeddings
:param lang:
:param word_index:
:return: filtered embedding matrix
"""
from embeddings.embeddings import EmbeddingsAligned
type = 'MUSE'
path = '/home/andreapdr/CLESA/'
MUSE = EmbeddingsAligned(type, path, lang, word_index.keys())
return MUSE
def get_data_and_embed(self, data_dict):
from keras.preprocessing.sequence import pad_sequences
langs = data_dict.keys()
lang_embedding_matrix = dict()
nn_lXtr = dict()
nn_lytr = dict()
for lang in langs:
lang_embedding_matrix[lang] = self._build_embedding_matrix(lang, data_dict[lang][0])
nn_lXtr[lang] = pad_sequences(data_dict[lang][1], 100, padding='post')
nn_lytr[lang] = [data_dict[lang][2]]
return nn_lXtr, nn_lytr, lang_embedding_matrix
def svm_vectorize(self, lX, prediction=False):
langs = list(lX.keys())
print(f'# tfidf-vectorizing docs')
if prediction:
for lang in langs:
assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
tfidf_vectorizer = self.lang_tfidf[lang]
lX[lang] = tfidf_vectorizer.transform(lX[lang])
return self
for lang in langs:
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
self.languages.append(lang)
tfidf_vectorizer.fit(lX[lang])
lX[lang] = tfidf_vectorizer.transform(lX[lang])
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
self.lang_tfidf[lang] = tfidf_vectorizer
return lX
def _get_zspace(self, lXtr, lYtr):
print('\nfitting the projectors... {}'.format(list(lXtr.keys())))
self.doc_projector.fit(lXtr, lYtr)
print('\nprojecting the documents')
lZ = self._projection(self.doc_projector, lXtr)
return lZ, lYtr
def _projection(self, doc_projector, lX):
"""
Decides the projection function to be applied: predict_proba if the base classifiers are calibrated,
decision_function otherwise
:param doc_projector: the document projector (a NaivePolylingualClassifier)
:param lX: {lang:matrix} to train
:return: the projection, applied with predict_proba or decision_function
"""
if self.calmode=='cal':
return doc_projector.predict_proba(lX)
else:
l_decision_scores = doc_projector.decision_function(lX)
if self.calmode=='sigmoid':
def sigmoid(x): return 1 / (1 + np.exp(-x))
for lang in l_decision_scores.keys():
l_decision_scores[lang] = sigmoid(l_decision_scores[lang])
return l_decision_scores
def fit(self):
"""
# 1. Fit SVM to generate posterior probabilities:
# 1.1 Gather documents and vectorize them as in other SVM classifiers
# 2. Fit NN
# 2.1 Gather documents and build NN dataset by indexing wrt embedding matrix
# 2.2 Fit NN first-layer to generate compositional doc embedding
# 2.3 H-stack doc-embed and posterior P
# 2.4 Feed stacked vector to output layer (sigmoid act): output Nc
# 2.5 Train it...
"""
# load pre-processed data
data_dict = self.load_preprocessed()
# build embedding matrices and neural network document training set
nn_lXtr, nn_lytr, lang_embedding_matrix = self.get_data_and_embed(data_dict)
# TF-IDF vectorizing documents for the SVM classifier
svm_lX = self.svm_vectorize(self.lX)
# just testing on a smaller subset of data
test_svm_lX = dict()
test_svm_ly = dict()
test_svm_lX['it'] = svm_lX['it'][:10, :]
test_svm_ly['it'] = self.ly['it'][:10, :]
test_nn_data = nn_lXtr['it'][:10]
# projecting documents into the Z-space via the SVM ensemble
svm_Z, _ = self._get_zspace(test_svm_lX, test_svm_ly)
# initializing net and forward pass
net = CNN_pdr(73, 1, 300, len(lang_embedding_matrix['it'].vectors), 300, lang_embedding_matrix['it'].vectors)
out = net.forward(test_nn_data, svm_Z['it'])
print('TODO')
def net(self):
pass
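# Hedged sketch (illustrative only, not wired into MonolingualNetSvm above): the output head described in
# the class docstring, i.e. a document embedding horizontally concatenated with the SVM posterior
# probabilities and passed through a fully connected layer with sigmoid activation. The default dimensions
# below (300-dim embeddings, 73 classes) are assumptions taken from the surrounding test code.
import torch
import torch.nn as nn

class _PosteriorFusionHead(nn.Module):
    def __init__(self, embed_dim=300, n_posteriors=73, n_classes=73):
        super().__init__()
        self.fc = nn.Linear(embed_dim + n_posteriors, n_classes)

    def forward(self, doc_embed, posteriors):
        z = torch.cat([doc_embed, posteriors], dim=1)   # horizontal concatenation
        return torch.sigmoid(self.fc(z))                # per-class probabilities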

View File

@ -10,7 +10,7 @@ import time
from sklearn.decomposition import PCA from sklearn.decomposition import PCA
from joblib import Parallel, delayed from joblib import Parallel, delayed
from scipy.sparse import issparse, vstack, hstack from scipy.sparse import issparse, vstack, hstack
from transformers.StandardizeTransformer import StandardizeTransformer from util_transformers.StandardizeTransformer import StandardizeTransformer
from util.SIF_embed import remove_pc from util.SIF_embed import remove_pc
from sklearn.preprocessing import normalize from sklearn.preprocessing import normalize
from sklearn.svm import SVC from sklearn.svm import SVC
@ -127,22 +127,26 @@ class PosteriorProbabilitiesEmbedder:
print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} the documents') print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} the documents')
return self.doc_projector.predict_proba(lX) return self.doc_projector.predict_proba(lX)
def _get_output_dim(self):
return len(self.doc_projector.model['da'].model.classes_)
class MuseEmbedder: class MuseEmbedder:
def __init__(self, path, lV=None, l2=True, n_jobs=-1, featureweight=FeatureWeight()): def __init__(self, path, lV=None, l2=True, n_jobs=-1, featureweight=FeatureWeight(), sif=False):
self.path=path self.path=path
self.lV = lV self.lV = lV
self.l2 = l2 self.l2 = l2
self.n_jobs = n_jobs self.n_jobs = n_jobs
self.featureweight = featureweight self.featureweight = featureweight
self.sif = sif
def fit(self, lX, ly, lV=None): def fit(self, lX, ly, lV=None):
assert lV is not None or self.lV is not None, 'lV not specified' assert lV is not None or self.lV is not None, 'lV not specified'
self.langs = sorted(lX.keys()) self.langs = sorted(lX.keys())
self.MUSE = load_muse_embeddings(self.path, self.langs, self.n_jobs) self.MUSE = load_muse_embeddings(self.path, self.langs, self.n_jobs)
lWordList = {l:self._get_wordlist_from_word2index(lV[l]) for l in self.langs} lWordList = {l:self._get_wordlist_from_word2index(lV[l]) for l in self.langs}
self.MUSE = {l:Muse.extract(lWordList[l]).numpy() for l,Muse in self.MUSE} self.MUSE = {l:Muse.extract(lWordList[l]).numpy() for l,Muse in self.MUSE.items()}
self.featureweight.fit(lX, ly) self.featureweight.fit(lX, ly)
return self return self
@ -150,7 +154,7 @@ class MuseEmbedder:
MUSE = self.MUSE MUSE = self.MUSE
lX = self.featureweight.transform(lX) lX = self.featureweight.transform(lX)
XdotMUSE = Parallel(n_jobs=self.n_jobs)( XdotMUSE = Parallel(n_jobs=self.n_jobs)(
delayed(XdotM)(lX[lang], MUSE[lang]) for lang in self.langs delayed(XdotM)(lX[lang], MUSE[lang], self.sif) for lang in self.langs
) )
lMuse = {l: XdotMUSE[i] for i, l in enumerate(self.langs)} lMuse = {l: XdotMUSE[i] for i, l in enumerate(self.langs)}
lMuse = _normalize(lMuse, self.l2) lMuse = _normalize(lMuse, self.l2)
@ -162,14 +166,18 @@ class MuseEmbedder:
def _get_wordlist_from_word2index(self, word2index): def _get_wordlist_from_word2index(self, word2index):
return list(zip(*sorted(word2index.items(), key=lambda x: x[1])))[0] return list(zip(*sorted(word2index.items(), key=lambda x: x[1])))[0]
def _get_output_dim(self):
return self.MUSE['da'].shape[1]
class WordClassEmbedder: class WordClassEmbedder:
def __init__(self, l2=True, n_jobs=-1, max_label_space=300, featureweight=FeatureWeight()): def __init__(self, l2=True, n_jobs=-1, max_label_space=300, featureweight=FeatureWeight(), sif=False):
self.n_jobs = n_jobs self.n_jobs = n_jobs
self.l2 = l2 self.l2 = l2
self.max_label_space=max_label_space self.max_label_space=max_label_space
self.featureweight = featureweight self.featureweight = featureweight
self.sif = sif
def fit(self, lX, ly, lV=None): def fit(self, lX, ly, lV=None):
self.langs = sorted(lX.keys()) self.langs = sorted(lX.keys())
@ -184,7 +192,7 @@ class WordClassEmbedder:
lWCE = self.lWCE lWCE = self.lWCE
lX = self.featureweight.transform(lX) lX = self.featureweight.transform(lX)
XdotWCE = Parallel(n_jobs=self.n_jobs)( XdotWCE = Parallel(n_jobs=self.n_jobs)(
delayed(XdotM)(lX[lang], lWCE[lang])for lang in self.langs delayed(XdotM)(lX[lang], lWCE[lang], self.sif)for lang in self.langs
) )
lwce = {l: XdotWCE[i] for i, l in enumerate(self.langs)} lwce = {l: XdotWCE[i] for i, l in enumerate(self.langs)}
lwce = _normalize(lwce, self.l2) lwce = _normalize(lwce, self.l2)
@ -193,6 +201,9 @@ class WordClassEmbedder:
def fit_transform(self, lX, ly, lV=None): def fit_transform(self, lX, ly, lV=None):
return self.fit(lX, ly).transform(lX) return self.fit(lX, ly).transform(lX)
def _get_output_dim(self):
return 73
class DocEmbedderList: class DocEmbedderList:
@ -201,6 +212,7 @@ class DocEmbedderList:
if len(embedder_list)==0: embedder_list=[] if len(embedder_list)==0: embedder_list=[]
self.embedders = embedder_list self.embedders = embedder_list
self.aggregation = aggregation self.aggregation = aggregation
print(f'Aggregation mode: {self.aggregation}')
def fit(self, lX, ly, lV=None): def fit(self, lX, ly, lV=None):
for transformer in self.embedders: for transformer in self.embedders:
@ -238,16 +250,25 @@ class DocEmbedderList:
langs = sorted(lX.keys()) langs = sorted(lX.keys())
lZparts = {l: None for l in langs} lZparts = {l: None for l in langs}
# min_dim = min([transformer._get_output_dim() for transformer in self.embedders])
min_dim = 300
for transformer in self.embedders: for transformer in self.embedders:
lZ = transformer.transform(lX) lZ = transformer.transform(lX)
nC = min([lZ[lang].shape[1] for lang in langs])
for l in langs: for l in langs:
Z = lZ[l] Z = lZ[l]
if Z.shape[1] > min_dim:
print(f'Z-space matrix has more dimensions ({Z.shape[1]}) than the smallest representation ({min_dim}). '
f'Applying PCA(n_components={min_dim})')
pca = PCA(n_components=min_dim)
Z = pca.fit(Z).transform(Z)
if lZparts[l] is None: if lZparts[l] is None:
lZparts[l] = Z lZparts[l] = Z
else: else:
lZparts[l] += Z lZparts[l] += Z
n_transformers = len(self.embedders) n_transformers = len(self.embedders)
nC = min([lZparts[lang].shape[1] for lang in langs])
return {l:lZparts[l] / n_transformers for l in langs} return {l:lZparts[l] / n_transformers for l in langs}
@ -266,7 +287,7 @@ class FeatureSet2Posteriors:
self.transformer = transformer self.transformer = transformer
self.l2=l2 self.l2=l2
self.n_jobs = n_jobs self.n_jobs = n_jobs
self.prob_classifier = MetaClassifier(SVC(kernel='rbf', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs) self.prob_classifier = MetaClassifier(SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
def fit(self, lX, ly, lV=None): def fit(self, lX, ly, lV=None):
if lV is None and hasattr(self.transformer, 'lV'): if lV is None and hasattr(self.transformer, 'lV'):
@ -412,11 +433,13 @@ def word_class_embedding_matrix(X, Y, max_label_space=300):
return WCE return WCE
def XdotM(X,M): def XdotM(X,M, sif):
# return X.dot(M) # return X.dot(M)
# print(f'X={X.shape}, M={M.shape}') print(f'X={X.shape}, M={M.shape}')
E = X.dot(M) E = X.dot(M)
E = remove_pc(E, npc=1) if sif:
print("removing pc...")
E = remove_pc(E, npc=1)
return E return E

View File

@ -1,92 +0,0 @@
from optparse import OptionParser
from util.results import PolylingualClassificationResults
from dataset_builder import MultilingualDataset
from keras.preprocessing.text import Tokenizer
from learning.learners import MonolingualNetSvm
from sklearn.svm import SVC
import pickle
parser = OptionParser()
parser.add_option("-d", "--dataset", dest="dataset",
help="Path to the multilingual dataset processed and stored in .pickle format",
default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
help="Optimize hyperparameters", default=False)
parser.add_option("-s", "--set_c", dest="set_c",type=float,
help="Set the C parameter", default=1)
(op, args) = parser.parse_args()
###################################################################################################################
def get_learner(calibrate=False, kernel='linear'):
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto')
def get_params(dense=False):
if not op.optimc:
return None
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
kernel = 'rbf' if dense else 'linear'
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
# PREPROCESS TEXT AND SAVE IT ... both for SVM and NN
def preprocess_data(lXtr, lXte, lytr, lyte):
tokenized_tr = dict()
tokenized_te = dict()
for lang in lXtr.keys():
alltexts = ' '.join(lXtr[lang])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(alltexts.split(' '))
tokenizer.oov_token = len(tokenizer.word_index)+1
# dumping train set
sequences_tr = tokenizer.texts_to_sequences(lXtr[lang])
tokenized_tr[lang] = (tokenizer.word_index, sequences_tr, lytr[lang])
# dumping test set
sequences_te = tokenizer.texts_to_sequences(lXte[lang])
tokenized_te[lang] = (tokenizer.word_index, sequences_te, lyte[lang])
with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'wb') as f:
pickle.dump(tokenized_tr, f)
with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_test.pickle', 'wb') as f:
pickle.dump(tokenized_te, f)
print('Successfully dumped data')
# def load_preprocessed():
# with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'rb') as f:
# return pickle.load(f)
#
# def build_embedding_matrix(lang, word_index):
# type = 'MUSE'
# path = '/home/andreapdr/CLESA/'
# MUSE = EmbeddingsAligned(type, path, lang, word_index.keys())
# return MUSE
########## MAIN #################################################################################################
if __name__ == '__main__':
results = PolylingualClassificationResults('./results/NN_FPEC_results.csv')
data = MultilingualDataset.load(op.dataset)
lXtr, lytr = data.training()
lXte, lyte = data.test()
if op.set_c != -1:
meta_parameters = None
else:
meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}]
test_architecture = MonolingualNetSvm(lXtr,
lytr,
first_tier_learner=get_learner(calibrate=True),
first_tier_parameters=None,
n_jobs=1)
test_architecture.fit()

View File

@ -1,6 +1,6 @@
import argparse import argparse
import torch.nn as nn import torch.nn as nn
from torch.optim.lr_scheduler import StepLR from torch.optim.lr_scheduler import StepLR, MultiStepLR
from dataset_builder import MultilingualDataset from dataset_builder import MultilingualDataset
from learning.transformers import load_muse_embeddings from learning.transformers import load_muse_embeddings
from models.lstm_class import RNNMultilingualClassifier from models.lstm_class import RNNMultilingualClassifier
@ -9,8 +9,6 @@ from util.early_stop import EarlyStopping
from util.common import * from util.common import *
from util.file import create_if_not_exist from util.file import create_if_not_exist
from time import time from time import time
from embeddings.pretrained import *
from os.path import join
from tqdm import tqdm from tqdm import tqdm
from util.evaluation import evaluate from util.evaluation import evaluate
from util.file import get_file_name from util.file import get_file_name
@ -100,7 +98,7 @@ def main():
# Loading the dataset # Loading the dataset
data = MultilingualDataset.load(opt.dataset) data = MultilingualDataset.load(opt.dataset)
# data.set_view(languages=['de', 'fr', 'sv', 'da', 'es', 'it']) data.set_view(languages=['de', 'fr']) #, 'it', 'en']) # 'sv', 'da', 'es', 'it'])
data.show_dimensions() data.show_dimensions()
langs = data.langs() langs = data.langs()
l_devel_raw, l_devel_target = data.training(target_as_csr=True) l_devel_raw, l_devel_target = data.training(target_as_csr=True)
@ -108,6 +106,7 @@ def main():
# Loading the MUSE pretrained embeddings (only if requested) # Loading the MUSE pretrained embeddings (only if requested)
lpretrained, lpretrained_vocabulary = load_pretrained_embeddings(opt.we_path, langs) lpretrained, lpretrained_vocabulary = load_pretrained_embeddings(opt.we_path, langs)
# lpretrained_vocabulary = none_dict(langs) # do not keep track of words known in pretrained embeddings vocabulary that are also present in test set
# Data preparation: indexing / splitting / embedding matrices (pretrained + supervised) / posterior probs # Data preparation: indexing / splitting / embedding matrices (pretrained + supervised) / posterior probs
multilingual_index = MultilingualIndex() multilingual_index = MultilingualIndex()
@ -115,10 +114,26 @@ def main():
multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=opt.seed) multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=opt.seed)
multilingual_index.embedding_matrices(lpretrained, opt.supervised) multilingual_index.embedding_matrices(lpretrained, opt.supervised)
if opt.posteriors: if opt.posteriors:
lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=opt.svm_max_docs) lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=5000, store_posteriors=True) #stored_post=True) #opt.svm_max_docs)
else: else:
lPtr, lPva, lPte = None, None, None lPtr, lPva, lPte = None, None, None
# just_test = False
# if just_test:
#
# model = torch.load(
# '../checkpoint/rnn(H512)-Muse-WCE-Posteriors-(trainable)-jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle')
# criterion = torch.nn.BCEWithLogitsLoss().cuda()
#
# # batcher_train = Batch(opt.batch_size, batches_per_epoch=10, languages=langs, lpad=multilingual_index.l_pad())
#
# batcher_eval = Batch(opt.batch_size, batches_per_epoch=-1, languages=langs, lpad=multilingual_index.l_pad())
# l_test_index = multilingual_index.l_test_index()
# epoch = 1
# tinit = time()
# test(model, batcher_eval, l_test_index, lPte, l_test_target, tinit, epoch, logfile, criterion, 'te')
# exit('Loaded')
# Model initialization # Model initialization
model = init_Net(data.num_categories(), multilingual_index) model = init_Net(data.num_categories(), multilingual_index)
@ -130,7 +145,7 @@ def main():
tinit = time() tinit = time()
create_if_not_exist(opt.checkpoint_dir) create_if_not_exist(opt.checkpoint_dir)
early_stop = EarlyStopping(model, patience=opt.patience, checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}') early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience, checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}')
l_train_index, l_train_target = multilingual_index.l_train() l_train_index, l_train_target = multilingual_index.l_train()
l_val_index, l_val_target = multilingual_index.l_val() l_val_index, l_val_target = multilingual_index.l_val()
@ -155,7 +170,6 @@ def main():
break break
# training is over # training is over
# restores the best model according to the Mf1 of the validation set (only when plotmode==False) # restores the best model according to the Mf1 of the validation set (only when plotmode==False)
# stoptime = early_stop.stop_time - tinit # stoptime = early_stop.stop_time - tinit
# stopepoch = early_stop.best_epoch # stopepoch = early_stop.best_epoch
@ -164,6 +178,8 @@ def main():
if opt.plotmode==False: if opt.plotmode==False:
print('-' * 80) print('-' * 80)
print('Training over. Performing final evaluation') print('Training over. Performing final evaluation')
# torch.cuda.empty_cache()
model = early_stop.restore_checkpoint() model = early_stop.restore_checkpoint()
if opt.val_epochs>0: if opt.val_epochs>0:
@ -183,10 +199,14 @@ def get_lr(optimizer):
def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile, criterion, optim, epoch, method_name): def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile, criterion, optim, epoch, method_name):
_dataset_path = opt.dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1]
loss_history = [] loss_history = []
model.train() model.train()
for idx, (batch, post, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, lytr)): for idx, (batch, post, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, lytr)):
optim.zero_grad() optim.zero_grad()
_out = model(batch, post, lang)
loss = criterion(model(batch, post, lang), target) loss = criterion(_out, target)
loss.backward() loss.backward()
clip_gradient(model) clip_gradient(model)
@ -195,7 +215,7 @@ def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile,
if idx % opt.log_interval == 0: if idx % opt.log_interval == 0:
interval_loss = np.mean(loss_history[-opt.log_interval:]) interval_loss = np.mean(loss_history[-opt.log_interval:])
print(f'{opt.dataset} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}') print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
mean_loss = np.mean(interval_loss) mean_loss = np.mean(interval_loss)
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit) logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
@ -203,6 +223,8 @@ def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile,
def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logfile, criterion, measure_prefix): def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logfile, criterion, measure_prefix):
loss_history = []
model.eval() model.eval()
langs = sorted(ltest_index.keys()) langs = sorted(ltest_index.keys())
predictions = {l:[] for l in langs} predictions = {l:[] for l in langs}
@ -214,6 +236,7 @@ def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logf
prediction = predict(logits) prediction = predict(logits)
predictions[lang].append(prediction) predictions[lang].append(prediction)
yte_stacked[lang].append(target.detach().cpu().numpy()) yte_stacked[lang].append(target.detach().cpu().numpy())
loss_history.append(loss)
ly = {l:np.vstack(yte_stacked[l]) for l in langs} ly = {l:np.vstack(yte_stacked[l]) for l in langs}
ly_ = {l:np.vstack(predictions[l]) for l in langs} ly_ = {l:np.vstack(predictions[l]) for l in langs}
@ -224,17 +247,15 @@ def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logf
metrics.append([macrof1, microf1, macrok, microk]) metrics.append([macrof1, microf1, macrok, microk])
if measure_prefix=='te': if measure_prefix=='te':
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
# results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
# (config['max_label_space'], classifier.best_components),
# config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
# lang, macrof1, microf1, macrok, microk, '')
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0) Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]') print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=tend) mean_loss = np.mean(loss_history)
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mf1, timelapse=tend) logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-accuracy', value=acc, timelapse=tend) logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit)
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=loss, timelapse=tend) logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit)
return Mf1 return Mf1

View File

@ -1,7 +1,7 @@
import os import os
from dataset_builder import MultilingualDataset from dataset_builder import MultilingualDataset
# from learning.learners import * # from learning.learners import *
from learning.learners import FunnellingMultimodal # from learning.learners import FunnellingMultimodal
from learning.transformers import Funnelling, PosteriorProbabilitiesEmbedder, MetaClassifier, \ from learning.transformers import Funnelling, PosteriorProbabilitiesEmbedder, MetaClassifier, \
TfidfVectorizerMultilingual, DocEmbedderList, WordClassEmbedder, MuseEmbedder, FeatureSet2Posteriors, Voting TfidfVectorizerMultilingual, DocEmbedderList, WordClassEmbedder, MuseEmbedder, FeatureSet2Posteriors, Voting
from util.evaluation import * from util.evaluation import *
@ -14,14 +14,14 @@ from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
parser = OptionParser() parser = OptionParser()
parser.add_option("-d", "--dataset", dest="dataset", # parser.add_option("-d", "--dataset", dest="dataset",
help="Path to the multilingual dataset processed and stored in .pickle format", # help="Path to the multilingual dataset processed and stored in .pickle format",
default="../rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle") # default="../rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
parser.add_option("-o", "--output", dest="output", parser.add_option("-o", "--output", dest="output",
help="Result file", type=str, default='./results/results.csv') help="Result file", type=str, default='./results/results.csv')
parser.add_option("-P", "--probs", dest="probs", action='store_true', parser.add_option("-P", "--probs", dest="posteriors", action='store_true',
help="Add posterior probabilities to the document embedding representation", default=False) help="Add posterior probabilities to the document embedding representation", default=False)
parser.add_option("-S", "--supervised", dest="supervised", action='store_true', parser.add_option("-S", "--supervised", dest="supervised", action='store_true',
@ -46,6 +46,9 @@ parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
help="If smaller than number of target classes, PCA will be applied to supervised matrix. ", help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
default=300) default=300)
parser.add_option("-r", "--remove-pc", dest="sif", action='store_true',
help="Remove common component when computing dot product of word embedding matrices", default=False)
# parser.add_option("-u", "--upca", dest="max_labels_U", type=int, # parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
# help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix." # help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix."
# " If set to 0 it will automatically search for the best number of components", default=300) # " If set to 0 it will automatically search for the best number of components", default=300)
@ -72,15 +75,18 @@ def get_params(dense=False):
if __name__ == '__main__': if __name__ == '__main__':
(op, args) = parser.parse_args() (op, args) = parser.parse_args()
assert exists(op.dataset), 'Unable to find file '+str(op.dataset) assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)'
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option' dataset = args[0]
assert op.probs or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
dataset_file = os.path.basename(op.dataset) assert exists(dataset), 'Unable to find file '+str(dataset)
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
dataset_file = os.path.basename(dataset)
results = PolylingualClassificationResults(op.output) results = PolylingualClassificationResults(op.output)
data = MultilingualDataset.load(op.dataset) data = MultilingualDataset.load(dataset)
data.show_dimensions() data.show_dimensions()
lXtr, lytr = data.training() lXtr, lytr = data.training()
@ -88,8 +94,9 @@ if __name__ == '__main__':
meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}] meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
result_id = f'{dataset_file}_Prob{op.probs}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}' # result_id = f'{dataset_file}_Prob{op.posteriors}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}'
result_id = f'{dataset_file}_ProbPost={op.posteriors}_WCE={op.supervised}(PCA={op.max_labels_S})_' \
f'MUSE={op.pretrained}_weight={"todo"}_l2={"todo"}_zscore={"todo"}{"_optimC" if op.optimc else ""}'
print(f'{result_id}') print(f'{result_id}')
# text preprocessing # text preprocessing
@ -100,7 +107,7 @@ if __name__ == '__main__':
lV = tfidfvectorizer.vocabulary() lV = tfidfvectorizer.vocabulary()
classifiers = [] classifiers = []
if op.probs: if op.posteriors:
classifiers.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None)) classifiers.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None))
if op.supervised: if op.supervised:
classifiers.append(FeatureSet2Posteriors(WordClassEmbedder(max_label_space=op.max_labels_S))) classifiers.append(FeatureSet2Posteriors(WordClassEmbedder(max_label_space=op.max_labels_S)))
@ -115,13 +122,37 @@ if __name__ == '__main__':
print('\n# Evaluating ...') print('\n# Evaluating ...')
l_eval = evaluate_method(classifier, lXte, lyte) l_eval = evaluate_method(classifier, lXte, lyte)
# building a short configuration identifier to be printed in the log
_id = ''
_id_conf = [op.posteriors, op.supervised, op.pretrained]
_id_name = ['+P', '+W', '+M']
for i, conf in enumerate(_id_conf):
if conf:
_id += _id_name[i]
_id = _id.lstrip('+')
_dataset_path = dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1]
metrics = [] metrics = []
for lang in lXte.keys(): for lang in lXte.keys():
macrof1, microf1, macrok, microk = l_eval[lang] macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk]) metrics.append([macrof1, microf1, macrok, microk])
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
# results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'], results.add_row(method='Voting',
# (config['max_label_space'], classifier.best_components), learner='svm',
# config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time, optimp=op.optimc,
# lang, macrof1, microf1, macrok, microk, '') sif=op.sif,
zscore='todo',
l2='todo',
wescaler='todo',
pca=op.max_labels_S,
id=_id,
dataset=dataset_id,
time='todo',
lang=lang,
macrof1=macrof1,
microf1=microf1,
macrok=macrok,
microk=microk,
notes='')
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0)) print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))

View File

@ -11,7 +11,7 @@ from sklearn.svm import SVC
parser = OptionParser(usage="usage: %prog datapath [options]") parser = OptionParser(usage="usage: %prog datapath [options]")
parser.add_option("-o", "--output", dest="output", parser.add_option("-o", "--output", dest="output",
help="Result file", type=str, default='./results/results.csv') help="Result file", type=str, default='multiModal_log.csv')
parser.add_option("-P", "--posteriors", dest="posteriors", action='store_true', parser.add_option("-P", "--posteriors", dest="posteriors", action='store_true',
help="Add posterior probabilities to the document embedding representation", default=False) help="Add posterior probabilities to the document embedding representation", default=False)
@ -22,8 +22,8 @@ parser.add_option("-S", "--supervised", dest="supervised", action='store_true',
parser.add_option("-U", "--pretrained", dest="pretrained", action='store_true', parser.add_option("-U", "--pretrained", dest="pretrained", action='store_true',
help="Add pretrained MUSE embeddings to the document embedding representation", default=False) help="Add pretrained MUSE embeddings to the document embedding representation", default=False)
parser.add_option("--nol2", dest="nol2", action='store_true', parser.add_option("--l2", dest="l2", action='store_true',
help="Deactivates l2 normalization as a post-processing for the document embedding views", default=False) help="Activates l2 normalization as a post-processing for the document embedding views", default=False)
parser.add_option("--allprob", dest="allprob", action='store_true', parser.add_option("--allprob", dest="allprob", action='store_true',
help="All views are generated as posterior probabilities. This affects the supervised and pretrained " help="All views are generated as posterior probabilities. This affects the supervised and pretrained "
@ -48,11 +48,28 @@ parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
help="If smaller than number of target classes, PCA will be applied to supervised matrix. ", help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
default=300) default=300)
parser.add_option("-r", "--remove-pc", dest="sif", action='store_true',
help="Remove common component when computing dot product of word embedding matrices", default=False)
parser.add_option("-z", "--zscore", dest="zscore", action='store_true',
help="Z-score normalize matrices (WCE and MUSE)", default=False)
parser.add_option("-a", "--agg", dest="agg", action='store_true',
help="Set aggregation function of the common Z-space to average (Default: concatenation)", default=False)
def get_learner(calibrate=False, kernel='linear'): def get_learner(calibrate=False, kernel='linear'):
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, gamma='auto') return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, gamma='auto')
def get_params():
if not op.optimc:
return None
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
kernel = 'rbf'
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
####################################################################################################################### #######################################################################################################################
@ -64,17 +81,23 @@ if __name__ == '__main__':
assert exists(dataset), 'Unable to find file '+str(dataset) assert exists(dataset), 'Unable to find file '+str(dataset)
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option' assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed' assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
l2=(op.nol2==False) l2=op.l2
dataset_file = os.path.basename(dataset) dataset_file = os.path.basename(dataset)
results = PolylingualClassificationResults(op.output) results = PolylingualClassificationResults('../log/' + op.output)
allprob='Prob' if op.allprob else '' allprob='Prob' if op.allprob else ''
result_id = f'{dataset_file}_ProbPost={op.posteriors}_{allprob}WCE={op.supervised}(PCA={op.max_labels_S})_{allprob}' \ result_id = f'{dataset_file}_ProbPost={op.posteriors}_{allprob}WCE={op.supervised}(PCA={op.max_labels_S})_{allprob}' \
f'MUSE={op.pretrained}_weight={op.feat_weight}_l2={l2}{"_optimC" if op.optimc else ""}' f'MUSE={op.pretrained}_weight={op.feat_weight}_l2={l2}_zscore={op.zscore}{"_optimC" if op.optimc else ""}'
print(f'{result_id}') print(f'{result_id}')
# set z-score range - with slice(0,0) the mean will be 0 and the std 1, so standardization has no effect
standardize_range = slice(0,0)
if op.zscore:
standardize_range = None
data = MultilingualDataset.load(dataset) data = MultilingualDataset.load(dataset)
# data.set_view(languages=['fr', 'it'])
data.show_dimensions() data.show_dimensions()
lXtr, lytr = data.training() lXtr, lytr = data.training()
lXte, lyte = data.test() lXte, lyte = data.test()
@ -86,23 +109,23 @@ if __name__ == '__main__':
feat_weighting = FeatureWeight(op.feat_weight, agg='mean') feat_weighting = FeatureWeight(op.feat_weight, agg='mean')
# # document embedding modules # # document embedding modules
doc_embedder = DocEmbedderList(aggregation='concat') doc_embedder = DocEmbedderList(aggregation='mean' if op.agg else 'concat')
if op.posteriors: if op.posteriors:
doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True, kernel='linear'), l2=l2)) doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True, kernel='linear'), l2=l2))
if op.supervised: if op.supervised:
wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting) wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, sif=op.sif)
if op.allprob: if op.allprob:
wce = FeatureSet2Posteriors(wce, l2=l2) wce = FeatureSet2Posteriors(wce, l2=l2)
doc_embedder.append(wce) doc_embedder.append(wce)
if op.pretrained: if op.pretrained:
muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting) muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif)
if op.allprob: if op.allprob:
muse = FeatureSet2Posteriors(muse, l2=l2) muse = FeatureSet2Posteriors(muse, l2=l2)
doc_embedder.append(muse) doc_embedder.append(muse)
# metaclassifier # metaclassifier
meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}] meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'), meta_parameters=meta_parameters) meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'), meta_parameters=get_params(), standardize_range=standardize_range)
# ensembling the modules # ensembling the modules
classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta) classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)
@ -113,13 +136,40 @@ if __name__ == '__main__':
print('\n# Evaluating ...') print('\n# Evaluating ...')
l_eval = evaluate_method(classifier, lXte, lyte) l_eval = evaluate_method(classifier, lXte, lyte)
# building a short configuration identifier to be printed in the log
_id = ''
_id_conf = [op.posteriors, op.supervised, op.pretrained]
_id_name = ['+P', '+W', '+M']
for i, conf in enumerate(_id_conf):
if conf:
_id += _id_name[i]
_id = _id.lstrip('+')
_id = _id if not op.agg else _id + '_mean'
_id = _id if not op.allprob else _id + '_allprob'
_dataset_path = dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1]
metrics = [] metrics = []
for lang in lXte.keys(): for lang in lXte.keys():
macrof1, microf1, macrok, microk = l_eval[lang] macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk]) metrics.append([macrof1, microf1, macrok, microk])
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
# results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'], results.add_row(method='MultiModal',
# (config['max_label_space'], classifier.best_components), learner='svm',
# config['dim_reduction_unsupervised'], op.optimc, dataset.split('/')[-1], classifier.time, optimp=op.optimc,
# lang, macrof1, microf1, macrok, microk, '') sif= op.sif,
zscore=op.zscore,
l2= op.l2,
wescaler= op.feat_weight,
pca=op.max_labels_S,
id=_id,
dataset=dataset_id,
time='todo',
lang=lang,
macrof1=macrof1,
microf1=microf1,
macrok=macrok,
microk=microk,
notes='')
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0)) print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))

View File

@ -27,7 +27,7 @@ class RNNMultilingualClassifier(nn.Module):
self.n_layers = 1 self.n_layers = 1
self.n_directions = 1 self.n_directions = 1
self.dropout = nn.Dropout(0.2) self.dropout = nn.Dropout(0.6)
lstm_out = 256 lstm_out = 256
ff1 = 512 ff1 = 512
@ -45,7 +45,7 @@ class RNNMultilingualClassifier(nn.Module):
llearnable_embeddings[l] = learnable_embeddings llearnable_embeddings[l] = learnable_embeddings
self.embedding_length = embedding_length self.embedding_length = embedding_length
# self.rnn = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2)) # self.lstm = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2))
self.rnn = nn.GRU(self.embedding_length, hidden_size) self.rnn = nn.GRU(self.embedding_length, hidden_size)
self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out) self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out)
self.lpretrained_embeddings.update(lpretrained_embeddings) self.lpretrained_embeddings.update(lpretrained_embeddings)

355
src/new_mbert.py Normal file
View File

@ -0,0 +1,355 @@
"""
Test with a smaller subset of languages.
1. Load doc (RCV1/2)
2. Tokenize texts via BertTokenizer (I should already have these dumps)
3. Construct better Dataloader/Datasets. NB: I need to keep track of the languages only for
the testing phase (but who cares actually? If I have to do it for the testing phase, I think
it is better to deploy it also in the training phase...)
4. ...
5. I have to understand whether the pooled hidden state of the last layer is much worse than its averaged
version (however, in BertForSequenceClassification I guess the pooled version is passed through the output
linear layer in order to get the prediction scores?). See the mean-pooling sketch after the imports below.
6. At the same time, I also have to build an end-to-end model in order to fine-tune it. The previous step
would be useful when deploying mBert as a View Generator. (Refactor gFun code with view generators?)
7. ...
8. Profits
"""
from dataset_builder import MultilingualDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch
from util.common import clip_gradient, predict
from time import time
from util.csv_log import CSVLog
from util.evaluation import evaluate
from util.early_stop import EarlyStopping
from torch.optim.lr_scheduler import StepLR
from sklearn.model_selection import train_test_split
import argparse
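# Hedged illustration for item 5 of the notes above (not used below): a masked average of the last hidden
# states, to be contrasted with the pooled [CLS] representation that BertForSequenceClassification feeds to
# its classification head. Tensor shapes are assumptions: (batch, seq_len, hidden) and (batch, seq_len).
def _mean_pool_last_hidden(last_hidden_state, attention_mask):
    mask = attention_mask.unsqueeze(-1).float()       # (batch, seq_len, 1)
    summed = (last_hidden_state * mask).sum(dim=1)    # sum over non-padding positions
    counts = mask.sum(dim=1).clamp(min=1e-9)          # number of real tokens per document
    return summed / counts                            # (batch, hidden)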
def get_model(n_out):
print('# Initializing model ...')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=n_out)
return model
def set_method_name():
return 'mBERT'
def init_optimizer(model, lr):
# return AdamW(model.parameters(), lr=lr, weight_decay=opt.weight_decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in model.named_parameters()
if not any(nd in n for nd in no_decay)],
'weight_decay': opt.weight_decay},
{'params': [p for n, p in model.named_parameters()
if any(nd in n for nd in no_decay)],
'weight_decay': 0.0}  # no weight decay for bias and LayerNorm parameters
]
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
return optimizer
def init_logfile(method_name, opt):
logfile = CSVLog(opt.log_file, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse'])
logfile.set_default('dataset', opt.dataset)
logfile.set_default('run', opt.seed)
logfile.set_default('method', method_name)
assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} and run {opt.seed} already calculated'
return logfile
def get_lr(optimizer):
for param_group in optimizer.param_groups:
return param_group['lr']
def get_dataset_name(datapath):
possible_splits = [str(i) for i in range(10)]
splitted = datapath.split('_')
id_split = splitted[-1].split('.')[0][-1]
if id_split in possible_splits:
dataset_name = splitted[0].split('/')[-1]
return f'{dataset_name}_run{id_split}'
def load_datasets(datapath):
data = MultilingualDataset.load(datapath)
data.set_view(languages=['nl'])  # Testing with just one language
data.show_dimensions()
l_devel_raw, l_devel_target = data.training(target_as_csr=False)
l_test_raw, l_test_target = data.test(target_as_csr=False)
return l_devel_raw, l_devel_target, l_test_raw, l_test_target
def do_tokenization(l_dataset, max_len=512):
print('# Starting Tokenization ...')
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
langs = l_dataset.keys()
l_tokenized = {}
for lang in langs:
l_tokenized[lang] = tokenizer(l_dataset[lang],
truncation=True,
max_length=max_len,
add_special_tokens=True,
padding='max_length')
return l_tokenized
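# Note: the tokenizer returns a dict-like object per language with keys such as 'input_ids' and
# 'attention_mask'; only 'input_ids' is consumed by TrainingDataset below.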
class TrainingDataset(Dataset):
"""
data: dict of lang-specific tokenized data
labels: dict of lang-specific targets
(a usage sketch follows the class)
"""
def __init__(self, data, labels):
self.langs = data.keys()
self.lang_ids = {lang:identifier for identifier, lang in enumerate(self.langs)}
for i, lang in enumerate(self.langs):
# print(lang)
_data = data[lang]['input_ids']
_data = np.array(_data)
_labels = labels[lang]
_lang_value = np.full(len(_data), self.lang_ids[lang])
if i == 0:
self.data = _data
self.labels = _labels
self.lang_index = _lang_value
else:
self.data = np.vstack((self.data, _data))
self.labels = np.vstack((self.labels, _labels))
self.lang_index = np.concatenate((self.lang_index, _lang_value))
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
x = self.data[idx]
y = self.labels[idx]
lang = self.lang_index[idx]
return x, torch.tensor(y, dtype=torch.float), lang
# return x, y, lang
def get_lang_ids(self):
return self.lang_ids
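# Usage sketch (illustrative inputs, mirroring main() below):
#   data   = {'en': {'input_ids': [[101, ...], ...]}, 'it': {'input_ids': [[101, ...], ...]}}
#   labels = {'en': np.ndarray of shape (n_docs, n_classes), 'it': ...}
#   loader = DataLoader(TrainingDataset(data, labels), batch_size=4, shuffle=True)
#   for input_ids, target, lang_idx in loader:
#       ...   # lang_idx maps back to language names via get_lang_ids()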
def freeze_encoder(model):
for param in model.base_model.parameters():
param.requires_grad = False
return model
def check_param_grad_status(model):
print('#'*50)
print('Model parameter status')
for name, child in model.named_children():
trainable = False
for param in child.parameters():
if param.requires_grad:
trainable = True
if not trainable:
print(f'{name} is frozen')
else:
print(f'{name} is not frozen')
print('#'*50)
def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile):
_dataset_path = opt.dataset.split('/')[-1].split('_')
# dataset_id = 'RCV1/2_run0_newBert'
dataset_id = _dataset_path[0] + _dataset_path[-1]
loss_history = []
model.train()
for idx, (batch, target, lang_idx) in enumerate(train_dataloader):
optim.zero_grad()  # reset gradients at each step
out = model(batch.cuda())
loss = criterion(out[0], target.cuda())
loss.backward()
clip_gradient(model)
optim.step()
loss_history.append(loss.item())
if idx % opt.log_interval == 0:
interval_loss = np.mean(loss_history[-opt.log_interval:])
print(
f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
mean_loss = np.mean(interval_loss)
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
return mean_loss
def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, measure_prefix):
print('# Validating model ...')
loss_history = []
model.eval()
langs = lang_ids.keys()
id_2_lang = {v:k for k,v in lang_ids.items()}
predictions = {l: [] for l in langs}
yte_stacked = {l: [] for l in langs}
for batch, target, lang_idx in test_dataloader:
out = model(batch.cuda())
logits = out[0]
loss = criterion(logits, target.cuda()).item()
prediction = predict(logits)
loss_history.append(loss)
# Assigning each prediction to the language-specific dicts (predictions and yte_stacked) according to lang_idx
for i, pred in enumerate(prediction):
lang_pred = id_2_lang[lang_idx.numpy()[i]]
predictions[lang_pred].append(pred)
yte_stacked[lang_pred].append(target[i].detach().cpu().numpy())
ly = {l: np.vstack(yte_stacked[l]) for l in langs}
ly_ = {l: np.vstack(predictions[l]) for l in langs}
l_eval = evaluate(ly, ly_)
metrics = []
for lang in langs:
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
if measure_prefix == 'te':
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
mean_loss = np.mean(loss_history)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit)
return Mf1
def get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop, max_val, seed):
# use independent per-language dicts for the two splits, so that the tuple assignment below
# does not overwrite the training input_ids with the validation ones
l_split_tr = {l: dict(l_tokenized_tr[l]) for l in l_tokenized_tr.keys()}
l_split_tr_target = {l: [] for l in l_tokenized_tr.keys()}
l_split_va = {l: dict(l_tokenized_tr[l]) for l in l_tokenized_tr.keys()}
l_split_val_target = {l: [] for l in l_tokenized_tr.keys()}
for lang in l_tokenized_tr.keys():
val_size = int(min(len(l_tokenized_tr[lang]['input_ids']) * val_prop, max_val))
l_split_tr[lang]['input_ids'], l_split_va[lang]['input_ids'], l_split_tr_target[lang], l_split_val_target[lang] = \
train_test_split(l_tokenized_tr[lang]['input_ids'], l_devel_target[lang], test_size=val_size, random_state=seed, shuffle=True)
return l_split_tr, l_split_tr_target, l_split_va, l_split_val_target

def main():
    print('Running main ...')

    DATAPATH = opt.dataset
    method_name = set_method_name()
    logfile = init_logfile(method_name, opt)

    l_devel_raw, l_devel_target, l_test_raw, l_test_target = load_datasets(DATAPATH)
    l_tokenized_tr = do_tokenization(l_devel_raw, max_len=512)
    l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = \
        get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop=0.2, max_val=2000, seed=opt.seed)
    l_tokenized_te = do_tokenization(l_test_raw, max_len=512)

    tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target)
    va_dataset = TrainingDataset(l_split_va, l_split_val_target)
    te_dataset = TrainingDataset(l_tokenized_te, l_test_target)

    tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True)
    va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=False)
    te_dataloader = DataLoader(te_dataset, batch_size=2, shuffle=False)

    # Initializing model (73 is presumably the number of target classes of this dataset)
    model = get_model(73)
    model = model.cuda()
    criterion = torch.nn.BCEWithLogitsLoss().cuda()
    optim = init_optimizer(model, lr=opt.lr)
    # lr_scheduler = StepLR(optim, step_size=25, gamma=0.5)
    early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience,
                               checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_dataset_name(opt.dataset)}')
    # lr_scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optim, num_warmup_steps= , num_training_steps=)
    # print(model)
    # Freezing encoder
    # model = freeze_encoder(model)
    check_param_grad_status(model)

    # Training loop
    tinit = time()
    lang_ids = va_dataset.lang_ids
    for epoch in range(1, opt.nepochs + 1):
        print('# Start Training ...')
        train(model, tr_dataloader, epoch, criterion, optim, 'TestingBert', tinit, logfile)
        # lr_scheduler.step(epoch=None)  # reduces the learning rate

        # validation
        macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va')
        early_stop(macrof1, epoch)

        if opt.test_each > 0:
            if (opt.plotmode and (epoch == 1 or epoch % opt.test_each == 0)) or \
                    (not opt.plotmode and epoch % opt.test_each == 0 and epoch < opt.nepochs):
                test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te')

        if early_stop.STOP:
            print('[early-stop] STOP')
            if not opt.plotmode:
                break

    if not opt.plotmode:
        print('-' * 80)
        print('Training over. Performing final evaluation')
        model = early_stop.restore_checkpoint()

        if opt.val_epochs > 0:
            print(f'running last {opt.val_epochs} training epochs on the validation set')
            for val_epoch in range(1, opt.val_epochs + 1):
                train(model, va_dataloader, epoch + val_epoch, criterion, optim, 'TestingBert', tinit, logfile)

        # final test
        print('Training complete: testing')
        test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te')

    exit('Code Executed!')

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Neural text classification with Word-Class Embeddings - mBert model')
    parser.add_argument('--dataset', type=str,
                        default='/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle',
                        metavar='datasetpath', help='path to the pickled dataset')
    parser.add_argument('--nepochs', type=int, default=200, metavar='int',
                        help='number of epochs (default: 200)')
    parser.add_argument('--lr', type=float, default=2e-5, metavar='float',
                        help='learning rate (default: 2e-5)')
    parser.add_argument('--weight_decay', type=float, default=0, metavar='float',
                        help='weight decay (default: 0)')
    parser.add_argument('--patience', type=int, default=10, metavar='int',
                        help='patience for early-stop (default: 10)')
    parser.add_argument('--log-interval', type=int, default=20, metavar='int',
                        help='how many batches to wait before printing training status')
    parser.add_argument('--log-file', type=str, default='../log/log_mBert.csv', metavar='str',
                        help='path to the log csv file')
    parser.add_argument('--seed', type=int, default=1, metavar='int',
                        help='random seed (default: 1)')
    parser.add_argument('--force', action='store_true', default=False,
                        help='do not check if this experiment has already been run')
    parser.add_argument('--checkpoint-dir', type=str, default='../checkpoint', metavar='str',
                        help='path to the directory containing checkpoints')
    parser.add_argument('--plotmode', action='store_true', default=False,
                        help='in plot mode, executes a long run in order to generate enough data to produce '
                             'trend plots (test-each should be > 0). This mode is used to produce plots and '
                             'does not perform an evaluation on the test set.')
    parser.add_argument('--test-each', type=int, default=0, metavar='int',
                        help='how many epochs to wait before invoking test (default: 0, only at the end)')
    parser.add_argument('--val-epochs', type=int, default=1, metavar='int',
                        help='number of training epochs to perform on the validation set once training is over (default 1)')
    opt = parser.parse_args()

    # Testing different parameters: these overrides take precedence over the CLI values above
    opt.weight_decay = 0.01
    opt.patience = 5

    main()

    # TODO: refactor .cuda() -> .to(device) in order to check if the process is faster on CPU given the bigger batch size
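
A minimal sketch of the device refactor hinted at by the TODO above (hypothetical; the actual script would need every .cuda() call in train(), test() and main() replaced in the same way):

    import torch

    # pick the GPU when available, otherwise fall back to the CPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = get_model(73).to(device)
    criterion = torch.nn.BCEWithLogitsLoss().to(device)

    # inside the training/validation loops, batches and targets move the same way
    logits = model(batch.to(device))[0]
    loss = criterion(logits, target.to(device))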

View File

@ -1,7 +1,11 @@
 import pandas as pd
 import numpy as np
-df = pd.read_csv("/home/andreapdr/funneling_pdr/src/results/results.csv", delimiter='\t')
-pivot = pd.pivot_table(df, values=['time', 'macrof1', 'microf1', 'macrok', 'microk'], index=['method', 'embed'], aggfunc=[np.mean, np.std])
-print(pivot)
+# df = pd.read_csv("/home/andreapdr/funneling_pdr/src/results/final_results.csv", delimiter='\t')
+df = pd.read_csv("10run_rcv_final_results.csv", delimiter='\t')
+pivot = pd.pivot_table(df, values=['macrof1', 'microf1', 'macrok', 'microk'], index=['method', 'id', 'optimp', 'zscore', 'l2', 'wescaler', 'pca', 'sif'], aggfunc=[np.mean, np.std])
+with pd.option_context('display.max_rows', None):
+    print(pivot.round(3))
 print('Finished ...')

src/run_mbert_rcv.sh Normal file
View File

@ -0,0 +1,11 @@
#!/usr/bin/env bash
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
logfile=../log/log_Mbert_rcv.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
  dataset=$dataset_path$run.pickle
  python new_mbert.py --dataset $dataset --log-file $logfile --test-each 20
done

View File

@ -17,7 +17,7 @@ def get_weighted_average(We, x, w):
 def compute_pc(X,npc=1):
     """
-    Compute the principal components. DO NOT MAKE THE DATA ZERO MEAN!
+    Compute the principal components.
     :param X: X[i,:] is a data point
     :param npc: number of principal components to remove
     :return: component_[i,:] is the i-th pc

View File

@ -1,4 +1,5 @@
 import warnings
+import time
 from sklearn.svm import SVC
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.model_selection import train_test_split
@ -143,6 +144,15 @@ class Index:
             embedding_parts.append(F)

+        make_dumps = False
+        if make_dumps:
+            print(f'Dumping Embedding Matrices ...')
+            import pickle
+            with open(f'../dumps/dump_{self.lang}_rcv.pkl', 'wb') as outfile:
+                pickle.dump((self.lang, embedding_parts, self.word2index), outfile)
+            with open(f'../dumps/corpus_{self.lang}_rcv.pkl', 'wb') as outfile2:
+                pickle.dump((self.lang, self.devel_raw, self.devel_target), outfile2)
+
         self.embedding_matrix = torch.cat(embedding_parts, dim=1)
         print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]')
@ -155,6 +165,7 @@ class MultilingualIndex:
     def __init__(self):  #, add_language_trace=False):
         self.l_index = {}
         self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
+        # self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True, max_features=25000)
         # self.add_language_trace=add_language_trace

     def index(self, l_devel_raw, l_devel_target, l_test_raw, l_pretrained_vocabulary):
@ -189,30 +200,42 @@ class MultilingualIndex:
         # pretrained_embeddings = torch.cat([pretrained_embeddings, lang_trace], dim=1)

-    def posterior_probabilities(self, max_training_docs_by_lang=5000):
+    def posterior_probabilities(self, max_training_docs_by_lang=5000, store_posteriors=False, stored_post=False):
         # choose a maximum of "max_training_docs_by_lang" for training the calibrated SVMs
+        timeit = time.time()
         lXtr = {l:Xtr for l,Xtr in self.get_lXtr().items()}
         lYtr = {l:Ytr for l,Ytr in self.l_train_target().items()}
-        for l in self.langs:
-            n_elements = lXtr[l].shape[0]
-            if n_elements > max_training_docs_by_lang:
-                choice = np.random.permutation(n_elements)[:max_training_docs_by_lang]
-                lXtr[l] = lXtr[l][choice]
-                lYtr[l] = lYtr[l][choice]
+        if not stored_post:
+            for l in self.langs:
+                n_elements = lXtr[l].shape[0]
+                if n_elements > max_training_docs_by_lang:
+                    choice = np.random.permutation(n_elements)[:max_training_docs_by_lang]
+                    lXtr[l] = lXtr[l][choice]
+                    lYtr[l] = lYtr[l][choice]

         # train the posterior probabilities embedder
         print('[posteriors] training a calibrated SVM')
         learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto')
         prob_embedder = PosteriorProbabilitiesEmbedder(learner, l2=False)
         prob_embedder.fit(lXtr, lYtr)

         # transforms the training, validation, and test sets into posterior probabilities
         print('[posteriors] generating posterior probabilities')
         lPtr = prob_embedder.transform(self.get_lXtr())
         lPva = prob_embedder.transform(self.get_lXva())
         lPte = prob_embedder.transform(self.get_lXte())
+        # NB: Check splits indices !

-        print('[posteriors] done')
+        if store_posteriors:
+            import pickle
+            with open('../dumps/posteriors_fulljrc.pkl', 'wb') as outfile:
+                pickle.dump([lPtr, lPva, lPte], outfile)
+                print(f'Successfully dumped posteriors!')
+        else:
+            import pickle
+            with open('../dumps/posteriors_fulljrc.pkl', 'rb') as infile:
+                lPtr, lPva, lPte = pickle.load(infile)
+                print(f'Successfully loaded stored posteriors!')
+        print(f'[posteriors] done in {time.time() - timeit}')
         return lPtr, lPva, lPte

     def get_lXtr(self):
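
For reference, a minimal usage sketch of the new caching flags (hypothetical call site; mindex stands for an already fitted MultilingualIndex instance):

    # first run: compute the posteriors and dump them to ../dumps/posteriors_fulljrc.pkl
    lPtr, lPva, lPte = mindex.posterior_probabilities(store_posteriors=True)

    # later runs: reload the stored posteriors instead of recomputing them
    lPtr, lPva, lPte = mindex.posterior_probabilities(stored_post=True)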

View File

@ -6,7 +6,7 @@ from util.file import create_if_not_exist
 class EarlyStopping:
-    def __init__(self, model, patience=20, verbose=True, checkpoint='./checkpoint.pt'):
+    def __init__(self, model, optimizer, patience=20, verbose=True, checkpoint='./checkpoint.pt'):
         # set patience to 0 or -1 to avoid stopping, but still keeping track of the best value and model parameters
         self.patience_limit = patience
         self.patience = patience
@ -16,9 +16,10 @@ class EarlyStopping:
         self.stop_time = None
         self.checkpoint = checkpoint
         self.model = model
+        self.optimizer = optimizer
         self.STOP = False

-    def __call__(self, watch_score, epoch):
+    def __call__(self, watch_score, epoch):  #model
         if self.STOP: return  #done
@ -29,6 +30,9 @@ class EarlyStopping:
             if self.checkpoint:
                 self.print(f'[early-stop] improved, saving model in {self.checkpoint}')
                 torch.save(self.model, self.checkpoint)
+                # with open(self.checkpoint)
+                # torch.save({'state_dict': self.model.state_dict(),
+                #             'optimizer_state_dict': self.optimizer.state_dict()}, self.checkpoint)
             else:
                 self.print(f'[early-stop] improved')
             self.patience = self.patience_limit
@ -46,6 +50,7 @@ class EarlyStopping:
             self.patience=self.patience_limit

     def restore_checkpoint(self):
+        print(f'restoring best model from epoch {self.best_epoch}...')
         return torch.load(self.checkpoint)

     def print(self, msg):
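
The commented-out lines above hint at a state_dict-based checkpoint. A minimal sketch of that variant, assuming the model and optimizer objects are re-created before restoring (the class currently saves and reloads the whole model object instead):

    # save both model and optimizer state so training can be resumed
    torch.save({'state_dict': self.model.state_dict(),
                'optimizer_state_dict': self.optimizer.state_dict()}, self.checkpoint)

    # restore: load the states back into freshly built objects
    ckpt = torch.load(self.checkpoint)
    self.model.load_state_dict(ckpt['state_dict'])
    self.optimizer.load_state_dict(ckpt['optimizer_state_dict'])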

View File

@ -5,8 +5,23 @@ import numpy as np
 class PolylingualClassificationResults:
     def __init__(self, file, autoflush=True, verbose=False):
         self.file = file
-        self.columns = ['method', 'learner', 'embed', 'embed_type', 'pca_s', 'pca_u', 'optimp', 'dataset', 'time',
-                        'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
+        self.columns = ['method',
+                        'learner',
+                        'optimp',
+                        'sif',
+                        'zscore',
+                        'l2',
+                        'wescaler',
+                        'pca',
+                        'id',
+                        'dataset',
+                        'time',
+                        'lang',
+                        'macrof1',
+                        'microf1',
+                        'macrok',
+                        'microk',
+                        'notes']
         self.autoflush = autoflush
         self.verbose = verbose
         if os.path.exists(file):
@ -21,8 +36,8 @@ class PolylingualClassificationResults:
     def already_calculated(self, id):
         return (self.df['id'] == id).any()

-    def add_row(self, method, learner, embed, embed_type, pca_s, pca_u, optimp, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
-        s = pd.Series([method, learner, embed, embed_type, pca_s, pca_u, optimp, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
+    def add_row(self, method, learner, optimp, sif, zscore, l2, wescaler, pca, id, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
+        s = pd.Series([method, learner, optimp, sif, zscore, l2, wescaler, pca, id, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
         self.df = self.df.append(s, ignore_index=True)
         if self.autoflush: self.flush()
         self.tell(s.to_string())