baseline multilingual BERT

This commit is contained in:
andrea 2020-07-27 11:56:09 +02:00
parent 22b7ea7e66
commit d1fdad5f6e
37 changed files with 1212 additions and 1112 deletions

View File

@@ -1,10 +1,7 @@
import os
import pickle
from torchtext.vocab import Vectors
import torch
from abc import ABC, abstractmethod
from embeddings.supervised import get_supervised_embeddings
from util.decompositions import *
from util.SIF_embed import *
@@ -35,122 +32,10 @@ class PretrainedEmbeddings(ABC):
return source_idx, target_idx
class WordEmbeddings:
def __init__(self, lang, we, worddim):
self.lang = lang
self.we = we
self.worddim = worddim
self.dimword = {v:k for k,v in self.worddim.items()}
@classmethod
def load(cls, basedir, lang, word_preprocessor=None, dopickle=True):
filename = 'wiki.multi.{}.vec'.format(lang)
we_path = os.path.join(basedir, filename)
if dopickle and os.path.exists(we_path + '.pkl'):
print('loading pkl in {}'.format(we_path + '.pkl'))
(worddim, we) = pickle.load(open(we_path + '.pkl', 'rb'))
else:
word_registry = set()
lines = open(we_path).readlines()
nwords, dims = [int(x) for x in lines[0].split()]
print('reading we of {} dimensions'.format(dims))
we = np.zeros((nwords, dims), dtype=float)
worddim = {}
index = 0
for i, line in enumerate(lines[1:]):
if (i + 1) % 100 == 0:
print('\r{}/{}'.format(i + 1, len(lines)), end='')
word, *vals = line.split()
wordp = word_preprocessor(word) if word_preprocessor is not None else word
if wordp:
wordp = wordp[0]
if wordp in word_registry:
print('warning: word <{}> generates a duplicate <{}> after preprocessing'.format(word,wordp))
elif len(vals) == dims:
worddim[wordp] = index
we[index, :] = np.array(vals).astype(float)
index += 1
word_registry.add(wordp)  # register the word so the duplicate check above can detect repeats
# else:
# print('warning: word <{}> generates an empty string after preprocessing'.format(word))
we = we[:index]
print('load {} words'.format(index))
if dopickle:
print('saving...')
pickle.dump((worddim, we), open(we_path + '.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
return WordEmbeddings(lang, we, worddim)
def vocabulary(self):
return set(self.worddim.keys())
def __getitem__(self, key):
return self.we[self.worddim[key]]
def dim(self):
return self.we.shape[1]
def __contains__(self, key):
return key in self.worddim
def most_similar(self, word_vect, k):
if word_vect.ndim == 1:
word_vect = word_vect.reshape(1,-1)
assert word_vect.shape[1] == self.dim(), 'inconsistent dimensions'
sim = np.dot(word_vect,self.we.T)
order = np.argsort(-1*sim, axis=1)[:,:k]
similar_words = [[self.dimword[order[vi,ki]] for ki in range(k)] for vi in range(word_vect.shape[0])]
sim_scores = np.take_along_axis(sim, order, axis=1)  # top-k scores aligned with similar_words
return similar_words, sim_scores
def get_vectors(self, wordlist):
indexes = np.array([self.worddim[w] for w in wordlist])
return self.we[indexes]
def restrict(self, vocabulary):
# vocabulary is a set of terms to be kept
active_vocabulary = sorted([w for w in vocabulary if w in self.worddim])
lost = len(vocabulary)-len(active_vocabulary)
if lost > 0: # some terms are missing, so they will be replaced by UNK
print('warning: missing {} terms for lang {}'.format(lost, self.lang))
self.we = self.get_vectors(active_vocabulary)
assert self.we.shape[0] == len(active_vocabulary)
self.dimword={i:w for i,w in enumerate(active_vocabulary)}
self.worddim={w:i for i,w in enumerate(active_vocabulary)}
return self
@classmethod
def load_poly(cls, basedir, langs, lang_vocabularies=None, word_preprocessor=None):
if lang_vocabularies is None:
return cls.merge([cls.load(basedir,lang, word_preprocessor) for lang in langs])
else:
# assert all([l in lang_vocabularies for l in langs]), 'missing vocabulary for some languages'
return cls.merge([cls.load(basedir, lang, word_preprocessor).restrict(lang_vocabularies[lang]) for lang in langs])
@classmethod
def merge(cls, we_list):
assert all([isinstance(we, WordEmbeddings) for we in we_list]), \
'instances of {} expected'.format(WordEmbeddings.__name__)
polywe = []
worddim = {}
offset = 0
for we in we_list:
polywe.append(we.we)
worddim.update({'{}::{}'.format(we.lang, w):d+offset for w,d in we.worddim.items()})
offset = len(worddim)
polywe = np.vstack(polywe)
return WordEmbeddings(lang='poly', we=polywe, worddim=worddim)
class FastTextWikiNews(Vectors):
url_base = "Can't auto-download MUSE embeddings"
# path = '/storage/andrea/FUNNELING/embeddings/wiki.multi.{}.vec'  # absolute path, overridden by the relative one below
path = '../embeddings/wiki.multi.{}.vec'
_name = '/wiki.multi.{}.vec'
def __init__(self, cache, language="en", **kwargs):
@@ -159,42 +44,13 @@ class FastTextWikiNews(Vectors):
super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)
class EmbeddingsAligned(Vectors):
def __init__(self, type, path, lang, voc):
# todo - rewrite as relative path
self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec'
self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT'
self.path = path + self.name.format(lang)
assert os.path.exists(path), f'pre-trained vectors not found in {path}'
super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path)
self.vectors = self.extract(voc)
def vocabulary(self):
return set(self.stoi.keys())
def extract(self, words):
source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi)
extraction = torch.zeros((len(words), self.dim))
extraction[source_idx] = self.vectors[target_idx]
return extraction
def reduce(self, dim):
pca = PCA(n_components=dim)
self.vectors = pca.fit_transform(self.vectors)
return
class FastTextMUSE(PretrainedEmbeddings):
def __init__(self, path, lang, limit=None):
super().__init__()
print(f'Loading fastText pretrained vectors for language {lang} from {path}')
assert os.path.exists(path), f'pre-trained vectors not found in {path}'
self.embed = FastTextWikiNews(path, lang, max_vectors=limit)
def vocabulary(self):
return set(self.embed.stoi.keys())
@@ -204,114 +60,8 @@ class FastTextMUSE(PretrainedEmbeddings):
def extract(self, words):
source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
extraction = torch.zeros((len(words), self.dim()))
# extraction = torch.empty(len(words), self.dim()).normal_(0, 1)
extraction[source_idx] = self.embed.vectors[target_idx]
return extraction
class StorageEmbeddings:
def __init__(self, path):
self.path = path
self.lang_U = dict()
self.lang_S = dict()
def _add_embeddings_unsupervised(self, type, docs, vocs, max_label_space=300):
for lang in docs.keys():
print(f'# [unsupervised-matrix {type}] for {lang}')
voc = np.asarray(list(zip(*sorted(vocs[lang].items(), key=lambda x: x[1])))[0])
self.lang_U[lang] = EmbeddingsAligned(type, self.path, lang, voc).vectors
print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n')
nC = self.lang_U[lang].shape[1]
if max_label_space == 0:
print(f'Computing optimal number of PCA components along matrices U')
optimal_n = get_optimal_dim(self.lang_U, 'U')
self.lang_U = run_pca(optimal_n, self.lang_U)
elif max_label_space < nC:
print(f'Applying PCA to unsupervised matrix U')
self.lang_U = run_pca(max_label_space, self.lang_U)
return
def _add_embeddings_supervised(self, docs, labels, reduction, max_label_space, voc):
only_well_represented_C = False # TODO testing
if only_well_represented_C:
labels = labels.copy()
min_prevalence = 0
print(f'# REDUCING LABELS TO min_prevalence = {min_prevalence} in order to compute WCE Matrix ...')
langs = list(docs.keys())
well_repr_cats = np.logical_and.reduce([labels[lang].sum(axis=0)>min_prevalence for lang in langs])
for lang in langs:
labels[lang] = labels[lang][:, well_repr_cats]
print(f'Target number reduced to: {labels[lang].shape[1]}\n')
for lang in docs.keys(): # compute supervised matrices S - then apply PCA
print(f'# [supervised-matrix] for {lang}')
self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang],
reduction, max_label_space, voc[lang], lang)
nC = self.lang_S[lang].shape[1]
print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')
if max_label_space == 0: # looking for best n_components analyzing explained_variance_ratio
print(f'Computing optimal number of PCA components along matrices S')
optimal_n = get_optimal_dim(self.lang_S, 'S')
print(f'Applying PCA(n_components={optimal_n})')
self.lang_S = run_pca(optimal_n, self.lang_S)
elif max_label_space == -1: # applying pca to the verticals stacked matrix of WCE embeddings
print(f'Computing PCA on vertical stacked WCE embeddings')
languages = self.lang_S.keys()
_temp_stack = np.vstack([self.lang_S[lang] for lang in languages]) # stacking WCE vertically
stacked_pca = PCA(n_components=_temp_stack.shape[1])
stacked_pca.fit(_temp_stack)
best_n = None
_r = stacked_pca.explained_variance_ratio_
_r = np.cumsum(_r)
plt.plot(_r, label='Stacked Supervised')
for i in range(len(_r) - 1, 1, -1):
delta = _r[i] - _r[i - 1]
if delta > 0:
best_n = i
break
plt.show()
stacked_pca = PCA(n_components=best_n)
stacked_pca.fit(_temp_stack)
print(f'Applying PCA(n_components={best_n})')
for lang in languages:
self.lang_S[lang] = stacked_pca.transform(self.lang_S[lang])
elif max_label_space <= nC: # less or equal in order to reduce it to the same initial dimension
print(f'Computing PCA on Supervised Matrix PCA(n_components:{max_label_space})')
self.lang_S = run_pca(max_label_space, self.lang_S)
return
def SIF_embeddings(self):
print('todo') # TODO
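# SIF (smooth inverse frequency; Arora et al., 2017) is the weighting scheme that the
# SIF_embed import above presumably provides (an assumption). A minimal sketch of the
# idea, with hypothetical names, for illustration only:
#
#   a = 1e-3
#   sif_weight = {w: a / (a + p_w[w]) for w in p_w}   # p_w: unigram probability of w
#   doc_vec = np.mean([sif_weight[w] * we[w] for w in doc_tokens if w in we], axis=0)
#   # Arora et al. additionally remove the projection of doc_vec onto the first
#   # principal component of all document vectors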
def _concatenate_embeddings(self, docs):
_r = dict()
for lang in self.lang_U.keys():
_r[lang] = np.hstack((docs[lang].dot(self.lang_U[lang]), docs[lang].dot(self.lang_S[lang])))
return _r
def fit(self, config, docs, vocs, labels):
if config['unsupervised']:
self._add_embeddings_unsupervised(config['we_type'], docs, vocs, config['dim_reduction_unsupervised'])
if config['supervised']:
self._add_embeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs)
return self
def predict(self, config, docs):
if config['supervised'] and config['unsupervised']:
return self._concatenate_embeddings(docs)
# todo testing applying pca to hstack muse + wce
# _reduced = self._concatenate_embeddings(docs)
# return run_pca(300, _reduced)
elif config['supervised']:
_r = dict()
for lang in docs.keys():
_r[lang] = docs[lang].dot(self.lang_S[lang])
else:
_r = dict()
for lang in docs.keys():
_r[lang] = docs[lang].dot(self.lang_U[lang])
return _r
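# A hedged usage sketch of StorageEmbeddings (the values below are illustrative; the
# config keys are the ones read by fit() and predict() above, and docs/vocs/labels are
# the per-language dictionaries passed in by the callers):
#
#   config = {'we_type': 'MUSE', 'unsupervised': True, 'supervised': True,
#             'dim_reduction_unsupervised': 300, 'reduction': 'PCA', 'max_label_space': 300}
#   storage = StorageEmbeddings(we_path).fit(config, lX, lVocab, ly)
#   lZ = storage.predict(config, lX)   # {lang: hstack(X.dot(U), X.dot(S))}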

View File

@@ -1,103 +1,102 @@
from abc import ABC, abstractmethod
import torch, torchtext
import gensim
import os
# import gensim
# import os
import numpy as np
class KeyedVectors:
def __init__(self, word2index, weights):
assert len(word2index)==weights.shape[0], 'wrong number of dimensions'
index2word = {i:w for w,i in word2index.items()}
assert len([i for i in range(len(index2word)) if i not in index2word])==0, 'gaps in indexing not allowed'
self.word2index = word2index
self.index2word = index2word
self.weights = weights
def extract(self, words):
dim = self.weights.shape[1]
v_size = len(words)
source_idx, target_idx = [], []
for i,word in enumerate(words):
if word not in self.word2index: continue
j = self.word2index[word]
source_idx.append(i)
target_idx.append(j)
extraction = np.zeros((v_size, dim))
extraction[np.asarray(source_idx)] = self.weights[np.asarray(target_idx)]
return extraction
# class KeyedVectors:
#
# def __init__(self, word2index, weights):
# assert len(word2index)==weights.shape[0], 'wrong number of dimensions'
# index2word = {i:w for w,i in word2index.items()}
# assert len([i for i in range(len(index2word)) if i not in index2word])==0, 'gaps in indexing not allowed'
# self.word2index = word2index
# self.index2word = index2word
# self.weights = weights
#
# def extract(self, words):
# dim = self.weights.shape[1]
# v_size = len(words)
#
# source_idx, target_idx = [], []
# for i,word in enumerate(words):
# if word not in self.word2index: continue
# j = self.word2index[word]
# source_idx.append(i)
# target_idx.append(j)
#
# extraction = np.zeros((v_size, dim))
# extraction[np.asarray(source_idx)] = self.weights[np.asarray(target_idx)]
#
# return extraction
class PretrainedEmbeddings(ABC):
def __init__(self):
super().__init__()
@abstractmethod
def vocabulary(self): pass
@abstractmethod
def dim(self): pass
@classmethod
def reindex(cls, words, word2index):
source_idx, target_idx = [], []
for i, word in enumerate(words):
if word not in word2index: continue
j = word2index[word]
source_idx.append(i)
target_idx.append(j)
source_idx = np.asarray(source_idx)
target_idx = np.asarray(target_idx)
return source_idx, target_idx
# class PretrainedEmbeddings(ABC):
#
# def __init__(self):
# super().__init__()
#
# @abstractmethod
# def vocabulary(self): pass
#
# @abstractmethod
# def dim(self): pass
#
# @classmethod
# def reindex(cls, words, word2index):
# source_idx, target_idx = [], []
# for i, word in enumerate(words):
# if word not in word2index: continue
# j = word2index[word]
# source_idx.append(i)
# target_idx.append(j)
# source_idx = np.asarray(source_idx)
# target_idx = np.asarray(target_idx)
# return source_idx, target_idx
class GloVe(PretrainedEmbeddings):
def __init__(self, setname='840B', path='./vectors_cache', max_vectors=None):
super().__init__()
print(f'Loading GloVe pretrained vectors from torchtext')
self.embed = torchtext.vocab.GloVe(setname, cache=path, max_vectors=max_vectors)
print('Done')
def vocabulary(self):
return set(self.embed.stoi.keys())
def dim(self):
return self.embed.dim
def extract(self, words):
source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
extraction = torch.zeros((len(words), self.dim()))
extraction[source_idx] = self.embed.vectors[target_idx]
return extraction
# class GloVe(PretrainedEmbeddings):
#
# def __init__(self, setname='840B', path='./vectors_cache', max_vectors=None):
# super().__init__()
# print(f'Loading GloVe pretrained vectors from torchtext')
# self.embed = torchtext.vocab.GloVe(setname, cache=path, max_vectors=max_vectors)
# print('Done')
#
# def vocabulary(self):
# return set(self.embed.stoi.keys())
#
# def dim(self):
# return self.embed.dim
#
# def extract(self, words):
# source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
# extraction = torch.zeros((len(words), self.dim()))
# extraction[source_idx] = self.embed.vectors[target_idx]
# return extraction
class Word2Vec(PretrainedEmbeddings):
def __init__(self, path, limit=None):
super().__init__()
print(f'Loading word2vec pretrained vectors from {path}')
assert os.path.exists(path), f'pre-trained keyed vectors not found in {path}'
self.embed = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True, limit=limit)
self.word2index={w:i for i,w in enumerate(self.embed.index2word)}
print('Done')
def vocabulary(self):
return set(self.word2index.keys())
def dim(self):
return self.embed.vector_size
def extract(self, words):
source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.word2index)
extraction = np.zeros((len(words), self.dim()))
extraction[source_idx] = self.embed.vectors[target_idx]
extraction = torch.from_numpy(extraction).float()
return extraction
# class Word2Vec(PretrainedEmbeddings):
#
# def __init__(self, path, limit=None):
# super().__init__()
# print(f'Loading word2vec pretrained vectors from {path}')
# assert os.path.exists(path), print(f'pre-trained keyed vectors not found in {path}')
# self.embed = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True, limit=limit)
# self.word2index={w:i for i,w in enumerate(self.embed.index2word)}
# print('Done')
#
# def vocabulary(self):
# return set(self.word2index.keys())
#
# def dim(self):
# return self.embed.vector_size
#
# def extract(self, words):
# source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.word2index)
# extraction = np.zeros((len(words), self.dim()))
# extraction[source_idx] = self.embed.vectors[target_idx]
# extraction = torch.from_numpy(extraction).float()
# return extraction

View File

@@ -1,7 +1,5 @@
from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square
import numpy as np
# from sklearn.decomposition import PCA
# from sklearn.manifold import TSNE
def zscores(x, axis=0): #scipy.stats.zscore does not avoid division by 0, which can indeed occur
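# (The body of zscores is not shown in this hunk; the following is only a sketch of a
# division-safe z-scoring, i.e. an assumption about its intent, not the actual code:)
#
#   mean = np.mean(x, axis=axis, keepdims=True)
#   std = np.clip(np.std(x, ddof=1, axis=axis, keepdims=True), 1e-8, None)  # avoid division by 0
#   return (x - mean) / std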
@@ -69,31 +67,6 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, la
return F
# if nC >= max_label_space:
# if reduction == 'PCA':
# if max_label_space == 0:
# pca = PCA(n_components=Y.shape[1])
# pca = pca.fit(F)
# return pca.explained_variance_ratio_
#
# print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
# f'Applying PCA(n_components={max_label_space})')
# pca = PCA(n_components=max_label_space)
# F = pca.fit_transform(F)
# elif reduction == 'TSNE':
# print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
# f'Applying t-SNE(n_components={max_label_space})')
# tsne = TSNE(n_components=max_label_space)
# F = tsne.fit_transform(F)
# elif reduction == 'tSVD':
# print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
# f'Applying truncatedSVD(n_components={max_label_space})')
# tSVD = TruncatedSVD(n_components=max_label_space)
# F = tSVD.fit_transform(F)
#
# return F

View File

@@ -0,0 +1,11 @@
#!/usr/bin/env bash
dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
logfile=../log/log10run_dl_jrc.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
done

View File

@@ -0,0 +1,11 @@
#!/usr/bin/env bash
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
logfile=../log/log10run_dl_rcv.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
done

View File

@@ -0,0 +1,12 @@
dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
logfile=./results/10run_jrc_final_results.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
python main_multimodal_cls.py $dataset -o $logfile -P -z -c --l2
python main_multimodal_cls.py $dataset -o $logfile -S -z -c --l2
python main_multimodal_cls.py $dataset -o $logfile -U -z -c --l2
done

View File

@@ -0,0 +1,16 @@
dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
logfile=./results/funnelling_10run_jrc_CIKM.csv
runs='6 7 8 9' #0 1 2 3 4 5
for run in $runs
do
dataset=$dataset_path$run.pickle
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 Pr(views) concatenated (done up to run5)
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 views concatenated
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -a -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -a -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -S -c -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -c -P -U -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -c -P -U -S -r -z --l2
done

View File

@@ -0,0 +1,15 @@
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
logfile=./results/10run_rcv_final_results.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
python main_multimodal_cls.py $dataset -o $logfile -P -z -c --l2
python main_multimodal_cls.py $dataset -o $logfile -S -z -c --l2
python main_multimodal_cls.py $dataset -o $logfile -U -z -c --l2
done

View File

@@ -0,0 +1,16 @@
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
logfile=./results/funnelling_10run_rcv_CIKM_allprob_concatenated.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 Pr(views) concatenated
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 views concatenated
#python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -a -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -a -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -S -c -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -c -P -U -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -c -P -U -S -r -z --l2
done

View File

@@ -0,0 +1,34 @@
#!/usr/bin/env bash
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
logfile=./results/final_combinations_jrc.csv
#A.2: ensembling feature sets (combinations of posteriors, wce, muse):
# - exploring different ways of putting different feature sets together: concatenation, FeatureSetToPosteriors, averaging, voting, etc...
# (none seems to improve over standard funnelling [the improved version after A.1] with posterior probabilities...)
# aggregation=concatenation
#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2
#
##FeatureSetToPosteriors (aggregation mean)
python main_multimodal_cls.py $dataset -o $logfile -P -U -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -P -S -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -U -S -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -a -z --l2 --allprob
##FeatureSetToPosteriors
#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2 --allprob
#MajorityVoting
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -r
#python main_majorityvoting_cls.py $dataset -o $logfile -P -S -r
#python main_majorityvoting_cls.py $dataset -o $logfile -U -S -r
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -S -r

View File

@@ -0,0 +1,31 @@
#!/usr/bin/env bash
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
logfile=./results/final_combinations_rcv.csv
#A.2: ensembling feature sets (combinations of posteriors, wce, muse):
# - exploring different ways of putting different feature sets together: concatenation, FeatureSetToPosteriors, averaging, voting, etc...
# (none seems to improve over standard funnelling [the improved version after A.1] with posterior probabilities...)
# aggregation=concatenation
#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2
#
##FeatureSetToPosteriors (aggregation mean)
python main_multimodal_cls.py $dataset -o $logfile -P -U -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -P -S -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -U -S -r -a -z --l2 --allprob
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -a -z --l2 --allprob
##FeatureSetToPosteriors
#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2 --allprob
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2 --allprob
#MajorityVoting
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -r
#python main_majorityvoting_cls.py $dataset -o $logfile -P -S -r
#python main_majorityvoting_cls.py $dataset -o $logfile -U -S -r
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -S -r

View File

@@ -0,0 +1,31 @@
#!/usr/bin/env bash
logfile=../log/log_pre_jrc.csv
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
python main_deep_learning.py $dataset --log-file $logfile --pretrained --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 256 --tunable --plotmode --test-each 20

View File

@@ -0,0 +1,30 @@
#!/usr/bin/env bash
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
python main_deep_learning.py $dataset --pretrained --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --supervised --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --hidden 256 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 128 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 128 --tunable --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 256 --plotmode --test-each 20
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 256 --tunable --plotmode --test-each 20

View File

@@ -0,0 +1,16 @@
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle
seeds='5' #2 3 4 5 6 7 8 9 10'
for seed in $seeds
do
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce.csv --supervised --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_trainable.csv --supervised --tunable --seed $seed
python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_post_wce_muse_static.csv --posteriors --supervised --pretrained --seed $seed --force
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_muse.csv --pretrained --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_muse_trainable.csv --pretrained --tunable --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_muse.csv --supervised --pretrained --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_muse_trainable40000.csv --supervised --pretrained --tunable --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_post_wce_muse_trainable.csv --posteriors --supervised --pretrained --tunable --seed $seed --force
done

View File

@@ -0,0 +1,20 @@
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
seeds='1 ' #2 3 4 5' # 6 7 8 9 10'
for seed in $seeds
do
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce.csv --supervised --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_trainable.csv --supervised --tunable --seed $seed
python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_static_plotmode.csv --posteriors --supervised --pretrained --seed $seed --plotmode --test-each 200
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_muse.csv --pretrained --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_muse_trainable.csv --pretrained --tunable --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_muse.csv --supervised --pretrained --seed $seed
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_muse_trainable.csv --supervised --pretrained --tunable --seed $seed
# python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_static.csv --posteriors --supervised --pretrained --seed $seed
# python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_trainable_plotmode.csv --posteriors --supervised --pretrained --tunable --seed $seed --plotmode --test-each 200
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_trainable.csv --posteriors --supervised --pretrained --tunable --seed $seed
done

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env bash
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
######################################## POSTERIORS
# Posteriors
python main_multimodal_cls.py $dataset -P # + zscore
python main_multimodal_cls.py $dataset -P -z # +l2norm
python main_multimodal_cls.py $dataset -P -z --l2 # +feature weight
######################################### WCE
#WCE supervised
python main_multimodal_cls.py $dataset -S # + zscore
python main_multimodal_cls.py $dataset -S -z # +l2norm
python main_multimodal_cls.py $dataset -S -z --l2 # +feature weight
python main_multimodal_cls.py $dataset -S -z -r --l2 # + SIF - PCA
python main_multimodal_cls.py $dataset -S -z -p 250 --l2 # +feature weight + pca
python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 # + SIF
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight ig # -feature weight
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -S -z -p 250 --l2 --feat-weight ig # + pca
python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -p 250 --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 --feat-weight pmi
################################# MUSE
# MUSE unsupervised
python main_multimodal_cls.py $dataset -U # + zscore
python main_multimodal_cls.py $dataset -U -z # +l2norm
python main_multimodal_cls.py $dataset -U -z --l2 # +feature weight
python main_multimodal_cls.py $dataset -U -z -r --l2 # + SIF - PCA
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight ig # -feature weight + pca
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight pmi

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env bash
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
######################################## POSTERIORS
# Posteriors
python main_multimodal_cls.py $dataset -P # + zscore
python main_multimodal_cls.py $dataset -P -z # +l2norm
python main_multimodal_cls.py $dataset -P -z --l2 # +feature weight
######################################### WCE
#WCE supervised
python main_multimodal_cls.py $dataset -S # + zscore
python main_multimodal_cls.py $dataset -S -z # +l2norm
python main_multimodal_cls.py $dataset -S -z --l2 # +feature weight
python main_multimodal_cls.py $dataset -S -z -r --l2 # + SIF - PCA
python main_multimodal_cls.py $dataset -S -z -p 50 --l2 # +feature weight + pca
python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 # + SIF
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight ig # -feature weight
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -S -z -p 50 --l2 --feat-weight ig # + pca
python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -p 50 --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 --feat-weight pmi
################################# MUSE
# MUSE unsupervised
python main_multimodal_cls.py $dataset -U # + zscore
python main_multimodal_cls.py $dataset -U -z # +l2norm
python main_multimodal_cls.py $dataset -U -z --l2 # +feature weight
python main_multimodal_cls.py $dataset -U -z -r --l2 # + SIF - PCA
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight ig # -feature weight + pca
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight ig
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight pmi
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight pmi

View File

@@ -0,0 +1,6 @@
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
seeds='1 2 3 4 5 6 7 8 9 10'
for seed in $seeds
do
python main_deep_learning.py $dataset --log-file ../log/time_GRU.csv --supervised --nepochs 50 --seed $seed
done

View File

@@ -1,15 +1,15 @@
import numpy as np
import time
from embeddings.embeddings import WordEmbeddings, StorageEmbeddings
# from embeddings.embeddings import WordEmbeddings, StorageEmbeddings
from scipy.sparse import issparse
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
# from sklearn.model_selection import KFold
from joblib import Parallel, delayed
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers.StandardizeTransformer import StandardizeTransformer
from sklearn.decomposition import PCA
from models.cnn_class_bu import CNN_pdr
# from sklearn.feature_extraction.text import TfidfVectorizer
# from util_transformers.StandardizeTransformer import StandardizeTransformer
# from sklearn.decomposition import PCA
# from models.cnn_class_bu import CNN_pdr
def _sort_if_sparse(X):
@@ -40,154 +40,154 @@
def best_params(self): return {}
class FunnellingPolylingualClassifier:
"""
This classifier projects each document d into a language-independent feature space where each dimension fi is the
decision score phi_l(d,ci) of an auxiliary classifier phi_l trained on category ci for documents in language l;
then trains one single classifier for all documents in this space, irrespective of their original language
"""
def __init__(self, first_tier_learner, meta_learner, first_tier_parameters=None, meta_parameters=None, folded_projections=1,
calmode='cal', n_jobs=-1):
"""
:param first_tier_learner: the learner used in the first-tier level
:param meta_learner: the learner used in the second-tier level
:param first_tier_parameters: parameters for the learner in the doc_projector
:param meta_parameters: parameters for the learner in the z-space
:param folded_projections: if 1, the model trains the auxiliary classifiers with all training data and
projects the data before training the final classifier; if greater than one, the training set is split in as
many folds as indicated, and the projected space is composed by concatenating each fold's predictions based on
models trained on the remaining folds. This should increase the generality of the space to unseen data.
:param calmode: 'cal' to calibrate the base classifiers, 'nocal' to use the decision_function instead, or
'sigmoid' to use the sigmoid of the decision_function
:param n_jobs: number of parallel threads
"""
assert folded_projections>0, "positive number of folds expected"
assert calmode in ['cal','nocal','sigmoid'], 'unknown calmode'
assert calmode!='cal' or first_tier_learner.probability, 'calmode=cal requires the learner to have probability=True'
self.fist_tier_learner = first_tier_learner
self.meta_learner = meta_learner
self.fist_tier_parameters=first_tier_parameters
self.meta_parameters = meta_parameters
self.doc_projector = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs)
self.doc_projector_bu = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs)
self.folded_projections = folded_projections
self.n_jobs = n_jobs
self.calmode = calmode
def _projection(self, doc_projector, lX):
"""
Decides the projection function to be applied; predict_proba if the base classifiers are calibrated or
decision_function if otherwise
:param doc_projector: the document projector (a NaivePolylingualClassifier)
:param lX: {lang:matrix} to train
:return: the projection, applied with predict_proba or decision_function
"""
if self.calmode=='cal':
return doc_projector.predict_proba(lX)
else:
l_decision_scores = doc_projector.decision_function(lX)
if self.calmode=='sigmoid':
def sigmoid(x): return 1 / (1 + np.exp(-x))
for lang in l_decision_scores.keys():
l_decision_scores[lang] = sigmoid(l_decision_scores[lang])
return l_decision_scores
def _get_zspace(self, lXtr, lYtr, lXproj=None, lYproj=None):
"""
Produces the vector space of posterior probabilities (in case the first-tier is calibrated) or of
decision scores (if otherwise). This space is here named zspace.
:param lXtr: {lang:matrix} to train
:param lYtr: {lang:labels} to train
:param lXproj: {lang:matrix} to project (if None, then projects the lXtr)
:param lYproj: {lang:labels} to stack in the same order (if None, then lYtr will be stacked)
:return: the projection of lXproj documents into the Z-space defined by the confidence scores of language-specific
models trained on lXtr, and the lYproj labels stacked consistently
"""
repair_empty_folds = True
if lXproj is None and lYproj is None:
lXproj, lYproj = lXtr, lYtr
repair_empty_folds = False
print('fitting the projectors... {}'.format(lXtr.keys()))
self.doc_projector.fit(lXtr, lYtr)
print('projecting the documents')
langs = list(lXtr.keys())
lZ = self._projection(self.doc_projector, lXproj)
# if repair_empty_folds: #empty folds are replaced by the posterior probabilities generated by the non-folded version
empty_categories = self.doc_projector.empty_categories
lZ_bu = self._projection(self.doc_projector_bu, lXproj)
for lang in langs:
repair = empty_categories[lang]
lZ[lang][:,repair] = lZ_bu[lang][:,repair]
Z = np.vstack([lZ[lang] for lang in langs]) # Z is the language independent space
zy = np.vstack([lYproj[lang] for lang in langs])
return Z, zy
def _get_zspace_folds(self, lX, ly):
self.doc_projector_bu.fit(lX, ly)
print('split of {} folds'.format(self.folded_projections))
skf = KFold(n_splits=self.folded_projections, shuffle=True)
Z, zy = [], []
lfold = {lang: list(skf.split(lX[lang], ly[lang])) for lang in lX.keys()}
for fold in range(self.folded_projections):
print('fitting the projectors ({}/{})...'.format(fold + 1, self.folded_projections))
lfoldXtr, lfoldYtr = {}, {}
lfoldXte, lfoldYte = {}, {}
for lang in lX.keys():
train, test = lfold[lang][fold]
lfoldXtr[lang] = lX[lang][train]
lfoldYtr[lang] = ly[lang][train]
lfoldXte[lang] = lX[lang][test]
lfoldYte[lang] = ly[lang][test]
Zfold, zYfold = self._get_zspace(lfoldXtr, lfoldYtr, lfoldXte, lfoldYte)
Z.append(Zfold)
zy.append(zYfold)
# compose the Z-space as the union of all folded predictions
Z = np.vstack(Z)
zy = np.vstack(zy)
# refit the document projector with all examples to have a more reliable projector for test data
self.doc_projector = self.doc_projector_bu
return Z, zy
def fit(self, lX, ly, lZ=None, lzy=None):
tinit = time.time()
Z, zy = self._get_zspace(lX, ly) if self.folded_projections == 1 else self._get_zspace_folds(lX, ly)
#experimental: adds the posterior probabilities (computed outside) to the meta-classifier
if lZ is not None and lzy is not None:
zlangs = list(lZ.keys())
Z = np.vstack((Z, *[lZ[l] for l in zlangs]))
zy = np.vstack((zy, *[lzy[l] for l in zlangs]))
print('fitting the Z-space of shape={}'.format(Z.shape))
self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters, n_jobs=self.n_jobs)
self.model.fit(Z, zy)
self.time = time.time() - tinit
return self
def predict(self, lX, lZ=None):
"""
:param lX: a dictionary {language_label: X csr-matrix}
:param lZ: a dictionary {language_label: Z matrix}; if specified, concats this representation
:return: a dictionary of predictions
"""
lZ_ = self._projection(self.doc_projector, lX)
if lZ is not None:
lZ_ = {**lZ_, **lZ}
return _joblib_transform_multiling(self.model.predict, lZ_, n_jobs=self.n_jobs)
def best_params(self):
params = self.doc_projector.best_params()
params['meta'] = self.model.best_params()
return params
# class FunnellingPolylingualClassifier:
# """
# This classifier projects each document d into a language-independent feature space where each dimension fi is the
# decision score phi_l(d,ci) of an auxiliary classifier phi_l trained on category ci for documents in language l;
# then trains one single classifier for all documents in this space, irrespective of their originary language
# """
# def __init__(self, first_tier_learner, meta_learner, first_tier_parameters=None, meta_parameters=None, folded_projections=1,
# calmode='cal', n_jobs=-1):
# """
# :param first_tier_learner: the learner used in the first-tier level
# :param meta_learner: the learner used in the second-tier level
# :param first_tier_parameters: parameters for the learner in the doc_projector
# :param meta_parameters: parameters for the learner in the z-space
# :param folded_projections: if 1 then the model trains the auxiliar classifiers with all training data and
# :param calmode: 'cal' to calibrate the base classifiers, 'nocal' to use the decision_function instead, or
# :param n_jobs: number of parallel threads
# 'sigmoid' to use the sigmoid of the decision_function
# projects the data before training the final classifier; if greater than one, the training set is split in as
# many folds as indicated, and the projected space is composed by concatenating each fold prediction based on
# models trained on the remaining folds. This should increase the generality of the space to unseen data.
# """
# assert folded_projections>0, "positive number of folds expected"
# assert calmode in ['cal','nocal','sigmoid'], 'unknown calmode'
# assert calmode!='cal' or first_tier_learner.probability, 'calmode=cal requires the learner to have probability=True'
#
# self.fist_tier_learner = first_tier_learner
# self.meta_learner = meta_learner
# self.fist_tier_parameters=first_tier_parameters
# self.meta_parameters = meta_parameters
# self.doc_projector = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs)
# self.doc_projector_bu = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs)
# self.folded_projections = folded_projections
# self.n_jobs = n_jobs
# self.calmode = calmode
#
# def _projection(self, doc_projector, lX):
# """
# Decides the projection function to be applied; predict_proba if the base classifiers are calibrated or
# decision_function if otherwise
# :param doc_projector: the document projector (a NaivePolylingualClassifier)
# :param lX: {lang:matrix} to train
# :return: the projection, applied with predict_proba or decision_function
# """
# if self.calmode=='cal':
# return doc_projector.predict_proba(lX)
# else:
# l_decision_scores = doc_projector.decision_function(lX)
# if self.calmode=='sigmoid':
# def sigmoid(x): return 1 / (1 + np.exp(-x))
# for lang in l_decision_scores.keys():
# l_decision_scores[lang] = sigmoid(l_decision_scores[lang])
# return l_decision_scores
#
# def _get_zspace(self, lXtr, lYtr, lXproj=None, lYproj=None):
# """
# Produces the vector space of posterior probabilities (in case the first-tier is calibrated) or of
# decision scores (if otherwise). This space is here named zspace.
# :param lXtr: {lang:matrix} to train
# :param lYtr: {lang:labels} to train
# :param lXproj: {lang:matrix} to project (if None, then projects the lXtr)
# :param lYproj: {lang:labels} to stack in the same order (if None, then lYtr will be stacked)
# :return: the projection of lXproj documents into the Z-space defined by the confidence scores of language-specific
# models trained on lXtr, and the lYproj labels stacked consistently
# """
# repair_empty_folds = True
# if lXproj is None and lYproj is None:
# lXproj, lYproj = lXtr, lYtr
# repair_empty_folds = False
#
# print('fitting the projectors... {}'.format(lXtr.keys()))
# self.doc_projector.fit(lXtr, lYtr)
#
# print('projecting the documents')
# langs = list(lXtr.keys())
# lZ = self._projection(self.doc_projector, lXproj)
#
# # if repair_empty_folds: #empty folds are replaced by the posterior probabilities generated by the non-folded version
# empty_categories = self.doc_projector.empty_categories
# lZ_bu = self._projection(self.doc_projector_bu, lXproj)
#
# for lang in langs:
# repair = empty_categories[lang]
# lZ[lang][:,repair] = lZ_bu[lang][:,repair]
#
# Z = np.vstack([lZ[lang] for lang in langs]) # Z is the language independent space
# zy = np.vstack([lYproj[lang] for lang in langs])
# return Z, zy
#
# def _get_zspace_folds(self, lX, ly):
# self.doc_projector_bu.fit(lX, ly)
#
# print('split of {} folds'.format(self.folded_projections))
# skf = KFold(n_splits=self.folded_projections, shuffle=True)
#
# Z, zy = [], []
# lfold = {lang: list(skf.split(lX[lang], ly[lang])) for lang in lX.keys()}
# for fold in range(self.folded_projections):
# print('fitting the projectors ({}/{})...'.format(fold + 1, self.folded_projections))
# lfoldXtr, lfoldYtr = {}, {}
# lfoldXte, lfoldYte = {}, {}
# for lang in lX.keys():
# train, test = lfold[lang][fold]
# lfoldXtr[lang] = lX[lang][train]
# lfoldYtr[lang] = ly[lang][train]
# lfoldXte[lang] = lX[lang][test]
# lfoldYte[lang] = ly[lang][test]
# Zfold, zYfold = self._get_zspace(lfoldXtr, lfoldYtr, lfoldXte, lfoldYte)
# Z.append(Zfold)
# zy.append(zYfold)
# # compose the Z-space as the union of all folded predictions
# Z = np.vstack(Z)
# zy = np.vstack(zy)
# # refit the document projector with all examples to have a more reliable projector for test data
# self.doc_projector = self.doc_projector_bu
# return Z, zy
#
# def fit(self, lX, ly, lZ=None, lzy=None):
# tinit = time.time()
# Z, zy = self._get_zspace(lX, ly) if self.folded_projections == 1 else self._get_zspace_folds(lX, ly)
#
# #experimental: adds the posterior probabilities (computed outside) to the meta-classifier
# if lZ is not None and lzy is not None:
# zlangs = list(lZ.keys())
# Z = np.vstack((Z, *[lZ[l] for l in zlangs]))
# zy = np.vstack((zy, *[lzy[l] for l in zlangs]))
#
# print('fitting the Z-space of shape={}'.format(Z.shape))
# self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters, n_jobs=self.n_jobs)
# self.model.fit(Z, zy)
# self.time = time.time() - tinit
#
# return self
#
# def predict(self, lX, lZ=None):
# """
# :param lX: a dictionary {language_label: X csr-matrix}
# :param lZ: a dictionary {language_label: Z matrix}; if specified, concats this representation
# :return: a dictionary of predictions
# """
# lZ_ = self._projection(self.doc_projector, lX)
# if lZ is not None:
# lZ_ = {**lZ_, **lZ}
# return _joblib_transform_multiling(self.model.predict, lZ_, n_jobs=self.n_jobs)
#
# def best_params(self):
# params = self.doc_projector.best_params()
# params['meta'] = self.model.best_params()
# return params
class NaivePolylingualClassifier:
@@ -322,411 +322,4 @@ class MonolingualClassifier:
return self.model.predict(X)
def best_params(self):
return self.best_params_
class FunnellingMultimodal(FunnellingPolylingualClassifier):
def __init__(self,
we_path,
config,
first_tier_learner,
meta_learner,
first_tier_parameters=None,
meta_parameters=None,
folded_projections=1,
calmode='cal',
n_jobs=-1):
super().__init__(first_tier_learner,
meta_learner,
first_tier_parameters,
meta_parameters,
folded_projections,
calmode,
n_jobs)
self.pca_independent_space = PCA(n_components=50)
self.we_path = we_path
self.config = config
self.lang_word2idx = dict()
self.languages = []
self.lang_tfidf = {}
self.embedding_space = None
self.model = None
self.time = None
self.best_components = 'not set' # if auto optimize pca, it will store the optimal number of components
def vectorize(self, lX, prediction=False):
langs = list(lX.keys())
print(f'# tfidf-vectorizing docs')
if prediction:
for lang in langs:
assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
tfidf_vectorizer = self.lang_tfidf[lang]
lX[lang] = tfidf_vectorizer.transform(lX[lang])
return self
for lang in langs:
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
self.languages.append(lang)
tfidf_vectorizer.fit(lX[lang])
lX[lang] = tfidf_vectorizer.transform(lX[lang])
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
self.lang_tfidf[lang] = tfidf_vectorizer
return self
def _get_zspace(self, lXtr, lYtr):
print('\nfitting the projectors... {}'.format(list(lXtr.keys())))
self.doc_projector.fit(lXtr, lYtr)
print('\nprojecting the documents')
lZ = self._projection(self.doc_projector, lXtr)
return lZ, lYtr
def fit(self, lX, ly):
tinit = time.time()
print('Vectorizing documents...')
self.vectorize(lX)
for lang in self.languages:
print(f'{lang}->{lX[lang].shape}')
Z, zy = self._get_zspace(lX, ly)
if self.config['supervised'] or self.config['unsupervised']:
self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly)
_embedding_space = self.embedding_space.transform(self.config, lX)
if self.config['max_label_space'] == 0:
_cum_dimension = _embedding_space[list(_embedding_space.keys())[0]].shape[1]
if _cum_dimension - 300 > 0:
_temp = _cum_dimension - 300
else:
_temp = _cum_dimension
self.best_components = _temp
# h_stacking posterior probabilities with (U) and/or (S) matrices
for lang in self.languages:
Z[lang] = np.hstack((Z[lang], _embedding_space[lang]))
# stacking Z space vertically
_vertical_Z = np.vstack([Z[lang] for lang in self.languages])
_vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
self.standardizer = StandardizeTransformer()
_vertical_Z = self.standardizer.fit_transform(_vertical_Z)
# todo testing ...
# if self.config['post_pca']:
# print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
# self.pca_independent_space.fit(_vertical_Z)
# _vertical_Z = self.pca_independent_space.transform(_vertical_Z)
print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
n_jobs=self.n_jobs)
self.model.fit(_vertical_Z, _vertical_Zy)
self.time = time.time() - tinit
print(f'\nTotal training time elapsed: {round((self.time/60), 2)} min')
def predict(self, lX, ly):
print('Vectorizing documents')
self.vectorize(lX, prediction=True)
lZ = self._projection(self.doc_projector, lX)
if self.config['supervised'] or self.config['unsupervised']:
_embedding_space = self.embedding_space.transform(self.config, lX)
for lang in lX.keys():
lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang]))
for lang in lZ.keys():
print(lZ[lang].shape)
# todo testing
lZ[lang] = self.standardizer.transform(lZ[lang])
# if self.config['post_pca']:
# print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
# lZ[lang] = self.pca_independent_space.transform(lZ[lang])
return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
class PolylingualEmbeddingsClassifier:
"""
This classifier creates document embeddings by a tfidf weighted average of polylingual embeddings from the article
@article{conneau2017word,
title={Word translation without parallel data},
author={Conneau, Alexis and Lample, Guillaume and Ranzato, Marc'Aurelio and Denoyer, Ludovic and J{\'e}gou, Herv{\'e}},
journal={arXiv preprint arXiv:1710.04087},
year={2017}
}
url: https://github.com/facebookresearch/MUSE
"""
def __init__(self, wordembeddings_path, config, learner, c_parameters=None, n_jobs=-1):
"""
:param wordembeddings_path: the path to the directory containing the polylingual embeddings
:param learner: the learner
:param c_parameters: parameters for learner
:param n_jobs: the number of concurrent threads
"""
self.wordembeddings_path = wordembeddings_path
self.config = config
self.learner = learner
self.c_parameters=c_parameters
self.n_jobs = n_jobs
self.lang_tfidf = {}
self.model = None
self.languages = []
self.lang_word2idx = dict()
self.embedding_space = None
def fit_vectorizers(self, lX):
for lang in lX.keys():
if lang not in self.lang_tfidf:
tfidf = TfidfVectorizer(sublinear_tf=True, use_idf=True) # text is already processed
docs = lX[lang]
tfidf.fit(docs)
self.lang_tfidf[lang] = tfidf
def vectorize(self, lX, prediction=False):
langs = list(lX.keys())
print(f'# tfidf-vectorizing docs')
if prediction:
for lang in langs:
assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
tfidf_vectorizer = self.lang_tfidf[lang]
lX[lang] = tfidf_vectorizer.transform(lX[lang])
return self
for lang in langs:
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
self.languages.append(lang)
tfidf_vectorizer.fit(lX[lang])
lX[lang] = tfidf_vectorizer.transform(lX[lang])
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
self.lang_tfidf[lang] = tfidf_vectorizer
return self
def embed(self, docs, lang):
assert lang in self.lang_tfidf, 'unknown language'
tfidf_vectorizer = self.lang_tfidf[lang]
V = tfidf_vectorizer.vocabulary_
Xweights = tfidf_vectorizer.transform(docs)
print('loading word embeddings for ' + lang)
we = WordEmbeddings.load(self.wordembeddings_path, lang)
nD = len(docs)
doc_vecs = np.zeros((nD, we.dim()))
for i, doc in enumerate(docs):
print('\r\tcomplete {:.3f}%'.format(100 * (i + 1) / nD), end='')
# averaging with tfidf (summing each word only once, since the frequency is already controlled)
for w in set(doc.split()):
if w in we and w in V:
doc_vecs[i] += (we[w] * Xweights[i, V[w]])
# works much worse with idf; works much worse with document l2-normalization
print()
return doc_vecs
def fit(self, lX, ly):
"""
:param lX: a dictionary {language_label: [list of preprocessed documents]}
:param ly: a dictionary {language_label: ndarray of shape (ndocs, ncats) binary labels}
:return: self
"""
tinit = time.time()
langs = list(lX.keys())
WEtr, Ytr = [], []
# self.fit_vectorizers(lX) # if already fit, does nothing
self.vectorize(lX)
# config = {'unsupervised' : False, 'supervised': True}
self.embedding_space = StorageEmbeddings(self.wordembeddings_path).fit(self.config, lX, self.lang_word2idx, ly)
WEtr = self.embedding_space.transform(self.config, lX)
# for lang in langs:
# WEtr.append(self.embed(lX[lang], lang)) # todo embed with other matrices
# Ytr.append(ly[lang])
WEtr = np.vstack([WEtr[lang] for lang in langs])
Ytr = np.vstack([ly[lang] for lang in langs])
self.embed_time = time.time() - tinit
print('fitting the WE-space of shape={}'.format(WEtr.shape))
self.model = MonolingualClassifier(base_learner=self.learner, parameters=self.c_parameters, n_jobs=self.n_jobs)
self.model.fit(WEtr, Ytr)
self.time = time.time() - tinit
return self
def predict(self, lX, lY):
"""
:param lX: a dictionary {language_label: [list of preprocessed documents]}
"""
assert self.model is not None, 'predict called before fit'
self.vectorize(lX, prediction=True)
langs = list(lX.keys())
lWEte = self.embedding_space.transform(self.config, lX)
# lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory
return _joblib_transform_multiling(self.model.predict, lWEte, n_jobs=self.n_jobs)
def predict_proba(self, lX):
"""
:param lX: a dictionary {language_label: [list of preprocessed documents]}
"""
assert self.model is not None, 'predict called before fit'
langs = list(lX.keys())
lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory
return _joblib_transform_multiling(self.model.predict_proba, lWEte, n_jobs=self.n_jobs)
def best_params(self):
return self.model.best_params()
class MonolingualNetSvm:
"""
testing: funnelling with a NN managing word-embedding compositionality. An ensemble of n SVMs (n equal to the
number of training languages) is first fit on the data, generating the document projections in the Z-space. Next,
these projections are fed to a single NN together with their respective document embeddings. The documents are
projected into the embedding space while preserving its dimensionality (output dim is 300). These projections are
horizontally concatenated with the respective posterior-probability projections and passed through an FC layer with
sigmoid activation and output dim equal to the number of target classes.
# TODO ATM testing with only 1 language
"""
def __init__(self, lX, ly, first_tier_learner, first_tier_parameters, n_jobs):
self.lX = lX
self.ly = ly
# SVM Attributes
self.doc_projector = NaivePolylingualClassifier(first_tier_learner, first_tier_parameters,
n_jobs=n_jobs)
self.calmode = 'cal'
self.languages = []
self.lang_word2idx = dict()
self.lang_tfidf = {}
self.base_learner = 'TODO'
self.parameters = 'TODO'
# NN Attributes
self.NN = 'TODO'
def load_preprocessed(self):
"""
In order to speed up the process, documents are tokenized beforehand in the "main" script. Here, the tokenized
docs, word_index, and targets are loaded.
:return: dict[lang] = (word_index, tokenized_docs, targets)
"""
import pickle
with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'rb') as f:
return pickle.load(f)
def _build_embedding_matrix(self, lang, word_index):
"""
build embedding matrix by filtering out OOV embeddings
:param lang: language code of the embeddings to be loaded
:param word_index: word -> index mapping of the dataset vocabulary
:return: filtered embedding matrix
"""
from embeddings.embeddings import EmbeddingsAligned
type = 'MUSE'
path = '/home/andreapdr/CLESA/'
MUSE = EmbeddingsAligned(type, path, lang, word_index.keys())
return MUSE
def get_data_and_embed(self, data_dict):
from keras.preprocessing.sequence import pad_sequences
langs = data_dict.keys()
lang_embedding_matrix = dict()
nn_lXtr = dict()
nn_lytr = dict()
for lang in langs:
lang_embedding_matrix[lang] = self._build_embedding_matrix(lang, data_dict[lang][0])
nn_lXtr[lang] = pad_sequences(data_dict[lang][1], 100, padding='post')
nn_lytr[lang] = [data_dict[lang][2]]
return nn_lXtr, nn_lytr, lang_embedding_matrix
def svm_vectorize(self, lX, prediction=False):
langs = list(lX.keys())
print(f'# tfidf-vectorizing docs')
if prediction:
for lang in langs:
assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
tfidf_vectorizer = self.lang_tfidf[lang]
lX[lang] = tfidf_vectorizer.transform(lX[lang])
return self
for lang in langs:
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
self.languages.append(lang)
tfidf_vectorizer.fit(lX[lang])
lX[lang] = tfidf_vectorizer.transform(lX[lang])
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
self.lang_tfidf[lang] = tfidf_vectorizer
return lX
def _get_zspace(self, lXtr, lYtr):
print('\nfitting the projectors... {}'.format(list(lXtr.keys())))
self.doc_projector.fit(lXtr, lYtr)
print('\nprojecting the documents')
lZ = self._projection(self.doc_projector, lXtr)
return lZ, lYtr
def _projection(self, doc_projector, lX):
"""
Decides the projection function to be applied: predict_proba if the base classifiers are calibrated,
decision_function otherwise
:param doc_projector: the document projector (a NaivePolylingualClassifier)
:param lX: {lang: matrix} of documents to be projected
:return: the projection, applied with predict_proba or decision_function
"""
if self.calmode=='cal':
return doc_projector.predict_proba(lX)
else:
l_decision_scores = doc_projector.decision_function(lX)
if self.calmode=='sigmoid':
def sigmoid(x): return 1 / (1 + np.exp(-x))
for lang in l_decision_scores.keys():
l_decision_scores[lang] = sigmoid(l_decision_scores[lang])
return l_decision_scores
def fit(self):
"""
# 1. Fit SVM to generate posterior probabilities:
# 1.1 Gather documents and vectorize them as in other SVM classifiers
# 2. Fit NN
# 2.1 Gather documents and build NN dataset by indexing wrt embedding matrix
# 2.2 Fit NN first-layer to generate compositional doc embedding
# 2.3 H-stack doc-embed and posterior P
# 2.4 Feed stacked vector to output layer (sigmoid act): output Nc
# 2.5 Train it...
"""
# load pre-processed data
data_dict = self.load_preprocessed()
# build embedding matrices and neural network document training set
nn_lXtr, nn_lytr, lang_embedding_matrix = self.get_data_and_embed(data_dict)
# TF-IDF vectorzing documents for SVM classifier
svm_lX = self.svm_vectorize(self.lX)
# just testing on a smaller subset of data
test_svm_lX = dict()
test_svm_ly = dict()
test_svm_lX['it'] = svm_lX['it'][:10, :]
test_svm_ly['it'] = self.ly['it'][:10, :]
test_nn_data = nn_lXtr['it'][:10]
# projecting document into Z space by SVM
svm_Z, _ = self._get_zspace(test_svm_lX, test_svm_ly)
# initializing net and forward pass
net = CNN_pdr(73, 1, 300, len(lang_embedding_matrix['it'].vectors), 300, lang_embedding_matrix['it'].vectors)
out = net.forward(test_nn_data, svm_Z['it'])
print('TODO')
def net(self):
pass
return self.best_params_

View File

@ -10,7 +10,7 @@ import time
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
from scipy.sparse import issparse, vstack, hstack
from transformers.StandardizeTransformer import StandardizeTransformer
from util_transformers.StandardizeTransformer import StandardizeTransformer
from util.SIF_embed import remove_pc
from sklearn.preprocessing import normalize
from sklearn.svm import SVC
@ -127,22 +127,26 @@ class PosteriorProbabilitiesEmbedder:
print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} documents')
return self.doc_projector.predict_proba(lX)
def _get_output_dim(self):
return len(self.doc_projector.model['da'].model.classes_)
class MuseEmbedder:
def __init__(self, path, lV=None, l2=True, n_jobs=-1, featureweight=FeatureWeight()):
def __init__(self, path, lV=None, l2=True, n_jobs=-1, featureweight=FeatureWeight(), sif=False):
self.path=path
self.lV = lV
self.l2 = l2
self.n_jobs = n_jobs
self.featureweight = featureweight
self.sif = sif
def fit(self, lX, ly, lV=None):
assert lV is not None or self.lV is not None, 'lV not specified'
self.langs = sorted(lX.keys())
self.MUSE = load_muse_embeddings(self.path, self.langs, self.n_jobs)
lWordList = {l:self._get_wordlist_from_word2index(lV[l]) for l in self.langs}
self.MUSE = {l:Muse.extract(lWordList[l]).numpy() for l,Muse in self.MUSE}
self.MUSE = {l:Muse.extract(lWordList[l]).numpy() for l,Muse in self.MUSE.items()}
self.featureweight.fit(lX, ly)
return self
@ -150,7 +154,7 @@ class MuseEmbedder:
MUSE = self.MUSE
lX = self.featureweight.transform(lX)
XdotMUSE = Parallel(n_jobs=self.n_jobs)(
delayed(XdotM)(lX[lang], MUSE[lang]) for lang in self.langs
delayed(XdotM)(lX[lang], MUSE[lang], self.sif) for lang in self.langs
)
lMuse = {l: XdotMUSE[i] for i, l in enumerate(self.langs)}
lMuse = _normalize(lMuse, self.l2)
@ -162,14 +166,18 @@ class MuseEmbedder:
def _get_wordlist_from_word2index(self, word2index):
return list(zip(*sorted(word2index.items(), key=lambda x: x[1])))[0]
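# e.g. {'dog': 0, 'cat': 1, 'bird': 2} -> ('dog', 'cat', 'bird'): the vocabulary words sorted by their index,
# so that row i of the extracted MUSE matrix corresponds to the word with tf-idf index i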
def _get_output_dim(self):
return self.MUSE['da'].shape[1]
class WordClassEmbedder:
def __init__(self, l2=True, n_jobs=-1, max_label_space=300, featureweight=FeatureWeight()):
def __init__(self, l2=True, n_jobs=-1, max_label_space=300, featureweight=FeatureWeight(), sif=False):
self.n_jobs = n_jobs
self.l2 = l2
self.max_label_space=max_label_space
self.featureweight = featureweight
self.sif = sif
def fit(self, lX, ly, lV=None):
self.langs = sorted(lX.keys())
@ -184,7 +192,7 @@ class WordClassEmbedder:
lWCE = self.lWCE
lX = self.featureweight.transform(lX)
XdotWCE = Parallel(n_jobs=self.n_jobs)(
delayed(XdotM)(lX[lang], lWCE[lang])for lang in self.langs
delayed(XdotM)(lX[lang], lWCE[lang], self.sif)for lang in self.langs
)
lwce = {l: XdotWCE[i] for i, l in enumerate(self.langs)}
lwce = _normalize(lwce, self.l2)
@ -193,6 +201,9 @@ class WordClassEmbedder:
def fit_transform(self, lX, ly, lV=None):
return self.fit(lX, ly).transform(lX)
def _get_output_dim(self):
return 73
class DocEmbedderList:
@ -201,6 +212,7 @@ class DocEmbedderList:
if len(embedder_list)==0: embedder_list=[]
self.embedders = embedder_list
self.aggregation = aggregation
print(f'Aggregation mode: {self.aggregation}')
def fit(self, lX, ly, lV=None):
for transformer in self.embedders:
@ -238,16 +250,25 @@ class DocEmbedderList:
langs = sorted(lX.keys())
lZparts = {l: None for l in langs}
# min_dim = min([transformer._get_output_dim() for transformer in self.embedders])
min_dim = 300
for transformer in self.embedders:
lZ = transformer.transform(lX)
nC = min([lZ[lang].shape[1] for lang in langs])
for l in langs:
Z = lZ[l]
if Z.shape[1] > min_dim:
print(f'Space Z matrix has more dimensions ({Z.shape[1]}) than the smallest representation {min_dim}.'
f' Applying PCA(n_components={min_dim})')
pca = PCA(n_components=min_dim)
Z = pca.fit(Z).transform(Z)
if lZparts[l] is None:
lZparts[l] = Z
else:
lZparts[l] += Z
n_transformers = len(self.embedders)
nC = min([lZparts[lang].shape[1] for lang in langs])
return {l:lZparts[l] / n_transformers for l in langs}
@ -266,7 +287,7 @@ class FeatureSet2Posteriors:
self.transformer = transformer
self.l2=l2
self.n_jobs = n_jobs
self.prob_classifier = MetaClassifier(SVC(kernel='rbf', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
self.prob_classifier = MetaClassifier(SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
def fit(self, lX, ly, lV=None):
if lV is None and hasattr(self.transformer, 'lV'):
@ -412,11 +433,13 @@ def word_class_embedding_matrix(X, Y, max_label_space=300):
return WCE
def XdotM(X,M):
def XdotM(X,M, sif):
# return X.dot(M)
# print(f'X={X.shape}, M={M.shape}')
print(f'X={X.shape}, M={M.shape}')
E = X.dot(M)
E = remove_pc(E, npc=1)
if sif:
print("removing pc...")
E = remove_pc(E, npc=1)
return E
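# Background (assumption, not stated in this commit): removing the first principal component is the
# post-processing step of the SIF scheme (Arora et al., 2017). A rough sketch of what remove_pc(E, npc=1)
# is expected to compute:
#   from sklearn.decomposition import TruncatedSVD
#   pc = TruncatedSVD(n_components=1, n_iter=7).fit(E).components_   # (1, dim) first principal component
#   E = E - E.dot(pc.T).dot(pc)                                      # project it out of each row of E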

View File

@ -1,92 +0,0 @@
from optparse import OptionParser
from util.results import PolylingualClassificationResults
from dataset_builder import MultilingualDataset
from keras.preprocessing.text import Tokenizer
from learning.learners import MonolingualNetSvm
from sklearn.svm import SVC
import pickle
parser = OptionParser()
parser.add_option("-d", "--dataset", dest="dataset",
help="Path to the multilingual dataset processed and stored in .pickle format",
default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
help="Optimize hyperparameters", default=False)
parser.add_option("-s", "--set_c", dest="set_c",type=float,
help="Set the C parameter", default=1)
(op, args) = parser.parse_args()
###################################################################################################################
def get_learner(calibrate=False, kernel='linear'):
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto')
def get_params(dense=False):
if not op.optimc:
return None
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
kernel = 'rbf' if dense else 'linear'
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
# PREPROCESS TEXT AND SAVE IT ... both for SVM and NN
def preprocess_data(lXtr, lXte, lytr, lyte):
tokenized_tr = dict()
tokenized_te = dict()
for lang in lXtr.keys():
alltexts = ' '.join(lXtr[lang])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(alltexts.split(' '))
tokenizer.oov_token = len(tokenizer.word_index)+1
# dumping train set
sequences_tr = tokenizer.texts_to_sequences(lXtr[lang])
tokenized_tr[lang] = (tokenizer.word_index, sequences_tr, lytr[lang])
# dumping test set
sequences_te = tokenizer.texts_to_sequences(lXte[lang])
tokenized_te[lang] = (tokenizer.word_index, sequences_te, lyte[lang])
with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'wb') as f:
pickle.dump(tokenized_tr, f)
with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_test.pickle', 'wb') as f:
pickle.dump(tokenized_tr, f)
print('Successfully dumped data')
# def load_preprocessed():
# with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'rb') as f:
# return pickle.load(f)
#
# def build_embedding_matrix(lang, word_index):
# type = 'MUSE'
# path = '/home/andreapdr/CLESA/'
# MUSE = EmbeddingsAligned(type, path, lang, word_index.keys())
# return MUSE
########## MAIN #################################################################################################
if __name__ == '__main__':
results = PolylingualClassificationResults('./results/NN_FPEC_results.csv')
data = MultilingualDataset.load(op.dataset)
lXtr, lytr = data.training()
lXte, lyte = data.test()
if op.set_c != -1:
meta_parameters = None
else:
meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}]
test_architecture = MonolingualNetSvm(lXtr,
lytr,
first_tier_learner=get_learner(calibrate=True),
first_tier_parameters=None,
n_jobs=1)
test_architecture.fit()

View File

@ -1,6 +1,6 @@
import argparse
import torch.nn as nn
from torch.optim.lr_scheduler import StepLR
from torch.optim.lr_scheduler import StepLR, MultiStepLR
from dataset_builder import MultilingualDataset
from learning.transformers import load_muse_embeddings
from models.lstm_class import RNNMultilingualClassifier
@ -9,8 +9,6 @@ from util.early_stop import EarlyStopping
from util.common import *
from util.file import create_if_not_exist
from time import time
from embeddings.pretrained import *
from os.path import join
from tqdm import tqdm
from util.evaluation import evaluate
from util.file import get_file_name
@ -100,7 +98,7 @@ def main():
# Loading the dataset
data = MultilingualDataset.load(opt.dataset)
# data.set_view(languages=['de', 'fr', 'sv', 'da', 'es', 'it'])
data.set_view(languages=['de', 'fr']) #, 'it', 'en']) # 'sv', 'da', 'es', 'it'])
data.show_dimensions()
langs = data.langs()
l_devel_raw, l_devel_target = data.training(target_as_csr=True)
@ -108,6 +106,7 @@ def main():
# Loading the MUSE pretrained embeddings (only if requested)
lpretrained, lpretrained_vocabulary = load_pretrained_embeddings(opt.we_path, langs)
# lpretrained_vocabulary = none_dict(langs) # do not keep track of words known in pretrained embeddings vocabulary that are also present in test set
# Data preparation: indexing / splitting / embedding matrices (pretrained + supervised) / posterior probs
multilingual_index = MultilingualIndex()
@ -115,10 +114,26 @@ def main():
multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=opt.seed)
multilingual_index.embedding_matrices(lpretrained, opt.supervised)
if opt.posteriors:
lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=opt.svm_max_docs)
lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=5000, store_posteriors=True) #stored_post=True) #opt.svm_max_docs)
else:
lPtr, lPva, lPte = None, None, None
# just_test = False
# if just_test:
#
# model = torch.load(
# '../checkpoint/rnn(H512)-Muse-WCE-Posteriors-(trainable)-jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle')
# criterion = torch.nn.BCEWithLogitsLoss().cuda()
#
# # batcher_train = Batch(opt.batch_size, batches_per_epoch=10, languages=langs, lpad=multilingual_index.l_pad())
#
# batcher_eval = Batch(opt.batch_size, batches_per_epoch=-1, languages=langs, lpad=multilingual_index.l_pad())
# l_test_index = multilingual_index.l_test_index()
# epoch = 1
# tinit = time()
# test(model, batcher_eval, l_test_index, lPte, l_test_target, tinit, epoch, logfile, criterion, 'te')
# exit('Loaded')
# Model initialization
model = init_Net(data.num_categories(), multilingual_index)
@ -130,7 +145,7 @@ def main():
tinit = time()
create_if_not_exist(opt.checkpoint_dir)
early_stop = EarlyStopping(model, patience=opt.patience, checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}')
early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience, checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}')
l_train_index, l_train_target = multilingual_index.l_train()
l_val_index, l_val_target = multilingual_index.l_val()
@ -155,7 +170,6 @@ def main():
break
# training is over
# restores the best model according to the Mf1 of the validation set (only when plotmode==False)
# stoptime = early_stop.stop_time - tinit
# stopepoch = early_stop.best_epoch
@ -164,6 +178,8 @@ def main():
if opt.plotmode==False:
print('-' * 80)
print('Training over. Performing final evaluation')
# torch.cuda.empty_cache()
model = early_stop.restore_checkpoint()
if opt.val_epochs>0:
@ -183,10 +199,14 @@ def get_lr(optimizer):
def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile, criterion, optim, epoch, method_name):
_dataset_path = opt.dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1]
loss_history = []
model.train()
for idx, (batch, post, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, lytr)):
optim.zero_grad()
_out = model(batch,post, lang)
loss = criterion(model(batch, post, lang), target)
loss.backward()
clip_gradient(model)
@ -195,7 +215,7 @@ def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile,
if idx % opt.log_interval == 0:
interval_loss = np.mean(loss_history[-opt.log_interval:])
print(f'{opt.dataset} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
mean_loss = np.mean(interval_loss)
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
@ -203,6 +223,8 @@ def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile,
def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logfile, criterion, measure_prefix):
loss_history = []
model.eval()
langs = sorted(ltest_index.keys())
predictions = {l:[] for l in langs}
@ -214,6 +236,7 @@ def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logf
prediction = predict(logits)
predictions[lang].append(prediction)
yte_stacked[lang].append(target.detach().cpu().numpy())
loss_history.append(loss)
ly = {l:np.vstack(yte_stacked[l]) for l in langs}
ly_ = {l:np.vstack(predictions[l]) for l in langs}
@ -224,17 +247,15 @@ def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logf
metrics.append([macrof1, microf1, macrok, microk])
if measure_prefix=='te':
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
# results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
# (config['max_label_space'], classifier.best_components),
# config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
# lang, macrof1, microf1, macrok, microk, '')
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=tend)
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mf1, timelapse=tend)
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-accuracy', value=acc, timelapse=tend)
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=loss, timelapse=tend)
mean_loss = np.mean(loss_history)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit)
return Mf1

View File

@ -1,7 +1,7 @@
import os
from dataset_builder import MultilingualDataset
# from learning.learners import *
from learning.learners import FunnellingMultimodal
# from learning.learners import FunnellingMultimodal
from learning.transformers import Funnelling, PosteriorProbabilitiesEmbedder, MetaClassifier, \
TfidfVectorizerMultilingual, DocEmbedderList, WordClassEmbedder, MuseEmbedder, FeatureSet2Posteriors, Voting
from util.evaluation import *
@ -14,14 +14,14 @@ from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
parser = OptionParser()
parser.add_option("-d", "--dataset", dest="dataset",
help="Path to the multilingual dataset processed and stored in .pickle format",
default="../rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
# parser.add_option("-d", "--dataset", dest="dataset",
# help="Path to the multilingual dataset processed and stored in .pickle format",
# default="../rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
parser.add_option("-o", "--output", dest="output",
help="Result file", type=str, default='./results/results.csv')
parser.add_option("-P", "--probs", dest="probs", action='store_true',
parser.add_option("-P", "--probs", dest="posteriors", action='store_true',
help="Add posterior probabilities to the document embedding representation", default=False)
parser.add_option("-S", "--supervised", dest="supervised", action='store_true',
@ -46,6 +46,9 @@ parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
default=300)
parser.add_option("-r", "--remove-pc", dest="sif", action='store_true',
help="Remove common component when computing dot product of word embedding matrices", default=False)
# parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
# help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix."
# " If set to 0 it will automatically search for the best number of components", default=300)
@ -72,15 +75,18 @@ def get_params(dense=False):
if __name__ == '__main__':
(op, args) = parser.parse_args()
assert exists(op.dataset), 'Unable to find file '+str(op.dataset)
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
assert op.probs or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)'
dataset = args[0]
dataset_file = os.path.basename(op.dataset)
assert exists(dataset), 'Unable to find file '+str(dataset)
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
dataset_file = os.path.basename(dataset)
results = PolylingualClassificationResults(op.output)
data = MultilingualDataset.load(op.dataset)
data = MultilingualDataset.load(dataset)
data.show_dimensions()
lXtr, lytr = data.training()
@ -88,8 +94,9 @@ if __name__ == '__main__':
meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
result_id = f'{dataset_file}_Prob{op.probs}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}'
# result_id = f'{dataset_file}_Prob{op.posteriors}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}'
result_id = f'{dataset_file}_ProbPost={op.posteriors}_WCE={op.supervised}(PCA={op.max_labels_S})_' \
f'MUSE={op.pretrained}_weight={"todo"}_l2={"todo"}_zscore={"todo"}{"_optimC" if op.optimc else ""}'
print(f'{result_id}')
# text preprocessing
@ -100,7 +107,7 @@ if __name__ == '__main__':
lV = tfidfvectorizer.vocabulary()
classifiers = []
if op.probs:
if op.posteriors:
classifiers.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None))
if op.supervised:
classifiers.append(FeatureSet2Posteriors(WordClassEmbedder(max_label_space=op.max_labels_S)))
@ -115,13 +122,37 @@ if __name__ == '__main__':
print('\n# Evaluating ...')
l_eval = evaluate_method(classifier, lXte, lyte)
# renaming arguments to be printed on log
_id = ''
_id_conf = [op.posteriors, op.supervised, op.pretrained]
_id_name = ['+P', '+W', '+M']
for i, conf in enumerate(_id_conf):
if conf:
_id += _id_name[i]
_id = _id.lstrip('+')
_dataset_path = dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1]
metrics = []
for lang in lXte.keys():
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
# results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
# (config['max_label_space'], classifier.best_components),
# config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
# lang, macrof1, microf1, macrok, microk, '')
results.add_row(method='Voting',
learner='svm',
optimp=op.optimc,
sif=op.sif,
zscore='todo',
l2='todo',
wescaler='todo',
pca=op.max_labels_S,
id=_id,
dataset=dataset_id,
time='todo',
lang=lang,
macrof1=macrof1,
microf1=microf1,
macrok=macrok,
microk=microk,
notes='')
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))

View File

@ -11,7 +11,7 @@ from sklearn.svm import SVC
parser = OptionParser(usage="usage: %prog datapath [options]")
parser.add_option("-o", "--output", dest="output",
help="Result file", type=str, default='./results/results.csv')
help="Result file", type=str, default='multiModal_log.csv')
parser.add_option("-P", "--posteriors", dest="posteriors", action='store_true',
help="Add posterior probabilities to the document embedding representation", default=False)
@ -22,8 +22,8 @@ parser.add_option("-S", "--supervised", dest="supervised", action='store_true',
parser.add_option("-U", "--pretrained", dest="pretrained", action='store_true',
help="Add pretrained MUSE embeddings to the document embedding representation", default=False)
parser.add_option("--nol2", dest="nol2", action='store_true',
help="Deactivates l2 normalization as a post-processing for the document embedding views", default=False)
parser.add_option("--l2", dest="l2", action='store_true',
help="Activates l2 normalization as a post-processing for the document embedding views", default=False)
parser.add_option("--allprob", dest="allprob", action='store_true',
help="All views are generated as posterior probabilities. This affects the supervised and pretrained "
@ -48,11 +48,28 @@ parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
default=300)
parser.add_option("-r", "--remove-pc", dest="sif", action='store_true',
help="Remove common component when computing dot product of word embedding matrices", default=False)
parser.add_option("-z", "--zscore", dest="zscore", action='store_true',
help="Z-score normalize matrices (WCE and MUSE)", default=False)
parser.add_option("-a", "--agg", dest="agg", action='store_true',
help="Set aggregation function of the common Z-space to average (Default: concatenation)", default=False)
def get_learner(calibrate=False, kernel='linear'):
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, gamma='auto')
def get_params():
if not op.optimc:
return None
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
kernel = 'rbf'
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
#######################################################################################################################
@ -64,17 +81,23 @@ if __name__ == '__main__':
assert exists(dataset), 'Unable to find file '+str(dataset)
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
l2=(op.nol2==False)
l2=op.l2
dataset_file = os.path.basename(dataset)
results = PolylingualClassificationResults(op.output)
results = PolylingualClassificationResults('../log/' + op.output)
allprob='Prob' if op.allprob else ''
result_id = f'{dataset_file}_ProbPost={op.posteriors}_{allprob}WCE={op.supervised}(PCA={op.max_labels_S})_{allprob}' \
f'MUSE={op.pretrained}_weight={op.feat_weight}_l2={l2}{"_optimC" if op.optimc else ""}'
f'MUSE={op.pretrained}_weight={op.feat_weight}_l2={l2}_zscore={op.zscore}{"_optimC" if op.optimc else ""}'
print(f'{result_id}')
# set z-score range - with slice(0, 0) the mean is set to 0 and the std to 1, so standardization has no effect
standardize_range = slice(0,0)
if op.zscore:
standardize_range = None
data = MultilingualDataset.load(dataset)
# data.set_view(languages=['fr', 'it'])
data.show_dimensions()
lXtr, lytr = data.training()
lXte, lyte = data.test()
@ -86,23 +109,23 @@ if __name__ == '__main__':
feat_weighting = FeatureWeight(op.feat_weight, agg='mean')
# # document embedding modules
doc_embedder = DocEmbedderList(aggregation='concat')
doc_embedder = DocEmbedderList(aggregation='mean' if op.agg else 'concat')
if op.posteriors:
doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True, kernel='linear'), l2=l2))
if op.supervised:
wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting)
wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, sif=op.sif)
if op.allprob:
wce = FeatureSet2Posteriors(wce, l2=l2)
doc_embedder.append(wce)
if op.pretrained:
muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting)
muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif)
if op.allprob:
muse = FeatureSet2Posteriors(muse, l2=l2)
doc_embedder.append(muse)
# metaclassifier
meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'), meta_parameters=meta_parameters)
meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'), meta_parameters=get_params(), standardize_range=standardize_range)
# ensembling the modules
classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)
@ -113,13 +136,40 @@ if __name__ == '__main__':
print('\n# Evaluating ...')
l_eval = evaluate_method(classifier, lXte, lyte)
# renaming arguments to be printed on log
_id = ''
_id_conf = [op.posteriors, op.supervised, op.pretrained]
_id_name = ['+P', '+W', '+M']
for i, conf in enumerate(_id_conf):
if conf:
_id += _id_name[i]
_id = _id.lstrip('+')
_id = _id if not op.agg else _id + '_mean'
_id = _id if not op.allprob else _id + '_allprob'
_dataset_path = dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1]
metrics = []
for lang in lXte.keys():
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
# results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
# (config['max_label_space'], classifier.best_components),
# config['dim_reduction_unsupervised'], op.optimc, dataset.split('/')[-1], classifier.time,
# lang, macrof1, microf1, macrok, microk, '')
results.add_row(method='MultiModal',
learner='svm',
optimp=op.optimc,
sif=op.sif,
zscore=op.zscore,
l2=op.l2,
wescaler=op.feat_weight,
pca=op.max_labels_S,
id=_id,
dataset=dataset_id,
time='todo',
lang=lang,
macrof1=macrof1,
microf1=microf1,
macrok=macrok,
microk=microk,
notes='')
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))

View File

@ -27,7 +27,7 @@ class RNNMultilingualClassifier(nn.Module):
self.n_layers = 1
self.n_directions = 1
self.dropout = nn.Dropout(0.2)
self.dropout = nn.Dropout(0.6)
lstm_out = 256
ff1 = 512
@ -45,7 +45,7 @@ class RNNMultilingualClassifier(nn.Module):
llearnable_embeddings[l] = learnable_embeddings
self.embedding_length = embedding_length
# self.rnn = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2))
# self.lstm = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2))
self.rnn = nn.GRU(self.embedding_length, hidden_size)
self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out)
self.lpretrained_embeddings.update(lpretrained_embeddings)

355
src/new_mbert.py Normal file
View File

@ -0,0 +1,355 @@
"""
Test with smaller subset of languages.
1. Load doc (RCV1/2)
2. Tokenize texts via bertTokenizer (I should already have these dumps)
3. Construct better Dataloader/Datasets. NB: I need to keep track of the languages only for
the testing phase (but who cares actually? If I have to do it for the testing phase, I think
it is better to deploy it also in the training phase...)
4. ...
5. I have to understand if the pooled hidden state of the last layer is way worse than its averaged
version (However, in BertForSeqClassification I guess that the pooled version is passed through
the output linear layer in order to get the prediction scores?)
6. At the same time, I have to build also an end-to-end model in order to fine-tune it. The previous step
would be useful when deploying mBert as a View Generator. (Refactor gFun code with view generators?)
7. ...
8. Profits
"""
from dataset_builder import MultilingualDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch
from util.common import clip_gradient, predict
from time import time
from util.csv_log import CSVLog
from util.evaluation import evaluate
from util.early_stop import EarlyStopping
from torch.optim.lr_scheduler import StepLR
from sklearn.model_selection import train_test_split
import argparse
def get_model(n_out):
print('# Initializing model ...')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=n_out)
return model
def set_method_name():
return 'mBERT'
def init_optimizer(model, lr):
# return AdamW(model.parameters(), lr=lr, weight_decay=opt.weight_decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in model.named_parameters()
if not any(nd in n for nd in no_decay)],
'weight_decay': opt.weight_decay},
{'params': [p for n, p in model.named_parameters()
if any(nd in n for nd in no_decay)],
'weight_decay': 0.0}  # parameters matching no_decay are excluded from weight decay
]
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
return optimizer
def init_logfile(method_name, opt):
logfile = CSVLog(opt.log_file, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse'])
logfile.set_default('dataset', opt.dataset)
logfile.set_default('run', opt.seed)
logfile.set_default('method', method_name)
assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} and run {opt.seed} already calculated'
return logfile
def get_lr(optimizer):
for param_group in optimizer.param_groups:
return param_group['lr']
def get_dataset_name(datapath):
possible_splits = [str(i) for i in range(10)]
splitted = datapath.split('_')
id_split = splitted[-1].split('.')[0][-1]
if id_split in possible_splits:
dataset_name = splitted[0].split('/')[-1]
return f'{dataset_name}_run{id_split}'
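# e.g. '/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle' -> 'rcv1-2_run0'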
def load_datasets(datapath):
data = MultilingualDataset.load(datapath)
data.set_view(languages=['nl'])  # Testing with just one language
data.show_dimensions()
l_devel_raw, l_devel_target = data.training(target_as_csr=False)
l_test_raw, l_test_target = data.test(target_as_csr=False)
return l_devel_raw, l_devel_target, l_test_raw, l_test_target
def do_tokenization(l_dataset, max_len=512):
print('# Starting Tokenization ...')
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
langs = l_dataset.keys()
l_tokenized = {}
for lang in langs:
l_tokenized[lang] = tokenizer(l_dataset[lang],
truncation=True,
max_length=max_len,
add_special_tokens=True,
padding='max_length')
return l_tokenized
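# For each language, l_tokenized[lang] is a transformers BatchEncoding exposing (at least) 'input_ids',
# 'token_type_ids' and 'attention_mask'; only 'input_ids' is used downstream in TrainingDataset, so note
# that no attention_mask reaches the model and padded positions are attended to.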
class TrainingDataset(Dataset):
"""
data: dict of lang specific tokenized data
labels: dict of lang specific targets
"""
def __init__(self, data, labels):
self.langs = data.keys()
self.lang_ids = {lang:identifier for identifier, lang in enumerate(self.langs)}
for i, lang in enumerate(self.langs):
# print(lang)
_data = data[lang]['input_ids']
_data = np.array(_data)
_labels = labels[lang]
_lang_value = np.full(len(_data), self.lang_ids[lang])
if i == 0:
self.data = _data
self.labels = _labels
self.lang_index = _lang_value
else:
self.data = np.vstack((self.data, _data))
self.labels = np.vstack((self.labels, _labels))
self.lang_index = np.concatenate((self.lang_index, _lang_value))
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
x = self.data[idx]
y = self.labels[idx]
lang = self.lang_index[idx]
return x, torch.tensor(y, dtype=torch.float), lang
# return x, y, lang
def get_lang_ids(self):
return self.lang_ids
def freeze_encoder(model):
for param in model.base_model.parameters():
param.requires_grad = False
return model
def check_param_grad_status(model):
print('#'*50)
print('Model parameter status')
for name, child in model.named_children():
trainable = False
for param in child.parameters():
if param.requires_grad:
trainable = True
if not trainable:
print(f'{name} is frozen')
else:
print(f'{name} is not frozen')
print('#'*50)
def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile):
_dataset_path = opt.dataset.split('/')[-1].split('_')
# dataset_id = 'RCV1/2_run0_newBert'
dataset_id = _dataset_path[0] + _dataset_path[-1]
loss_history = []
model.train()
for idx, (batch, target, lang_idx) in enumerate(train_dataloader):
optim.zero_grad()  # reset gradients at each step (otherwise they would accumulate across batches)
out = model(batch.cuda())
loss = criterion(out[0], target.cuda())
loss.backward()
clip_gradient(model)
optim.step()
loss_history.append(loss.item())
if idx % opt.log_interval == 0:
interval_loss = np.mean(loss_history[-opt.log_interval:])
print(
f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
mean_loss = np.mean(interval_loss)
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
return mean_loss
def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, measure_prefix):
print('# Validating model ...')
loss_history = []
model.eval()
langs = lang_ids.keys()
id_2_lang = {v:k for k,v in lang_ids.items()}
predictions = {l: [] for l in langs}
yte_stacked = {l: [] for l in langs}
for batch, target, lang_idx in test_dataloader:
out = model(batch.cuda())
logits = out[0]
loss = criterion(logits, target.cuda()).item()
prediction = predict(logits)
loss_history.append(loss)
# Assigning predictions to the language-specific entries of predictions and yte_stacked according to lang_idx
for i, pred in enumerate(prediction):
lang_pred = id_2_lang[lang_idx.numpy()[i]]
predictions[lang_pred].append(pred)
yte_stacked[lang_pred].append(target[i].detach().cpu().numpy())
ly = {l: np.vstack(yte_stacked[l]) for l in langs}
ly_ = {l: np.vstack(predictions[l]) for l in langs}
l_eval = evaluate(ly, ly_)
metrics = []
for lang in langs:
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
if measure_prefix == 'te':
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
mean_loss = np.mean(loss_history)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit)
return Mf1
def get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop, max_val, seed):
# NB: use independent per-language copies; aliasing l_tokenized_tr for both splits would make the second
# tuple assignment below overwrite the training 'input_ids' with the validation ones
l_split_va = {l: dict(l_tokenized_tr[l]) for l in l_tokenized_tr.keys()}
l_split_val_target = {l: [] for l in l_tokenized_tr.keys()}
l_split_tr = {l: dict(l_tokenized_tr[l]) for l in l_tokenized_tr.keys()}
l_split_tr_target = {l: [] for l in l_tokenized_tr.keys()}
for lang in l_tokenized_tr.keys():
val_size = int(min(len(l_tokenized_tr[lang]['input_ids']) * val_prop, max_val))
l_split_tr[lang]['input_ids'], l_split_va[lang]['input_ids'], l_split_tr_target[lang], l_split_val_target[lang] = \
train_test_split(l_tokenized_tr[lang]['input_ids'], l_devel_target[lang], test_size=val_size, random_state=seed, shuffle=True)
return l_split_tr, l_split_tr_target, l_split_va, l_split_val_target
def main():
print('Running main ...')
DATAPATH = opt.dataset
method_name = set_method_name()
logfile = init_logfile(method_name, opt)
l_devel_raw, l_devel_target, l_test_raw, l_test_target = load_datasets(DATAPATH)
l_tokenized_tr = do_tokenization(l_devel_raw, max_len=512)
l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop=0.2, max_val=2000, seed=opt.seed)
l_tokenized_te = do_tokenization(l_test_raw, max_len=512)
tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target)
va_dataset = TrainingDataset(l_split_va, l_split_val_target)
te_dataset = TrainingDataset(l_tokenized_te, l_test_target)
tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True)
va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=False)
te_dataloader = DataLoader(te_dataset, batch_size=2, shuffle=False)
# Initializing model
model = get_model(73)
model = model.cuda()
criterion = torch.nn.BCEWithLogitsLoss().cuda()
optim = init_optimizer(model, lr=opt.lr)
# lr_scheduler = StepLR(optim, step_size=25, gamma=0.5)
early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience,
checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_dataset_name(opt.dataset)}')
# lr_scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optim, num_warmup_steps= , num_training_steps=)
# print(model)
# Freezing encoder
# model = freeze_encoder(model)
check_param_grad_status(model)
# Training loop
tinit = time()
lang_ids = va_dataset.lang_ids
for epoch in range(1, opt.nepochs+1):
print('# Start Training ...')
train(model, tr_dataloader, epoch, criterion, optim, 'TestingBert', tinit, logfile)
# lr_scheduler.step(epoch=None) # reduces the learning rate
# validation
macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va')
early_stop(macrof1, epoch)
if opt.test_each>0:
if (opt.plotmode and (epoch==1 or epoch%opt.test_each==0)) or (not opt.plotmode and epoch%opt.test_each==0 and epoch<opt.nepochs):
test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te')
if early_stop.STOP:
print('[early-stop] STOP')
if not opt.plotmode:
break
if opt.plotmode==False:
print('-' * 80)
print('Training over. Performing final evaluation')
model = early_stop.restore_checkpoint()
if opt.val_epochs>0:
print(f'running last {opt.val_epochs} training epochs on the validation set')
for val_epoch in range(1, opt.val_epochs + 1):
train(model, va_dataloader, epoch+val_epoch, criterion, optim, 'TestingBert', tinit, logfile)
# final test
print('Training complete: testing')
test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te')
exit('Code Executed!')
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Neural text classification with Word-Class Embeddings - mBert model')
parser.add_argument('--dataset', type=str, default='/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle',
metavar='datasetpath', help=f'path to the pickled dataset')
parser.add_argument('--nepochs', type=int, default=200, metavar='int',
help='number of epochs (default: 200)')
parser.add_argument('--lr', type=float, default=2e-5, metavar='float',
help='learning rate (default: 2e-5)')
parser.add_argument('--weight_decay', type=float, default=0, metavar='float',
help='weight decay (default: 0)')
parser.add_argument('--patience', type=int, default=10, metavar='int',
help='patience for early-stop (default: 10)')
parser.add_argument('--log-interval', type=int, default=20, metavar='int',
help='how many batches to wait before printing training status')
parser.add_argument('--log-file', type=str, default='../log/log_mBert.csv', metavar='str',
help='path to the log csv file')
parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)')
parser.add_argument('--force', action='store_true', default=False,
help='do not check if this experiment has already been run')
parser.add_argument('--checkpoint-dir', type=str, default='../checkpoint', metavar='str',
help='path to the directory containing checkpoints')
parser.add_argument('--plotmode', action='store_true', default=False,
help='in plot mode executes a long run in order '
'to generate enough data to produce trend plots (test-each should be >0. This mode is '
'used to produce plots, and does not perform an evaluation on the test set.')
parser.add_argument('--test-each', type=int, default=0, metavar='int',
help='how many epochs to wait before invoking test (default: 0, only at the end)')
parser.add_argument('--val-epochs', type=int, default=1, metavar='int',
help='number of training epochs to perform on the validation set once training is over (default 1)')
opt = parser.parse_args()
# Testing different parameters ...
opt.weight_decay = 0.01
opt.patience = 5
main()
# TODO: refactor .cuda() -> .to(device) in order to check if the process is faster on CPU given the bigger batch size

View File

@ -1,7 +1,11 @@
import pandas as pd
import numpy as np
df = pd.read_csv("/home/andreapdr/funneling_pdr/src/results/results.csv", delimiter='\t')
pivot = pd.pivot_table(df, values=['time', 'macrof1', 'microf1', 'macrok', 'microk'], index=['method', 'embed'], aggfunc=[np.mean, np.std])
print(pivot)
print('Finished ...')
# df = pd.read_csv("/home/andreapdr/funneling_pdr/src/results/final_results.csv", delimiter='\t')
df = pd.read_csv("10run_rcv_final_results.csv", delimiter='\t')
pivot = pd.pivot_table(df, values=['macrof1', 'microf1', 'macrok', 'microk'], index=['method', 'id', 'optimp', 'zscore', 'l2', 'wescaler', 'pca', 'sif'], aggfunc=[np.mean, np.std])
with pd.option_context('display.max_rows', None):
print(pivot.round(3))
print('Finished ...')

11
src/run_mbert_rcv.sh Normal file
View File

@ -0,0 +1,11 @@
#!/usr/bin/env bash
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
logfile=../log/log_Mbert_rcv.csv
runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
python new_mbert.py --dataset $dataset --log-file $logfile --test-each 20
done

View File

@ -17,7 +17,7 @@ def get_weighted_average(We, x, w):
def compute_pc(X,npc=1):
"""
Compute the principal components. DO NOT MAKE THE DATA ZERO MEAN!
Compute the principal components.
:param X: X[i,:] is a data point
:param npc: number of principal components to remove
:return: component_[i,:] is the i-th pc

View File

@ -1,4 +1,5 @@
import warnings
import time
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
@ -143,6 +144,15 @@ class Index:
embedding_parts.append(F)
make_dumps = False
if make_dumps:
print(f'Dumping Embedding Matrices ...')
import pickle
with open(f'../dumps/dump_{self.lang}_rcv.pkl', 'wb') as outfile:
pickle.dump((self.lang, embedding_parts, self.word2index), outfile)
with open(f'../dumps/corpus_{self.lang}_rcv.pkl', 'wb') as outfile2:
pickle.dump((self.lang, self.devel_raw, self.devel_target), outfile2)
self.embedding_matrix = torch.cat(embedding_parts, dim=1)
print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]')
@ -155,6 +165,7 @@ class MultilingualIndex:
def __init__(self): #, add_language_trace=False):
self.l_index = {}
self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
# self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True, max_features=25000)
# self.add_language_trace=add_language_trace
def index(self, l_devel_raw, l_devel_target, l_test_raw, l_pretrained_vocabulary):
@ -189,30 +200,42 @@ class MultilingualIndex:
# pretrained_embeddings = torch.cat([pretrained_embeddings, lang_trace], dim=1)
def posterior_probabilities(self, max_training_docs_by_lang=5000):
def posterior_probabilities(self, max_training_docs_by_lang=5000, store_posteriors=False, stored_post=False):
# choose a maximum of "max_training_docs_by_lang" for training the calibrated SVMs
timeit = time.time()
lXtr = {l:Xtr for l,Xtr in self.get_lXtr().items()}
lYtr = {l:Ytr for l,Ytr in self.l_train_target().items()}
for l in self.langs:
n_elements = lXtr[l].shape[0]
if n_elements > max_training_docs_by_lang:
choice = np.random.permutation(n_elements)[:max_training_docs_by_lang]
lXtr[l] = lXtr[l][choice]
lYtr[l] = lYtr[l][choice]
if not stored_post:
for l in self.langs:
n_elements = lXtr[l].shape[0]
if n_elements > max_training_docs_by_lang:
choice = np.random.permutation(n_elements)[:max_training_docs_by_lang]
lXtr[l] = lXtr[l][choice]
lYtr[l] = lYtr[l][choice]
# train the posterior probabilities embedder
print('[posteriors] training a calibrated SVM')
learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto')
prob_embedder = PosteriorProbabilitiesEmbedder(learner, l2=False)
prob_embedder.fit(lXtr, lYtr)
# train the posterior probabilities embedder
print('[posteriors] training a calibrated SVM')
learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto')
prob_embedder = PosteriorProbabilitiesEmbedder(learner, l2=False)
prob_embedder.fit(lXtr, lYtr)
# transforms the training, validation, and test sets into posterior probabilities
print('[posteriors] generating posterior probabilities')
lPtr = prob_embedder.transform(self.get_lXtr())
lPva = prob_embedder.transform(self.get_lXva())
lPte = prob_embedder.transform(self.get_lXte())
print('[posteriors] done')
# transforms the training, validation, and test sets into posterior probabilities
print('[posteriors] generating posterior probabilities')
lPtr = prob_embedder.transform(self.get_lXtr())
lPva = prob_embedder.transform(self.get_lXva())
lPte = prob_embedder.transform(self.get_lXte())
# NB: Check splits indices !
if store_posteriors:
import pickle
with open('../dumps/posteriors_fulljrc.pkl', 'wb') as outfile:
pickle.dump([lPtr, lPva, lPte], outfile)
print(f'Successfully dumped posteriors!')
else:
import pickle
with open('../dumps/posteriors_fulljrc.pkl', 'rb') as infile:
lPtr, lPva, lPte = pickle.load(infile)
print(f'Successfully loaded stored posteriors!')
print(f'[posteriors] done in {time.time() - timeit}')
return lPtr, lPva, lPte
def get_lXtr(self):

View File

@ -6,7 +6,7 @@ from util.file import create_if_not_exist
class EarlyStopping:
def __init__(self, model, patience=20, verbose=True, checkpoint='./checkpoint.pt'):
def __init__(self, model, optimizer, patience=20, verbose=True, checkpoint='./checkpoint.pt'):
# set patience to 0 or -1 to avoid stopping, but still keeping track of the best value and model parameters
self.patience_limit = patience
self.patience = patience
@ -16,9 +16,10 @@ class EarlyStopping:
self.stop_time = None
self.checkpoint = checkpoint
self.model = model
self.optimizer = optimizer
self.STOP = False
def __call__(self, watch_score, epoch):
def __call__(self, watch_score, epoch): #model
if self.STOP: return #done
@ -29,6 +30,9 @@ class EarlyStopping:
if self.checkpoint:
self.print(f'[early-stop] improved, saving model in {self.checkpoint}')
torch.save(self.model, self.checkpoint)
# with open(self.checkpoint)
# torch.save({'state_dict': self.model.state_dict(),
# 'optimizer_state_dict': self.optimizer.state_dict()}, self.checkpoint)
else:
self.print(f'[early-stop] improved')
self.patience = self.patience_limit
@ -46,6 +50,7 @@ class EarlyStopping:
self.patience=self.patience_limit
def restore_checkpoint(self):
print(f'restoring best model from epoch {self.best_epoch}...')
return torch.load(self.checkpoint)
def print(self, msg):

View File

@ -5,8 +5,23 @@ import numpy as np
class PolylingualClassificationResults:
def __init__(self, file, autoflush=True, verbose=False):
self.file = file
self.columns = ['method', 'learner', 'embed', 'embed_type', 'pca_s', 'pca_u', 'optimp', 'dataset', 'time',
'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
self.columns = ['method',
'learner',
'optimp',
'sif',
'zscore',
'l2',
'wescaler',
'pca',
'id',
'dataset',
'time',
'lang',
'macrof1',
'microf1',
'macrok',
'microk',
'notes']
self.autoflush = autoflush
self.verbose = verbose
if os.path.exists(file):
@ -21,8 +36,8 @@ class PolylingualClassificationResults:
def already_calculated(self, id):
return (self.df['id'] == id).any()
def add_row(self, method, learner, embed, embed_type, pca_s, pca_u, optimp, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
s = pd.Series([method, learner, embed, embed_type, pca_s, pca_u, optimp, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
def add_row(self, method, learner, optimp, sif, zscore, l2, wescaler, pca, id, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
s = pd.Series([method, learner, optimp, sif, zscore, l2, wescaler, pca, id, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
self.df = self.df.append(s, ignore_index=True)
if self.autoflush: self.flush()
self.tell(s.to_string())