baseline multilingual BERT
parent 22b7ea7e66
commit d1fdad5f6e
@@ -1,10 +1,7 @@
import os
import pickle
from torchtext.vocab import Vectors
import torch
from abc import ABC, abstractmethod
from embeddings.supervised import get_supervised_embeddings
from util.decompositions import *
from util.SIF_embed import *
@@ -35,122 +32,10 @@ class PretrainedEmbeddings(ABC):
return source_idx, target_idx
|
||||
|
||||
|
||||
class WordEmbeddings:
|
||||
|
||||
def __init__(self, lang, we, worddim):
|
||||
self.lang = lang
|
||||
self.we = we
|
||||
self.worddim = worddim
|
||||
self.dimword = {v:k for k,v in self.worddim.items()}
|
||||
|
||||
@classmethod
|
||||
def load(cls, basedir, lang, word_preprocessor=None, dopickle=True):
|
||||
filename = 'wiki.multi.{}.vec'.format(lang)
|
||||
we_path = os.path.join(basedir, filename)
|
||||
|
||||
if dopickle and os.path.exists(we_path + '.pkl'):
|
||||
print('loading pkl in {}'.format(we_path + '.pkl'))
|
||||
(worddim, we) = pickle.load(open(we_path + '.pkl', 'rb'))
|
||||
else:
|
||||
word_registry = set()
|
||||
lines = open(we_path).readlines()
|
||||
nwords, dims = [int(x) for x in lines[0].split()]
|
||||
print('reading we of {} dimensions'.format(dims))
|
||||
we = np.zeros((nwords, dims), dtype=float)
|
||||
worddim = {}
|
||||
index = 0
|
||||
for i, line in enumerate(lines[1:]):
|
||||
if (i + 1) % 100 == 0:
|
||||
print('\r{}/{}'.format(i + 1, len(lines)), end='')
|
||||
word, *vals = line.split()
|
||||
wordp = word_preprocessor(word) if word_preprocessor is not None else word
|
||||
if wordp:
|
||||
wordp = wordp[0]
|
||||
if wordp in word_registry:
|
||||
print('warning: word <{}> generates a duplicate <{}> after preprocessing'.format(word,wordp))
|
||||
elif len(vals) == dims:
|
||||
worddim[wordp] = index
|
||||
we[index, :] = np.array(vals).astype(float)
|
||||
index += 1
|
||||
# else:
|
||||
# print('warning: word <{}> generates an empty string after preprocessing'.format(word))
|
||||
we = we[:index]
|
||||
print('load {} words'.format(index))
|
||||
if dopickle:
|
||||
print('saving...')
|
||||
pickle.dump((worddim, we), open(we_path + '.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
return WordEmbeddings(lang, we, worddim)
|
||||
|
||||
def vocabulary(self):
|
||||
return set(self.worddim.keys())
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self.we[self.worddim[key]]
|
||||
|
||||
def dim(self):
|
||||
return self.we.shape[1]
|
||||
|
||||
def __contains__(self, key):
|
||||
return key in self.worddim
|
||||
|
||||
def most_similar(self, word_vect, k):
|
||||
if word_vect.ndim == 1:
|
||||
word_vect = word_vect.reshape(1,-1)
|
||||
assert word_vect.shape[1] == self.dim(), 'inconsistent dimensions'
|
||||
|
||||
sim = np.dot(word_vect,self.we.T)
|
||||
order = np.argsort(-1*sim, axis=1)[:,:k]
|
||||
|
||||
similar_words = [[self.dimword[order[vi,ki]] for ki in range(k)] for vi in range(word_vect.shape[0])]
|
||||
sim_scores = sim[:,order]
|
||||
return similar_words, sim_scores
|
||||
|
||||
def get_vectors(self, wordlist):
|
||||
indexes = np.array([self.worddim[w] for w in wordlist])
|
||||
return self.we[indexes]
|
||||
|
||||
def restrict(self, vocabulary):
|
||||
# vocabulary is a set of terms to be kept
|
||||
active_vocabulary = sorted([w for w in vocabulary if w in self.worddim])
|
||||
lost = len(vocabulary)-len(active_vocabulary)
|
||||
if lost > 0: # some terms are missing, so it will be replaced by UNK
|
||||
print('warning: missing {} terms for lang {}'.format(lost, self.lang))
|
||||
self.we = self.get_vectors(active_vocabulary)
|
||||
assert self.we.shape[0] == len(active_vocabulary)
|
||||
self.dimword={i:w for i,w in enumerate(active_vocabulary)}
|
||||
self.worddim={w:i for i,w in enumerate(active_vocabulary)}
|
||||
return self
|
||||
|
||||
@classmethod
|
||||
def load_poly(cls, basedir, langs, lang_vocabularies=None, word_preprocessor=None):
|
||||
if lang_vocabularies is None:
|
||||
return cls.merge([cls.load(basedir,lang, word_preprocessor) for lang in langs])
|
||||
else:
|
||||
# assert all([l in lang_vocabularies for l in langs]), 'missing vocabulary for some languages'
|
||||
return cls.merge([cls.load(basedir, lang, word_preprocessor).restrict(lang_vocabularies[lang]) for lang in langs])
|
||||
|
||||
@classmethod
|
||||
def merge(cls, we_list):
|
||||
assert all([isinstance(we, WordEmbeddings) for we in we_list]), \
|
||||
'instances of {} expected'.format(WordEmbeddings.__name__)
|
||||
|
||||
polywe = []
|
||||
worddim = {}
|
||||
offset = 0
|
||||
for we in we_list:
|
||||
polywe.append(we.we)
|
||||
worddim.update({'{}::{}'.format(we.lang, w):d+offset for w,d in we.worddim.items()})
|
||||
offset = len(worddim)
|
||||
polywe = np.vstack(polywe)
|
||||
|
||||
return WordEmbeddings(lang='poly', we=polywe, worddim=worddim)
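# --- hedged usage sketch, not part of the commit ---
# Illustrates merge()/load_poly(): each word key is prefixed with its language
# code and row indices are offset per language. The toy vectors and vocabularies
# below are invented purely for illustration.
en = WordEmbeddings('en', np.array([[1., 0.], [0., 1.]]), {'dog': 0, 'cat': 1})
it = WordEmbeddings('it', np.array([[.5, .5]]), {'cane': 0})
poly = WordEmbeddings.merge([en, it])
# poly.worddim -> {'en::dog': 0, 'en::cat': 1, 'it::cane': 2}
# poly.we.shape -> (3, 2)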
|
||||
|
||||
|
||||
class FastTextWikiNews(Vectors):
|
||||
|
||||
url_base = 'Cannot auto-download MUSE embeddings'
|
||||
path = '/storage/andrea/FUNNELING/embeddings/wiki.multi.{}.vec'
|
||||
path = '../embeddings/wiki.multi.{}.vec'
|
||||
_name = '/wiki.multi.{}.vec'
|
||||
|
||||
def __init__(self, cache, language="en", **kwargs):
|
||||
|
|
@@ -159,42 +44,13 @@ class FastTextWikiNews(Vectors):
super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)
|
||||
|
||||
|
||||
|
||||
class EmbeddingsAligned(Vectors):
|
||||
|
||||
def __init__(self, type, path, lang, voc):
|
||||
# todo - rewrite as relative path
|
||||
self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec'
|
||||
self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT'
|
||||
self.path = path + self.name.format(lang)
|
||||
assert os.path.exists(path), f'pre-trained vectors not found in {path}'
|
||||
super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path)
|
||||
self.vectors = self.extract(voc)
|
||||
|
||||
def vocabulary(self):
|
||||
return set(self.stoi.keys())
|
||||
|
||||
def extract(self, words):
|
||||
source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi)
|
||||
extraction = torch.zeros((len(words), self.dim))
|
||||
extraction[source_idx] = self.vectors[target_idx]
|
||||
return extraction
|
||||
|
||||
def reduce(self, dim):
|
||||
pca = PCA(n_components=dim)
|
||||
self.vectors = pca.fit_transform(self.vectors)
|
||||
return
|
||||
|
||||
|
||||
class FastTextMUSE(PretrainedEmbeddings):
|
||||
|
||||
def __init__(self, path, lang, limit=None):
|
||||
super().__init__()
|
||||
print(f'Loading fastText pretrained vectors for language {lang} from {path}')
|
||||
assert os.path.exists(path), f'pre-trained vectors not found in {path}'
|
||||
self.embed = FastTextWikiNews(path, lang, max_vectors=limit)
|
||||
|
||||
|
||||
def vocabulary(self):
|
||||
return set(self.embed.stoi.keys())
|
||||
|
||||
|
|
@@ -204,114 +60,8 @@ class FastTextMUSE(PretrainedEmbeddings):
def extract(self, words):
|
||||
source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
|
||||
extraction = torch.zeros((len(words), self.dim()))
|
||||
# extraction = torch.empty(len(words), self.dim()).normal_(0, 1)
|
||||
extraction[source_idx] = self.embed.vectors[target_idx]
|
||||
return extraction
|
||||
|
||||
|
||||
class StorageEmbeddings:
|
||||
def __init__(self, path):
|
||||
self.path = path
|
||||
self.lang_U = dict()
|
||||
self.lang_S = dict()
|
||||
|
||||
def _add_embeddings_unsupervised(self, type, docs, vocs, max_label_space=300):
|
||||
for lang in docs.keys():
|
||||
print(f'# [unsupervised-matrix {type}] for {lang}')
|
||||
voc = np.asarray(list(zip(*sorted(vocs[lang].items(), key=lambda x: x[1])))[0])
|
||||
self.lang_U[lang] = EmbeddingsAligned(type, self.path, lang, voc).vectors
|
||||
print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n')
|
||||
nC = self.lang_U[lang].shape[1]
|
||||
if max_label_space == 0:
|
||||
print(f'Computing optimal number of PCA components along matrices U')
|
||||
optimal_n = get_optimal_dim(self.lang_U, 'U')
|
||||
self.lang_U = run_pca(optimal_n, self.lang_U)
|
||||
elif max_label_space < nC:
|
||||
print(f'Applying PCA to unsupervised matrix U')
|
||||
self.lang_U = run_pca(max_label_space, self.lang_U)
|
||||
|
||||
return
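# Note on the helpers used above (added comment, not part of the commit):
# get_optimal_dim and run_pca are imported from util.decompositions. As called
# here, run_pca(n_components, {lang: matrix}) is assumed to fit a PCA per
# language and return the reduced {lang: matrix} dict, so after
# self.lang_U = run_pca(300, self.lang_U) every U matrix has 300 columns.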
|
||||
|
||||
def _add_embeddings_supervised(self, docs, labels, reduction, max_label_space, voc):
|
||||
only_well_represented_C = False # TODO testing
|
||||
if only_well_represented_C:
|
||||
labels = labels.copy()
|
||||
min_prevalence = 0
|
||||
print(f'# REDUCING LABELS TO min_prevalence = {min_prevalence} in order to compute WCE Matrix ...')
|
||||
langs = list(docs.keys())
|
||||
well_repr_cats = np.logical_and.reduce([labels[lang].sum(axis=0)>min_prevalence for lang in langs])
|
||||
for lang in langs:
|
||||
labels[lang] = labels[lang][:, well_repr_cats]
|
||||
print(f'Target number reduced to: {labels[lang].shape[1]}\n')
|
||||
|
||||
for lang in docs.keys(): # compute supervised matrices S - then apply PCA
|
||||
print(f'# [supervised-matrix] for {lang}')
|
||||
self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang],
|
||||
reduction, max_label_space, voc[lang], lang)
|
||||
nC = self.lang_S[lang].shape[1]
|
||||
print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')
|
||||
|
||||
if max_label_space == 0: # looking for best n_components analyzing explained_variance_ratio
|
||||
print(f'Computing optimal number of PCA components along matrices S')
|
||||
optimal_n = get_optimal_dim(self.lang_S, 'S')
|
||||
print(f'Applying PCA(n_components={optimal_n})')
|
||||
self.lang_S = run_pca(optimal_n, self.lang_S)
|
||||
elif max_label_space == -1: # applying pca to the verticals stacked matrix of WCE embeddings
|
||||
print(f'Computing PCA on vertical stacked WCE embeddings')
|
||||
languages = self.lang_S.keys()
|
||||
_temp_stack = np.vstack([self.lang_S[lang] for lang in languages]) # stacking WCE vertically
|
||||
stacked_pca = PCA(n_components=_temp_stack.shape[1])
|
||||
stacked_pca.fit(_temp_stack)
|
||||
best_n = None
|
||||
_r = stacked_pca.explained_variance_ratio_
|
||||
_r = np.cumsum(_r)
|
||||
plt.plot(_r, label='Stacked Supervised')
|
||||
for i in range(len(_r) - 1, 1, -1):
|
||||
delta = _r[i] - _r[i - 1]
|
||||
if delta > 0:
|
||||
best_n = i
|
||||
break
|
||||
plt.show()
|
||||
stacked_pca = PCA(n_components=best_n)
|
||||
stacked_pca.fit(_temp_stack)
|
||||
print(f'Applying PCA(n_components={best_n})')
|
||||
for lang in languages:
|
||||
self.lang_S[lang] = stacked_pca.transform(self.lang_S[lang])
|
||||
elif max_label_space <= nC: # less or equal in order to reduce it to the same initial dimension
|
||||
print(f'Computing PCA on Supervised Matrix PCA(n_components:{max_label_space})')
|
||||
self.lang_S = run_pca(max_label_space, self.lang_S)
|
||||
|
||||
return
|
||||
|
||||
def SIF_embeddings(self):
|
||||
print('todo') # TODO
|
||||
|
||||
def _concatenate_embeddings(self, docs):
|
||||
_r = dict()
|
||||
for lang in self.lang_U.keys():
|
||||
_r[lang] = np.hstack((docs[lang].dot(self.lang_U[lang]), docs[lang].dot(self.lang_S[lang])))
|
||||
return _r
|
||||
|
||||
def fit(self, config, docs, vocs, labels):
|
||||
if config['unsupervised']:
|
||||
self._add_embeddings_unsupervised(config['we_type'], docs, vocs, config['dim_reduction_unsupervised'])
|
||||
if config['supervised']:
|
||||
self._add_embeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs)
|
||||
return self
|
||||
|
||||
def predict(self, config, docs):
|
||||
if config['supervised'] and config['unsupervised']:
|
||||
return self._concatenate_embeddings(docs)
|
||||
# todo testing applying pca to hstack muse + wce
|
||||
# _reduced = self._concatenate_embeddings(docs)
|
||||
# return run_pca(300, _reduced)
|
||||
elif config['supervised']:
|
||||
_r = dict()
|
||||
for lang in docs.keys():
|
||||
_r[lang] = docs[lang].dot(self.lang_S[lang])
|
||||
else:
|
||||
_r = dict()
|
||||
for lang in docs.keys():
|
||||
_r[lang] = docs[lang].dot(self.lang_U[lang])
|
||||
|
||||
return _r
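# --- hedged usage sketch, not part of the commit ---
# StorageEmbeddings is driven by a config dict; the keys below are exactly the
# ones read in fit()/predict(). docs, vocs and labels are the usual
# {lang: matrix} dicts built elsewhere in the pipeline (placeholders here).
# config = {'unsupervised': True, 'supervised': True, 'we_type': 'MUSE',
#           'dim_reduction_unsupervised': 300, 'reduction': 'PCA',
#           'max_label_space': 300}
# storage = StorageEmbeddings(path='../embeddings').fit(config, docs, vocs, labels)
# lang_embeddings = storage.predict(config, docs)  # {lang: hstack(U-view, S-view)}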
|
||||
|
|
|
|||
|
|
@@ -1,103 +1,102 @@
from abc import ABC, abstractmethod
|
||||
import torch, torchtext
|
||||
import gensim
|
||||
import os
|
||||
# import gensim
|
||||
# import os
|
||||
import numpy as np
|
||||
|
||||
|
||||
class KeyedVectors:
|
||||
|
||||
def __init__(self, word2index, weights):
|
||||
assert len(word2index)==weights.shape[0], 'wrong number of dimensions'
|
||||
index2word = {i:w for w,i in word2index.items()}
|
||||
assert len([i for i in range(len(index2word)) if i not in index2word])==0, 'gaps in indexing not allowed'
|
||||
self.word2index = word2index
|
||||
self.index2word = index2word
|
||||
self.weights = weights
|
||||
|
||||
def extract(self, words):
|
||||
dim = self.weights.shape[1]
|
||||
v_size = len(words)
|
||||
|
||||
source_idx, target_idx = [], []
|
||||
for i,word in enumerate(words):
|
||||
if word not in self.word2index: continue
|
||||
j = self.word2index[word]
|
||||
source_idx.append(i)
|
||||
target_idx.append(j)
|
||||
|
||||
extraction = np.zeros((v_size, dim))
|
||||
extraction[np.asarray(source_idx)] = self.weights[np.asarray(target_idx)]
|
||||
|
||||
return extraction
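# --- hedged usage sketch, not part of the commit ---
# extract() aligns a word list with the stored matrix: known words receive their
# stored row, unknown words keep a zero row. Toy data for illustration only.
kv = KeyedVectors({'dog': 0, 'cat': 1}, np.array([[1., 2.], [3., 4.]]))
E = kv.extract(['cat', 'unknown', 'dog'])
# E -> [[3., 4.], [0., 0.], [1., 2.]]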
|
||||
# class KeyedVectors:
|
||||
#
|
||||
# def __init__(self, word2index, weights):
|
||||
# assert len(word2index)==weights.shape[0], 'wrong number of dimensions'
|
||||
# index2word = {i:w for w,i in word2index.items()}
|
||||
# assert len([i for i in range(len(index2word)) if i not in index2word])==0, 'gaps in indexing not allowed'
|
||||
# self.word2index = word2index
|
||||
# self.index2word = index2word
|
||||
# self.weights = weights
|
||||
#
|
||||
# def extract(self, words):
|
||||
# dim = self.weights.shape[1]
|
||||
# v_size = len(words)
|
||||
#
|
||||
# source_idx, target_idx = [], []
|
||||
# for i,word in enumerate(words):
|
||||
# if word not in self.word2index: continue
|
||||
# j = self.word2index[word]
|
||||
# source_idx.append(i)
|
||||
# target_idx.append(j)
|
||||
#
|
||||
# extraction = np.zeros((v_size, dim))
|
||||
# extraction[np.asarray(source_idx)] = self.weights[np.asarray(target_idx)]
|
||||
#
|
||||
# return extraction
|
||||
|
||||
|
||||
|
||||
class PretrainedEmbeddings(ABC):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
@abstractmethod
|
||||
def vocabulary(self): pass
|
||||
|
||||
@abstractmethod
|
||||
def dim(self): pass
|
||||
|
||||
@classmethod
|
||||
def reindex(cls, words, word2index):
|
||||
source_idx, target_idx = [], []
|
||||
for i, word in enumerate(words):
|
||||
if word not in word2index: continue
|
||||
j = word2index[word]
|
||||
source_idx.append(i)
|
||||
target_idx.append(j)
|
||||
source_idx = np.asarray(source_idx)
|
||||
target_idx = np.asarray(target_idx)
|
||||
return source_idx, target_idx
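# --- hedged worked example, not part of the commit ---
# reindex() maps a word list onto an existing word->index table and returns the
# matching positions on both sides. Toy inputs for illustration only.
src, tgt = PretrainedEmbeddings.reindex(['b', 'x', 'a'], {'a': 0, 'b': 1})
# src -> array([0, 2])  positions in the input list that were found
# tgt -> array([1, 0])  corresponding rows in the pretrained matrix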
|
||||
# class PretrainedEmbeddings(ABC):
|
||||
#
|
||||
# def __init__(self):
|
||||
# super().__init__()
|
||||
#
|
||||
# @abstractmethod
|
||||
# def vocabulary(self): pass
|
||||
#
|
||||
# @abstractmethod
|
||||
# def dim(self): pass
|
||||
#
|
||||
# @classmethod
|
||||
# def reindex(cls, words, word2index):
|
||||
# source_idx, target_idx = [], []
|
||||
# for i, word in enumerate(words):
|
||||
# if word not in word2index: continue
|
||||
# j = word2index[word]
|
||||
# source_idx.append(i)
|
||||
# target_idx.append(j)
|
||||
# source_idx = np.asarray(source_idx)
|
||||
# target_idx = np.asarray(target_idx)
|
||||
# return source_idx, target_idx
|
||||
|
||||
|
||||
class GloVe(PretrainedEmbeddings):
|
||||
|
||||
def __init__(self, setname='840B', path='./vectors_cache', max_vectors=None):
|
||||
super().__init__()
|
||||
print(f'Loading GloVe pretrained vectors from torchtext')
|
||||
self.embed = torchtext.vocab.GloVe(setname, cache=path, max_vectors=max_vectors)
|
||||
print('Done')
|
||||
|
||||
def vocabulary(self):
|
||||
return set(self.embed.stoi.keys())
|
||||
|
||||
def dim(self):
|
||||
return self.embed.dim
|
||||
|
||||
def extract(self, words):
|
||||
source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
|
||||
extraction = torch.zeros((len(words), self.dim()))
|
||||
extraction[source_idx] = self.embed.vectors[target_idx]
|
||||
return extraction
|
||||
# class GloVe(PretrainedEmbeddings):
|
||||
#
|
||||
# def __init__(self, setname='840B', path='./vectors_cache', max_vectors=None):
|
||||
# super().__init__()
|
||||
# print(f'Loading GloVe pretrained vectors from torchtext')
|
||||
# self.embed = torchtext.vocab.GloVe(setname, cache=path, max_vectors=max_vectors)
|
||||
# print('Done')
|
||||
#
|
||||
# def vocabulary(self):
|
||||
# return set(self.embed.stoi.keys())
|
||||
#
|
||||
# def dim(self):
|
||||
# return self.embed.dim
|
||||
#
|
||||
# def extract(self, words):
|
||||
# source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
|
||||
# extraction = torch.zeros((len(words), self.dim()))
|
||||
# extraction[source_idx] = self.embed.vectors[target_idx]
|
||||
# return extraction
|
||||
|
||||
|
||||
class Word2Vec(PretrainedEmbeddings):
|
||||
|
||||
def __init__(self, path, limit=None):
|
||||
super().__init__()
|
||||
print(f'Loading word2vec pretrained vectors from {path}')
|
||||
assert os.path.exists(path), f'pre-trained keyed vectors not found in {path}'
|
||||
self.embed = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True, limit=limit)
|
||||
self.word2index={w:i for i,w in enumerate(self.embed.index2word)}
|
||||
print('Done')
|
||||
|
||||
def vocabulary(self):
|
||||
return set(self.word2index.keys())
|
||||
|
||||
def dim(self):
|
||||
return self.embed.vector_size
|
||||
|
||||
def extract(self, words):
|
||||
source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.word2index)
|
||||
extraction = np.zeros((len(words), self.dim()))
|
||||
extraction[source_idx] = self.embed.vectors[target_idx]
|
||||
extraction = torch.from_numpy(extraction).float()
|
||||
return extraction
|
||||
# class Word2Vec(PretrainedEmbeddings):
|
||||
#
|
||||
# def __init__(self, path, limit=None):
|
||||
# super().__init__()
|
||||
# print(f'Loading word2vec pretrained vectors from {path}')
|
||||
# assert os.path.exists(path), print(f'pre-trained keyed vectors not found in {path}')
|
||||
# self.embed = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True, limit=limit)
|
||||
# self.word2index={w:i for i,w in enumerate(self.embed.index2word)}
|
||||
# print('Done')
|
||||
#
|
||||
# def vocabulary(self):
|
||||
# return set(self.word2index.keys())
|
||||
#
|
||||
# def dim(self):
|
||||
# return self.embed.vector_size
|
||||
#
|
||||
# def extract(self, words):
|
||||
# source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.word2index)
|
||||
# extraction = np.zeros((len(words), self.dim()))
|
||||
# extraction[source_idx] = self.embed.vectors[target_idx]
|
||||
# extraction = torch.from_numpy(extraction).float()
|
||||
# return extraction
|
||||
|
||||
|
|
|
|||
|
|
@@ -1,7 +1,5 @@
from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square
|
||||
import numpy as np
|
||||
# from sklearn.decomposition import PCA
|
||||
# from sklearn.manifold import TSNE
|
||||
|
||||
|
||||
def zscores(x, axis=0):  # scipy.stats.zscore does not avoid division by 0, which can indeed occur
|
||||
|
|
@@ -69,31 +67,6 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, la
|
||||
return F
|
||||
|
||||
# if nC >= max_label_space:
|
||||
# if reduction == 'PCA':
|
||||
# if max_label_space == 0:
|
||||
# pca = PCA(n_components=Y.shape[1])
|
||||
# pca = pca.fit(F)
|
||||
# return pca.explained_variance_ratio_
|
||||
#
|
||||
# print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
|
||||
# f'Applying PCA(n_components={max_label_space})')
|
||||
# pca = PCA(n_components=max_label_space)
|
||||
# F = pca.fit_transform(F)
|
||||
# elif reduction == 'TSNE':
|
||||
# print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
|
||||
# f'Applying t-SNE(n_components={max_label_space})')
|
||||
# tsne = TSNE(n_components=max_label_space)
|
||||
# F = tsne.fit_transform(F)
|
||||
# elif reduction == 'tSVD':
|
||||
# print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
|
||||
# f'Applying truncatedSVD(n_components={max_label_space})')
|
||||
# tSVD = TruncatedSVD(n_components=max_label_space)
|
||||
# F = tSVD.fit_transform(F)
|
||||
#
|
||||
# return F
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
|
||||
|
||||
dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
|
||||
logfile=../log/log10run_dl_jrc.csv
|
||||
|
||||
runs='0 1 2 3 4 5 6 7 8 9'
|
||||
for run in $runs
|
||||
do
|
||||
dataset=$dataset_path$run.pickle
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
|
||||
done
|
||||
|
|
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
|
||||
|
||||
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
|
||||
logfile=../log/log10run_dl_rcv.csv
|
||||
|
||||
runs='0 1 2 3 4 5 6 7 8 9'
|
||||
for run in $runs
|
||||
do
|
||||
dataset=$dataset_path$run.pickle
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
|
||||
done
|
||||
|
|
@@ -0,0 +1,12 @@
dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
|
||||
logfile=./results/10run_jrc_final_results.csv
|
||||
|
||||
runs='0 1 2 3 4 5 6 7 8 9'
|
||||
for run in $runs
|
||||
do
|
||||
dataset=$dataset_path$run.pickle
|
||||
python main_multimodal_cls.py $dataset -o $logfile -P -z -c --l2
|
||||
python main_multimodal_cls.py $dataset -o $logfile -S -z -c --l2
|
||||
python main_multimodal_cls.py $dataset -o $logfile -U -z -c --l2
|
||||
|
||||
done
|
||||
|
|
@@ -0,0 +1,16 @@
dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
|
||||
logfile=./results/funnelling_10run_jrc_CIKM.csv
|
||||
|
||||
runs='6 7 8 9' #0 1 2 3 4 5
|
||||
for run in $runs
|
||||
do
|
||||
dataset=$dataset_path$run.pickle
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 Pr(views) concatenated (done up to run5)
|
||||
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 views concatenated
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -a -z --l2 --allprob
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -a -z --l2 --allprob
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -P -S -c -r -z --l2 --allprob
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -z --l2 --allprob
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -c -P -U -r -z --l2
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -c -P -U -S -r -z --l2
|
||||
done
|
||||
|
|
@@ -0,0 +1,15 @@
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
|
||||
logfile=./results/10run_rcv_final_results.csv
|
||||
|
||||
runs='0 1 2 3 4 5 6 7 8 9'
|
||||
|
||||
for run in $runs
|
||||
do
|
||||
dataset=$dataset_path$run.pickle
|
||||
python main_multimodal_cls.py $dataset -o $logfile -P -z -c --l2
|
||||
python main_multimodal_cls.py $dataset -o $logfile -S -z -c --l2
|
||||
python main_multimodal_cls.py $dataset -o $logfile -U -z -c --l2
|
||||
|
||||
done
|
||||
|
||||
|
||||
|
|
@@ -0,0 +1,16 @@
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
|
||||
logfile=./results/funnelling_10run_rcv_CIKM_allprob_concatenated.csv
|
||||
|
||||
runs='0 1 2 3 4 5 6 7 8 9'
|
||||
for run in $runs
|
||||
do
|
||||
dataset=$dataset_path$run.pickle
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 Pr(views) concatenated
|
||||
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 views concatenated
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -a -z --l2 --allprob
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -a -z --l2 --allprob
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -P -S -c -r -z --l2 --allprob
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -z --l2 --allprob
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -c -P -U -r -z --l2
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -c -P -U -S -r -z --l2
|
||||
done
|
||||
|
|
@@ -0,0 +1,34 @@
#!/usr/bin/env bash
|
||||
|
||||
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
|
||||
logfile=./results/final_combinations_jrc.csv
|
||||
#A.2: ensembling feature sets (combinations of posteriors, wce, muse):
|
||||
# - exploring different ways of putting different feature sets together: concatenation, FeatureSetToPosteriors, averaging, voting, etc...
|
||||
# (none seems to improve over standard funnelling [the improved version after A.1] with posterior probabilities...)
|
||||
|
||||
# aggregation=concatenation
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2
|
||||
#
|
||||
|
||||
##FeatureSetToPosteriors (aggregation mean)
|
||||
python main_multimodal_cls.py $dataset -o $logfile -P -U -r -a -z --l2 --allprob
|
||||
python main_multimodal_cls.py $dataset -o $logfile -P -S -r -a -z --l2 --allprob
|
||||
python main_multimodal_cls.py $dataset -o $logfile -U -S -r -a -z --l2 --allprob
|
||||
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -a -z --l2 --allprob
|
||||
|
||||
##FeatureSetToPosteriors
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2 --allprob
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2 --allprob
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2 --allprob
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2 --allprob
|
||||
|
||||
#MajorityVoting
|
||||
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -r
|
||||
#python main_majorityvoting_cls.py $dataset -o $logfile -P -S -r
|
||||
#python main_majorityvoting_cls.py $dataset -o $logfile -U -S -r
|
||||
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -S -r
|
||||
|
||||
|
||||
|
|
@@ -0,0 +1,31 @@
#!/usr/bin/env bash
|
||||
|
||||
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
|
||||
logfile=./results/final_combinations_rcv.csv
|
||||
#A.2: ensembling feature sets (combinations of posteriors, wce, muse):
|
||||
# - exploring different ways of putting different feature sets together: concatenation, FeatureSetToPosteriors, averaging, voting, etc...
|
||||
# (none seems to improve over standard funnelling [the improved version after A.1] with posterior probabilities...)
|
||||
|
||||
# aggregation=concatenation
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2
|
||||
#
|
||||
##FeatureSetToPosteriors (aggregation mean)
|
||||
python main_multimodal_cls.py $dataset -o $logfile -P -U -r -a -z --l2 --allprob
|
||||
python main_multimodal_cls.py $dataset -o $logfile -P -S -r -a -z --l2 --allprob
|
||||
python main_multimodal_cls.py $dataset -o $logfile -U -S -r -a -z --l2 --allprob
|
||||
python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -a -z --l2 --allprob
|
||||
|
||||
##FeatureSetToPosteriors
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2 --allprob
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2 --allprob
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2 --allprob
|
||||
#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2 --allprob
|
||||
|
||||
#MajorityVoting
|
||||
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -r
|
||||
#python main_majorityvoting_cls.py $dataset -o $logfile -P -S -r
|
||||
#python main_majorityvoting_cls.py $dataset -o $logfile -U -S -r
|
||||
#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -S -r
|
||||
|
|
@@ -0,0 +1,31 @@
#!/usr/bin/env bash
|
||||
|
||||
logfile=../log/log_pre_jrc.csv
|
||||
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 128 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 128 --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 256 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 256 --tunable --plotmode --test-each 20
|
||||
|
||||
python main_deep_learning.py $dataset --log-file $logfile --supervised --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --supervised --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 128 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 128 --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 256 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 256 --tunable --plotmode --test-each 20
|
||||
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 128 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 128 --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 256 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 256 --tunable --plotmode --test-each 20
|
||||
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 128 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 128 --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 256 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 256 --tunable --plotmode --test-each 20
|
||||
|
|
@@ -0,0 +1,30 @@
#!/usr/bin/env bash
|
||||
|
||||
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
|
||||
python main_deep_learning.py $dataset --pretrained --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --hidden 128 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --hidden 128 --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --hidden 256 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --hidden 256 --tunable --plotmode --test-each 20
|
||||
|
||||
python main_deep_learning.py $dataset --supervised --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --supervised --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --supervised --hidden 128 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --supervised --hidden 128 --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --supervised --hidden 256 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --supervised --hidden 256 --tunable --plotmode --test-each 20
|
||||
|
||||
python main_deep_learning.py $dataset --pretrained --supervised --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --supervised --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --supervised --hidden 128 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --supervised --hidden 128 --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --supervised --hidden 256 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --supervised --hidden 256 --tunable --plotmode --test-each 20
|
||||
|
||||
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 128 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 128 --tunable --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 256 --plotmode --test-each 20
|
||||
python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 256 --tunable --plotmode --test-each 20
|
||||
|
|
@@ -0,0 +1,16 @@
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle
|
||||
seeds='5' #2 3 4 5 6 7 8 9 10'
|
||||
for seed in $seeds
|
||||
do
|
||||
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce.csv --supervised --seed $seed
|
||||
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_trainable.csv --supervised --tunable --seed $seed
|
||||
python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_post_wce_muse_static.csv --posteriors --supervised --pretrained --seed $seed --force
|
||||
|
||||
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_muse.csv --pretrained --seed $seed
|
||||
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_muse_trainable.csv --pretrained --tunable --seed $seed
|
||||
|
||||
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_muse.csv --supervised --pretrained --seed $seed
|
||||
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_muse_trainable40000.csv --supervised --pretrained --tunable --seed $seed
|
||||
#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_post_wce_muse_trainable.csv --posteriors --supervised --pretrained --tunable --seed $seed --force
|
||||
|
||||
done
|
||||
|
|
@@ -0,0 +1,20 @@
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
|
||||
seeds='1 ' #2 3 4 5' # 6 7 8 9 10'
|
||||
for seed in $seeds
|
||||
do
|
||||
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce.csv --supervised --seed $seed
|
||||
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_trainable.csv --supervised --tunable --seed $seed
|
||||
python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_static_plotmode.csv --posteriors --supervised --pretrained --seed $seed --plotmode --test-each 200
|
||||
|
||||
|
||||
|
||||
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_muse.csv --pretrained --seed $seed
|
||||
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_muse_trainable.csv --pretrained --tunable --seed $seed
|
||||
|
||||
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_muse.csv --supervised --pretrained --seed $seed
|
||||
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_muse_trainable.csv --supervised --pretrained --tunable --seed $seed
|
||||
|
||||
# python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_static.csv --posteriors --supervised --pretrained --seed $seed
|
||||
# python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_trainable_plotmode.csv --posteriors --supervised --pretrained --tunable --seed $seed --plotmode --test-each 200
|
||||
#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_trainable.csv --posteriors --supervised --pretrained --tunable --seed $seed
|
||||
done
|
||||
|
|
@@ -0,0 +1,45 @@
#!/usr/bin/env bash
|
||||
|
||||
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
|
||||
|
||||
######################################## POSTERIORS
|
||||
# Posteriors
|
||||
python main_multimodal_cls.py $dataset -P # + zscore
|
||||
python main_multimodal_cls.py $dataset -P -z # +l2norm
|
||||
python main_multimodal_cls.py $dataset -P -z --l2 # +feature weight
|
||||
|
||||
|
||||
######################################### WCE
|
||||
#WCE supervised
|
||||
python main_multimodal_cls.py $dataset -S # + zscore
|
||||
python main_multimodal_cls.py $dataset -S -z # +l2norm
|
||||
python main_multimodal_cls.py $dataset -S -z --l2 # +feature weight
|
||||
python main_multimodal_cls.py $dataset -S -z -r --l2 # + SIF - PCA
|
||||
|
||||
python main_multimodal_cls.py $dataset -S -z -p 250 --l2 # +feature weight + pca
|
||||
python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 # + SIF
|
||||
|
||||
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight ig # -feature weight
|
||||
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight ig
|
||||
python main_multimodal_cls.py $dataset -S -z -p 250 --l2 --feat-weight ig # + pca
|
||||
python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 --feat-weight ig
|
||||
|
||||
|
||||
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight pmi
|
||||
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight pmi
|
||||
python main_multimodal_cls.py $dataset -S -z -p 250 --l2 --feat-weight pmi
|
||||
python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 --feat-weight pmi
|
||||
|
||||
################################# MUSE
|
||||
|
||||
# MUSE unsupervised
|
||||
python main_multimodal_cls.py $dataset -U # + zscore
|
||||
python main_multimodal_cls.py $dataset -U -z # +l2norm
|
||||
python main_multimodal_cls.py $dataset -U -z --l2 # +feature weight
|
||||
python main_multimodal_cls.py $dataset -U -z -r --l2 # + SIF - PCA
|
||||
|
||||
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight ig # -feature weight + pca
|
||||
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight ig
|
||||
|
||||
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight pmi
|
||||
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight pmi
|
||||
|
|
@@ -0,0 +1,45 @@
#!/usr/bin/env bash
|
||||
|
||||
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
|
||||
|
||||
######################################## POSTERIORS
|
||||
# Posteriors
|
||||
python main_multimodal_cls.py $dataset -P # + zscore
|
||||
python main_multimodal_cls.py $dataset -P -z # +l2norm
|
||||
python main_multimodal_cls.py $dataset -P -z --l2 # +feature weight
|
||||
|
||||
|
||||
######################################### WCE
|
||||
#WCE supervised
|
||||
python main_multimodal_cls.py $dataset -S # + zscore
|
||||
python main_multimodal_cls.py $dataset -S -z # +l2norm
|
||||
python main_multimodal_cls.py $dataset -S -z --l2 # +feature weight
|
||||
python main_multimodal_cls.py $dataset -S -z -r --l2 # + SIF - PCA
|
||||
|
||||
python main_multimodal_cls.py $dataset -S -z -p 50 --l2 # +feature weight + pca
|
||||
python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 # + SIF
|
||||
|
||||
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight ig # -feature weight
|
||||
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight ig
|
||||
python main_multimodal_cls.py $dataset -S -z -p 50 --l2 --feat-weight ig # + pca
|
||||
python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 --feat-weight ig
|
||||
|
||||
|
||||
python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight pmi
|
||||
python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight pmi
|
||||
python main_multimodal_cls.py $dataset -S -z -p 50 --l2 --feat-weight pmi
|
||||
python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 --feat-weight pmi
|
||||
|
||||
################################# MUSE
|
||||
|
||||
# MUSE unsupervised
|
||||
python main_multimodal_cls.py $dataset -U # + zscore
|
||||
python main_multimodal_cls.py $dataset -U -z # +l2norm
|
||||
python main_multimodal_cls.py $dataset -U -z --l2 # +feature weight
|
||||
python main_multimodal_cls.py $dataset -U -z -r --l2 # + SIF - PCA
|
||||
|
||||
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight ig # -feature weight + pca
|
||||
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight ig
|
||||
|
||||
python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight pmi
|
||||
python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight pmi
|
||||
|
|
@@ -0,0 +1,6 @@
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
|
||||
seeds='1 2 3 4 5 6 7 8 9 10'
|
||||
for seed in $seeds
|
||||
do
|
||||
python main_deep_learning.py $dataset --log-file ../log/time_GRU.csv --supervised --nepochs 50 --seed $seed
|
||||
done
|
||||
|
|
@@ -1,15 +1,15 @@
import numpy as np
|
||||
import time
|
||||
from embeddings.embeddings import WordEmbeddings, StorageEmbeddings
|
||||
# from embeddings.embeddings import WordEmbeddings, StorageEmbeddings
|
||||
from scipy.sparse import issparse
|
||||
from sklearn.multiclass import OneVsRestClassifier
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.model_selection import KFold
|
||||
# from sklearn.model_selection import KFold
|
||||
from joblib import Parallel, delayed
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from transformers.StandardizeTransformer import StandardizeTransformer
|
||||
from sklearn.decomposition import PCA
|
||||
from models.cnn_class_bu import CNN_pdr
|
||||
# from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
# from util_transformers.StandardizeTransformer import StandardizeTransformer
|
||||
# from sklearn.decomposition import PCA
|
||||
# from models.cnn_class_bu import CNN_pdr
|
||||
|
||||
|
||||
def _sort_if_sparse(X):
|
||||
|
|
@@ -40,154 +40,154 @@ class TrivialRejector:
def best_params(self): return {}
|
||||
|
||||
|
||||
class FunnellingPolylingualClassifier:
|
||||
"""
|
||||
This classifier projects each document d into a language-independent feature space where each dimension fi is the
|
||||
decision score phi_l(d,ci) of an auxiliary classifier phi_l trained on category ci for documents in language l;
|
||||
then trains a single classifier for all documents in this space, irrespective of their original language
|
||||
"""
|
||||
def __init__(self, first_tier_learner, meta_learner, first_tier_parameters=None, meta_parameters=None, folded_projections=1,
|
||||
calmode='cal', n_jobs=-1):
|
||||
"""
|
||||
:param first_tier_learner: the learner used in the first-tier level
|
||||
:param meta_learner: the learner used in the second-tier level
|
||||
:param first_tier_parameters: parameters for the learner in the doc_projector
|
||||
:param meta_parameters: parameters for the learner in the z-space
|
||||
:param folded_projections: if 1 then the model trains the auxiliary classifiers with all training data and
projects the data before training the final classifier; if greater than one, the training set is split in as
many folds as indicated, and the projected space is composed by concatenating each fold prediction based on
models trained on the remaining folds. This should increase the generality of the space to unseen data.
:param calmode: 'cal' to calibrate the base classifiers, 'nocal' to use the decision_function instead, or
'sigmoid' to use the sigmoid of the decision_function
:param n_jobs: number of parallel threads
|
||||
"""
|
||||
assert folded_projections>0, "positive number of folds expected"
|
||||
assert calmode in ['cal','nocal','sigmoid'], 'unknown calmode'
|
||||
assert calmode!='cal' or first_tier_learner.probability, 'calmode=cal requires the learner to have probability=True'
|
||||
|
||||
self.fist_tier_learner = first_tier_learner
|
||||
self.meta_learner = meta_learner
|
||||
self.fist_tier_parameters=first_tier_parameters
|
||||
self.meta_parameters = meta_parameters
|
||||
self.doc_projector = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs)
|
||||
self.doc_projector_bu = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs)
|
||||
self.folded_projections = folded_projections
|
||||
self.n_jobs = n_jobs
|
||||
self.calmode = calmode
|
||||
|
||||
def _projection(self, doc_projector, lX):
|
||||
"""
|
||||
Decides the projection function to be applied; predict_proba if the base classifiers are calibrated or
|
||||
decision_function if otherwise
|
||||
:param doc_projector: the document projector (a NaivePolylingualClassifier)
|
||||
:param lX: {lang:matrix} to train
|
||||
:return: the projection, applied with predict_proba or decision_function
|
||||
"""
|
||||
if self.calmode=='cal':
|
||||
return doc_projector.predict_proba(lX)
|
||||
else:
|
||||
l_decision_scores = doc_projector.decision_function(lX)
|
||||
if self.calmode=='sigmoid':
|
||||
def sigmoid(x): return 1 / (1 + np.exp(-x))
|
||||
for lang in l_decision_scores.keys():
|
||||
l_decision_scores[lang] = sigmoid(l_decision_scores[lang])
|
||||
return l_decision_scores
|
||||
|
||||
def _get_zspace(self, lXtr, lYtr, lXproj=None, lYproj=None):
|
||||
"""
|
||||
Produces the vector space of posterior probabilities (in case the first-tier is calibrated) or of
|
||||
decision scores (if otherwise). This space is here named zspace.
|
||||
:param lXtr: {lang:matrix} to train
|
||||
:param lYtr: {lang:labels} to train
|
||||
:param lXproj: {lang:matrix} to project (if None, then projects the lXtr)
|
||||
:param lYproj: {lang:labels} to stack in the same order (if None, then lYtr will be stacked)
|
||||
:return: the projection of lXproj documents into the Z-space defined by the confidence scores of language-specific
|
||||
models trained on lXtr, and the lYproj labels stacked consistently
|
||||
"""
|
||||
repair_empty_folds = True
|
||||
if lXproj is None and lYproj is None:
|
||||
lXproj, lYproj = lXtr, lYtr
|
||||
repair_empty_folds = False
|
||||
|
||||
print('fitting the projectors... {}'.format(lXtr.keys()))
|
||||
self.doc_projector.fit(lXtr, lYtr)
|
||||
|
||||
print('projecting the documents')
|
||||
langs = list(lXtr.keys())
|
||||
lZ = self._projection(self.doc_projector, lXproj)
|
||||
|
||||
# if repair_empty_folds: #empty folds are replaced by the posterior probabilities generated by the non-folded version
|
||||
empty_categories = self.doc_projector.empty_categories
|
||||
lZ_bu = self._projection(self.doc_projector_bu, lXproj)
|
||||
|
||||
for lang in langs:
|
||||
repair = empty_categories[lang]
|
||||
lZ[lang][:,repair] = lZ_bu[lang][:,repair]
|
||||
|
||||
Z = np.vstack([lZ[lang] for lang in langs]) # Z is the language independent space
|
||||
zy = np.vstack([lYproj[lang] for lang in langs])
|
||||
return Z, zy
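# Shape note (added comment, not part of the commit): with L languages, n_l
# projected documents for language l and C categories, each lZ[lang] is
# (n_l, C), so Z is (sum_l n_l, C) and zy stacks the corresponding label rows
# in the same per-language order.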
|
||||
|
||||
def _get_zspace_folds(self, lX, ly):
|
||||
self.doc_projector_bu.fit(lX, ly)
|
||||
|
||||
print('split of {} folds'.format(self.folded_projections))
|
||||
skf = KFold(n_splits=self.folded_projections, shuffle=True)
|
||||
|
||||
Z, zy = [], []
|
||||
lfold = {lang: list(skf.split(lX[lang], ly[lang])) for lang in lX.keys()}
|
||||
for fold in range(self.folded_projections):
|
||||
print('fitting the projectors ({}/{})...'.format(fold + 1, self.folded_projections))
|
||||
lfoldXtr, lfoldYtr = {}, {}
|
||||
lfoldXte, lfoldYte = {}, {}
|
||||
for lang in lX.keys():
|
||||
train, test = lfold[lang][fold]
|
||||
lfoldXtr[lang] = lX[lang][train]
|
||||
lfoldYtr[lang] = ly[lang][train]
|
||||
lfoldXte[lang] = lX[lang][test]
|
||||
lfoldYte[lang] = ly[lang][test]
|
||||
Zfold, zYfold = self._get_zspace(lfoldXtr, lfoldYtr, lfoldXte, lfoldYte)
|
||||
Z.append(Zfold)
|
||||
zy.append(zYfold)
|
||||
# compose the Z-space as the union of all folded predictions
|
||||
Z = np.vstack(Z)
|
||||
zy = np.vstack(zy)
|
||||
# refit the document projector with all examples to have a more reliable projector for test data
|
||||
self.doc_projector = self.doc_projector_bu
|
||||
return Z, zy
|
||||
|
||||
def fit(self, lX, ly, lZ=None, lzy=None):
|
||||
tinit = time.time()
|
||||
Z, zy = self._get_zspace(lX, ly) if self.folded_projections == 1 else self._get_zspace_folds(lX, ly)
|
||||
|
||||
#experimental: adds the posterior probabilities (computed outside) to the meta-classifier
|
||||
if lZ is not None and lzy is not None:
|
||||
zlangs = list(lZ.keys())
|
||||
Z = np.vstack((Z, *[lZ[l] for l in zlangs]))
|
||||
zy = np.vstack((zy, *[lzy[l] for l in zlangs]))
|
||||
|
||||
print('fitting the Z-space of shape={}'.format(Z.shape))
|
||||
self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters, n_jobs=self.n_jobs)
|
||||
self.model.fit(Z, zy)
|
||||
self.time = time.time() - tinit
|
||||
|
||||
return self
|
||||
|
||||
def predict(self, lX, lZ=None):
|
||||
"""
|
||||
:param lX: a dictionary {language_label: X csr-matrix}
|
||||
:param lZ: a dictionary {language_label: Z matrix}; if specified, concats this representation
|
||||
:return: a dictionary of predictions
|
||||
"""
|
||||
lZ_ = self._projection(self.doc_projector, lX)
|
||||
if lZ is not None:
|
||||
lZ_ = {**lZ_, **lZ}
|
||||
return _joblib_transform_multiling(self.model.predict, lZ_, n_jobs=self.n_jobs)
|
||||
|
||||
def best_params(self):
|
||||
params = self.doc_projector.best_params()
|
||||
params['meta'] = self.model.best_params()
|
||||
return params
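# --- hedged usage sketch, not part of the commit ---
# Typical construction as described in the docstring: a calibrated first tier
# (probability=True is required when calmode='cal') and an SVM meta-learner
# trained on the stacked posterior space. lX/ly/lXte are the usual
# {lang: matrix} / {lang: labels} dicts (placeholders here).
# from sklearn.svm import SVC
# classifier = FunnellingPolylingualClassifier(
#     first_tier_learner=SVC(kernel='linear', probability=True),
#     meta_learner=SVC(kernel='rbf', probability=True),
#     calmode='cal', n_jobs=-1)
# classifier.fit(lX, ly)
# ly_pred = classifier.predict(lXte)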
|
||||
# class FunnellingPolylingualClassifier:
|
||||
# """
|
||||
# This classifier projects each document d into a language-independent feature space where each dimension fi is the
|
||||
# decision score phi_l(d,ci) of an auxiliary classifier phi_l trained on category ci for documents in language l;
|
||||
# then trains one single classifier for all documents in this space, irrespective of their originary language
|
||||
# """
|
||||
# def __init__(self, first_tier_learner, meta_learner, first_tier_parameters=None, meta_parameters=None, folded_projections=1,
|
||||
# calmode='cal', n_jobs=-1):
|
||||
# """
|
||||
# :param first_tier_learner: the learner used in the first-tier level
|
||||
# :param meta_learner: the learner used in the second-tier level
|
||||
# :param first_tier_parameters: parameters for the learner in the doc_projector
|
||||
# :param meta_parameters: parameters for the learner in the z-space
|
||||
# :param folded_projections: if 1 then the model trains the auxiliar classifiers with all training data and
|
||||
# :param calmode: 'cal' to calibrate the base classifiers, 'nocal' to use the decision_function instead, or
|
||||
# :param n_jobs: number of parallel threads
|
||||
# 'sigmoid' to use the sigmoid of the decision_function
|
||||
# projects the data before training the final classifier; if greater than one, the training set is split in as
|
||||
# many folds as indicated, and the projected space is composed by concatenating each fold prediction based on
|
||||
# models trained on the remaining folds. This should increase the generality of the space to unseen data.
|
||||
# """
|
||||
# assert folded_projections>0, "positive number of folds expected"
|
||||
# assert calmode in ['cal','nocal','sigmoid'], 'unknown calmode'
|
||||
# assert calmode!='cal' or first_tier_learner.probability, 'calmode=cal requires the learner to have probability=True'
|
||||
#
|
||||
# self.fist_tier_learner = first_tier_learner
|
||||
# self.meta_learner = meta_learner
|
||||
# self.fist_tier_parameters=first_tier_parameters
|
||||
# self.meta_parameters = meta_parameters
|
||||
# self.doc_projector = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs)
|
||||
# self.doc_projector_bu = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs)
|
||||
# self.folded_projections = folded_projections
|
||||
# self.n_jobs = n_jobs
|
||||
# self.calmode = calmode
|
||||
#
|
||||
# def _projection(self, doc_projector, lX):
|
||||
# """
|
||||
# Decides the projection function to be applied; predict_proba if the base classifiers are calibrated or
|
||||
# decision_function if otherwise
|
||||
# :param doc_projector: the document projector (a NaivePolylingualClassifier)
|
||||
# :param lX: {lang:matrix} to train
|
||||
# :return: the projection, applied with predict_proba or decision_function
|
||||
# """
|
||||
# if self.calmode=='cal':
|
||||
# return doc_projector.predict_proba(lX)
|
||||
# else:
|
||||
# l_decision_scores = doc_projector.decision_function(lX)
|
||||
# if self.calmode=='sigmoid':
|
||||
# def sigmoid(x): return 1 / (1 + np.exp(-x))
|
||||
# for lang in l_decision_scores.keys():
|
||||
# l_decision_scores[lang] = sigmoid(l_decision_scores[lang])
|
||||
# return l_decision_scores
|
||||
#
|
||||
# def _get_zspace(self, lXtr, lYtr, lXproj=None, lYproj=None):
|
||||
# """
|
||||
# Produces the vector space of posterior probabilities (in case the first-tier is calibrated) or of
|
||||
# decision scores (if otherwise). This space is here named zspace.
|
||||
# :param lXtr: {lang:matrix} to train
|
||||
# :param lYtr: {lang:labels} to train
|
||||
# :param lXproj: {lang:matrix} to project (if None, then projects the lXtr)
|
||||
# :param lYproj: {lang:labels} to stack in the same order (if None, then lYtr will be stacked)
|
||||
# :return: the projection of lXproj documents into the Z-space defined by the confidence scores of language-specific
|
||||
# models trained on lXtr, and the lYproj labels stacked consistently
|
||||
# """
|
||||
# repair_empty_folds = True
|
||||
# if lXproj is None and lYproj is None:
|
||||
# lXproj, lYproj = lXtr, lYtr
|
||||
# repair_empty_folds = False
|
||||
#
|
||||
# print('fitting the projectors... {}'.format(lXtr.keys()))
|
||||
# self.doc_projector.fit(lXtr, lYtr)
|
||||
#
|
||||
# print('projecting the documents')
|
||||
# langs = list(lXtr.keys())
|
||||
# lZ = self._projection(self.doc_projector, lXproj)
|
||||
#
|
||||
# # if repair_empty_folds: #empty folds are replaced by the posterior probabilities generated by the non-folded version
|
||||
# empty_categories = self.doc_projector.empty_categories
|
||||
# lZ_bu = self._projection(self.doc_projector_bu, lXproj)
|
||||
#
|
||||
# for lang in langs:
|
||||
# repair = empty_categories[lang]
|
||||
# lZ[lang][:,repair] = lZ_bu[lang][:,repair]
|
||||
#
|
||||
# Z = np.vstack([lZ[lang] for lang in langs]) # Z is the language independent space
|
||||
# zy = np.vstack([lYproj[lang] for lang in langs])
|
||||
# return Z, zy
|
||||
#
|
||||
# def _get_zspace_folds(self, lX, ly):
|
||||
# self.doc_projector_bu.fit(lX, ly)
|
||||
#
|
||||
# print('split of {} folds'.format(self.folded_projections))
|
||||
# skf = KFold(n_splits=self.folded_projections, shuffle=True)
|
||||
#
|
||||
# Z, zy = [], []
|
||||
# lfold = {lang: list(skf.split(lX[lang], ly[lang])) for lang in lX.keys()}
|
||||
# for fold in range(self.folded_projections):
|
||||
# print('fitting the projectors ({}/{})...'.format(fold + 1, self.folded_projections))
|
||||
# lfoldXtr, lfoldYtr = {}, {}
|
||||
# lfoldXte, lfoldYte = {}, {}
|
||||
# for lang in lX.keys():
|
||||
# train, test = lfold[lang][fold]
|
||||
# lfoldXtr[lang] = lX[lang][train]
|
||||
# lfoldYtr[lang] = ly[lang][train]
|
||||
# lfoldXte[lang] = lX[lang][test]
|
||||
# lfoldYte[lang] = ly[lang][test]
|
||||
# Zfold, zYfold = self._get_zspace(lfoldXtr, lfoldYtr, lfoldXte, lfoldYte)
|
||||
# Z.append(Zfold)
|
||||
# zy.append(zYfold)
|
||||
# # compose the Z-space as the union of all folded predictions
|
||||
# Z = np.vstack(Z)
|
||||
# zy = np.vstack(zy)
|
||||
# # refit the document projector with all examples to have a more reliable projector for test data
|
||||
# self.doc_projector = self.doc_projector_bu
|
||||
# return Z, zy
|
||||
#
|
||||
# def fit(self, lX, ly, lZ=None, lzy=None):
|
||||
# tinit = time.time()
|
||||
# Z, zy = self._get_zspace(lX, ly) if self.folded_projections == 1 else self._get_zspace_folds(lX, ly)
|
||||
#
|
||||
# #experimental: adds the posterior probabilities (computed outside) to the meta-classifier
|
||||
# if lZ is not None and lzy is not None:
|
||||
# zlangs = list(lZ.keys())
|
||||
# Z = np.vstack((Z, *[lZ[l] for l in zlangs]))
|
||||
# zy = np.vstack((zy, *[lzy[l] for l in zlangs]))
|
||||
#
|
||||
# print('fitting the Z-space of shape={}'.format(Z.shape))
|
||||
# self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters, n_jobs=self.n_jobs)
|
||||
# self.model.fit(Z, zy)
|
||||
# self.time = time.time() - tinit
|
||||
#
|
||||
# return self
|
||||
#
|
||||
# def predict(self, lX, lZ=None):
|
||||
# """
|
||||
# :param lX: a dictionary {language_label: X csr-matrix}
|
||||
# :param lZ: a dictionary {language_label: Z matrix}; if specified, concats this representation
|
||||
# :return: a dictionary of predictions
|
||||
# """
|
||||
# lZ_ = self._projection(self.doc_projector, lX)
|
||||
# if lZ is not None:
|
||||
# lZ_ = {**lZ_, **lZ}
|
||||
# return _joblib_transform_multiling(self.model.predict, lZ_, n_jobs=self.n_jobs)
|
||||
#
|
||||
# def best_params(self):
|
||||
# params = self.doc_projector.best_params()
|
||||
# params['meta'] = self.model.best_params()
|
||||
# return params
|
||||
|
||||
|
||||
class NaivePolylingualClassifier:
|
||||
|
|
@ -323,410 +323,3 @@ class MonolingualClassifier:
|
|||
|
||||
def best_params(self):
|
||||
return self.best_params_
|
||||
|
||||
|
||||
class FunnellingMultimodal(FunnellingPolylingualClassifier):
|
||||
def __init__(self,
|
||||
we_path,
|
||||
config,
|
||||
first_tier_learner,
|
||||
meta_learner,
|
||||
first_tier_parameters=None,
|
||||
meta_parameters=None,
|
||||
folded_projections=1,
|
||||
calmode='cal',
|
||||
n_jobs=-1):
|
||||
|
||||
super().__init__(first_tier_learner,
|
||||
meta_learner,
|
||||
first_tier_parameters,
|
||||
meta_parameters,
|
||||
folded_projections,
|
||||
calmode,
|
||||
n_jobs)
|
||||
|
||||
self.pca_independent_space = PCA(n_components=50)
|
||||
self.we_path = we_path
|
||||
self.config = config
|
||||
self.lang_word2idx = dict()
|
||||
self.languages = []
|
||||
self.lang_tfidf = {}
|
||||
self.embedding_space = None
|
||||
self.model = None
|
||||
self.time = None
|
||||
self.best_components = 'not set'  # if PCA is auto-optimized, this will store the optimal number of components
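# Hedged example of the `config` dict this class expects, inferred from the
# keys accessed below ('supervised', 'unsupervised', 'max_label_space',
# 'post_pca'); the values shown are illustrative only:
#
#   config = {'supervised': True,       # h-stack the WCE (S) matrix
#             'unsupervised': True,     # h-stack the MUSE (U) matrix
#             'max_label_space': 300,   # 0 triggers the auto-optimized PCA branch
#             'post_pca': False}        # (experimental) PCA on the final Z-space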
|
||||
|
||||
def vectorize(self, lX, prediction=False):
|
||||
langs = list(lX.keys())
|
||||
print(f'# tfidf-vectorizing docs')
|
||||
if prediction:
|
||||
|
||||
for lang in langs:
|
||||
assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
|
||||
tfidf_vectorizer = self.lang_tfidf[lang]
|
||||
lX[lang] = tfidf_vectorizer.transform(lX[lang])
|
||||
return self
|
||||
|
||||
for lang in langs:
|
||||
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
|
||||
self.languages.append(lang)
|
||||
tfidf_vectorizer.fit(lX[lang])
|
||||
lX[lang] = tfidf_vectorizer.transform(lX[lang])
|
||||
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
|
||||
self.lang_tfidf[lang] = tfidf_vectorizer
|
||||
return self
|
||||
|
||||
def _get_zspace(self, lXtr, lYtr):
|
||||
print('\nfitting the projectors... {}'.format(list(lXtr.keys())))
|
||||
self.doc_projector.fit(lXtr, lYtr)
|
||||
|
||||
print('\nprojecting the documents')
|
||||
lZ = self._projection(self.doc_projector, lXtr)
|
||||
|
||||
return lZ, lYtr
|
||||
|
||||
def fit(self, lX, ly):
|
||||
tinit = time.time()
|
||||
print('Vectorizing documents...')
|
||||
self.vectorize(lX)
|
||||
|
||||
for lang in self.languages:
|
||||
print(f'{lang}->{lX[lang].shape}')
|
||||
|
||||
Z, zy = self._get_zspace(lX, ly)
|
||||
|
||||
if self.config['supervised'] or self.config['unsupervised']:
|
||||
self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly)
|
||||
_embedding_space = self.embedding_space.transform(self.config, lX)
|
||||
if self.config['max_label_space'] == 0:
|
||||
_cum_dimension = _embedding_space[list(_embedding_space.keys())[0]].shape[1]
|
||||
if _cum_dimension - 300 > 0:
|
||||
_temp = _cum_dimension - 300
|
||||
else:
|
||||
_temp = _cum_dimension
|
||||
self.best_components = _temp
|
||||
# h_stacking posterior probabilities with (U) and/or (S) matrices
|
||||
for lang in self.languages:
|
||||
Z[lang] = np.hstack((Z[lang], _embedding_space[lang]))
|
||||
|
||||
# stacking Z space vertically
|
||||
_vertical_Z = np.vstack([Z[lang] for lang in self.languages])
|
||||
_vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
|
||||
|
||||
self.standardizer = StandardizeTransformer()
|
||||
_vertical_Z = self.standardizer.fit_transform(_vertical_Z)
|
||||
|
||||
# todo testing ...
|
||||
# if self.config['post_pca']:
|
||||
# print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
|
||||
# self.pca_independent_space.fit(_vertical_Z)
|
||||
# _vertical_Z = self.pca_independent_space.transform(_vertical_Z)
|
||||
|
||||
print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
|
||||
self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
|
||||
n_jobs=self.n_jobs)
|
||||
self.model.fit(_vertical_Z, _vertical_Zy)
|
||||
self.time = time.time() - tinit
|
||||
print(f'\nTotal training time elapsed: {round((self.time/60), 2)} min')
|
||||
|
||||
def predict(self, lX, ly):
|
||||
print('Vectorizing documents')
|
||||
self.vectorize(lX, prediction=True)
|
||||
lZ = self._projection(self.doc_projector, lX)
|
||||
|
||||
if self.config['supervised'] or self.config['unsupervised']:
|
||||
_embedding_space = self.embedding_space.transform(self.config, lX)
|
||||
|
||||
for lang in lX.keys():
|
||||
lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang]))
|
||||
|
||||
for lang in lZ.keys():
|
||||
print(lZ[lang].shape)
|
||||
# todo testing
|
||||
lZ[lang] = self.standardizer.transform(lZ[lang])
|
||||
# if self.config['post_pca']:
|
||||
# print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
|
||||
# lZ[lang] = self.pca_independent_space.transform(lZ[lang])
|
||||
|
||||
return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
|
||||
|
||||
|
||||
class PolylingualEmbeddingsClassifier:
|
||||
"""
|
||||
This classifier creates document embeddings as a tf-idf weighted average of polylingual word embeddings, as described in the article
|
||||
@article{conneau2017word,
|
||||
title={Word translation without parallel data},
|
||||
author={Conneau, Alexis and Lample, Guillaume and Ranzato, Marc'Aurelio and Denoyer, Ludovic and J{\'e}gou, Herv{\'e}},
|
||||
journal={arXiv preprint arXiv:1710.04087},
|
||||
year={2017}
|
||||
}
|
||||
url: https://github.com/facebookresearch/MUSE
|
||||
"""
|
||||
def __init__(self, wordembeddings_path, config, learner, c_parameters=None, n_jobs=-1):
|
||||
"""
|
||||
:param wordembeddings_path: the path to the directory containing the polylingual embeddings
|
||||
:param learner: the learner
|
||||
:param c_parameters: parameters for learner
|
||||
:param n_jobs: the number of concurrent threads
|
||||
"""
|
||||
self.wordembeddings_path = wordembeddings_path
|
||||
self.config = config
|
||||
self.learner = learner
|
||||
self.c_parameters=c_parameters
|
||||
self.n_jobs = n_jobs
|
||||
self.lang_tfidf = {}
|
||||
self.model = None
|
||||
self.languages = []
|
||||
self.lang_word2idx = dict()
|
||||
self.embedding_space = None
|
||||
|
||||
def fit_vectorizers(self, lX):
|
||||
for lang in lX.keys():
|
||||
if lang not in self.lang_tfidf:
|
||||
tfidf = TfidfVectorizer(sublinear_tf=True, use_idf=True) # text is already processed
|
||||
docs = lX[lang]
|
||||
tfidf.fit(docs)
|
||||
self.lang_tfidf[lang] = tfidf
|
||||
|
||||
|
||||
def vectorize(self, lX, prediction=False):
|
||||
langs = list(lX.keys())
|
||||
print(f'# tfidf-vectorizing docs')
|
||||
if prediction:
|
||||
|
||||
for lang in langs:
|
||||
assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
|
||||
tfidf_vectorizer = self.lang_tfidf[lang]
|
||||
lX[lang] = tfidf_vectorizer.transform(lX[lang])
|
||||
return self
|
||||
|
||||
for lang in langs:
|
||||
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
|
||||
self.languages.append(lang)
|
||||
tfidf_vectorizer.fit(lX[lang])
|
||||
lX[lang] = tfidf_vectorizer.transform(lX[lang])
|
||||
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
|
||||
self.lang_tfidf[lang] = tfidf_vectorizer
|
||||
return self
|
||||
|
||||
def embed(self, docs, lang):
|
||||
assert lang in self.lang_tfidf, 'unknown language'
|
||||
tfidf_vectorizer = self.lang_tfidf[lang]
|
||||
V = tfidf_vectorizer.vocabulary_
|
||||
Xweights = tfidf_vectorizer.transform(docs)
|
||||
|
||||
print('loading word embeddings for ' + lang)
|
||||
we = WordEmbeddings.load(self.wordembeddings_path, lang)
|
||||
|
||||
nD = len(docs)
|
||||
doc_vecs = np.zeros((nD, we.dim()))
|
||||
|
||||
for i, doc in enumerate(docs):
|
||||
print('\r\tcomplete {:.3f}%'.format(100 * (i + 1) / nD), end='')
|
||||
# averaging with tfidf (summing each word only once, since the frequency is already controlled)
|
||||
for w in set(doc.split()):
|
||||
if w in we and w in V:
|
||||
doc_vecs[i] += (we[w] * Xweights[i, V[w]])
|
||||
# works much worse with idf; works much worse with document l2-normalization
|
||||
print()
|
||||
|
||||
return doc_vecs
|
||||
|
||||
def fit(self, lX, ly):
|
||||
"""
|
||||
:param lX: a dictionary {language_label: [list of preprocessed documents]}
|
||||
:param ly: a dictionary {language_label: ndarray of shape (ndocs, ncats) binary labels}
|
||||
:return: self
|
||||
"""
|
||||
tinit = time.time()
|
||||
langs = list(lX.keys())
|
||||
WEtr, Ytr = [], []
|
||||
# self.fit_vectorizers(lX) # if already fit, does nothing
|
||||
self.vectorize(lX)
|
||||
# config = {'unsupervised' : False, 'supervised': True}
|
||||
self.embedding_space = StorageEmbeddings(self.wordembeddings_path).fit(self.config, lX, self.lang_word2idx, ly)
|
||||
WEtr = self.embedding_space.transform(self.config, lX)
|
||||
# for lang in langs:
|
||||
# WEtr.append(self.embed(lX[lang], lang)) # todo embed with other matrices
|
||||
# Ytr.append(ly[lang])
|
||||
|
||||
WEtr = np.vstack([WEtr[lang] for lang in langs])
|
||||
Ytr = np.vstack([ly[lang] for lang in langs])
|
||||
self.embed_time = time.time() - tinit
|
||||
|
||||
print('fitting the WE-space of shape={}'.format(WEtr.shape))
|
||||
self.model = MonolingualClassifier(base_learner=self.learner, parameters=self.c_parameters, n_jobs=self.n_jobs)
|
||||
self.model.fit(WEtr, Ytr)
|
||||
self.time = time.time() - tinit
|
||||
return self
|
||||
|
||||
def predict(self, lX, lY):
|
||||
"""
|
||||
:param lX: a dictionary {language_label: [list of preprocessed documents]}
|
||||
"""
|
||||
assert self.model is not None, 'predict called before fit'
|
||||
self.vectorize(lX, prediction=True)
|
||||
langs = list(lX.keys())
|
||||
lWEte = self.embedding_space.transform(self.config, lX)
|
||||
# lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory
|
||||
return _joblib_transform_multiling(self.model.transform, lWEte, n_jobs=self.n_jobs)
|
||||
|
||||
def predict_proba(self, lX):
|
||||
"""
|
||||
:param lX: a dictionary {language_label: [list of preprocessed documents]}
|
||||
"""
|
||||
assert self.model is not None, 'predict called before fit'
|
||||
langs = list(lX.keys())
|
||||
lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory
|
||||
return _joblib_transform_multiling(self.model.predict_proba, lWEte, n_jobs=self.n_jobs)
|
||||
|
||||
def best_params(self):
|
||||
return self.model.best_params()
|
||||
|
||||
|
||||
class MonolingualNetSvm:
|
||||
"""
|
||||
testing: funnelling with a NN managing word-embedding compositionality. An ensemble of n SVMs (n equal to the
number of training languages) is first fit on the data, generating the document projections in the Z-space. Next,
the projections are fed to a single NN together with their respective document embeddings. The documents are
projected into the embedding space while preserving their dimensionality (output dim is 300). These projections are
horizontally concatenated with the respective posterior projections and passed through a fully-connected layer with
sigmoid activation and an output dim equal to the number of target classes.
|
||||
# TODO ATM testing with only 1 language
|
||||
"""
|
||||
def __init__(self, lX, ly, first_tier_learner, first_tier_parameters, n_jobs):
|
||||
self.lX = lX
|
||||
self.ly = ly
|
||||
# SVM Attributes
|
||||
self.doc_projector = NaivePolylingualClassifier(first_tier_learner, first_tier_parameters,
|
||||
n_jobs=n_jobs)
|
||||
self.calmode = 'cal'
|
||||
self.languages = []
|
||||
self.lang_word2idx = dict()
|
||||
self.lang_tfidf = {}
|
||||
self.base_learner = 'TODO'
|
||||
self.parameters = 'TODO'
|
||||
# NN Attributes
|
||||
self.NN = 'TODO'
|
||||
|
||||
|
||||
def load_preprocessed(self):
|
||||
"""
|
||||
To speed up the process, documents are first tokenized in the "main" script. Here, the tokenized docs, word_index, and
targets are loaded.
|
||||
:return: dict[lang] = (word_index, tokenized_docs, targets)
|
||||
"""
|
||||
import pickle
|
||||
with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'rb') as f:
|
||||
return pickle.load(f)
|
||||
|
||||
def _build_embedding_matrix(self, lang, word_index):
|
||||
"""
|
||||
build embedding matrix by filtering out OOV embeddings
|
||||
:param lang:
|
||||
:param word_index:
|
||||
:return: filtered embedding matrix
|
||||
"""
|
||||
from embeddings.embeddings import EmbeddingsAligned
|
||||
type = 'MUSE'
|
||||
path = '/home/andreapdr/CLESA/'
|
||||
MUSE = EmbeddingsAligned(type, path, lang, word_index.keys())
|
||||
return MUSE
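# Hedged sketch of the OOV filtering described in _build_embedding_matrix above,
# without the repo's EmbeddingsAligned helper (assumes a dict
# muse_vectors: word -> np.ndarray loaded elsewhere; names are illustrative).
def _build_embedding_matrix_sketch(word_index, muse_vectors, dim=300):
    import numpy as np
    M = np.zeros((len(word_index) + 1, dim))  # row 0 reserved for the padding index
    for word, idx in word_index.items():
        vec = muse_vectors.get(word)
        if vec is not None:                   # OOV words keep an all-zero row
            M[idx] = vec
    return M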
|
||||
|
||||
def get_data_and_embed(self, data_dict):
|
||||
from keras.preprocessing.sequence import pad_sequences
|
||||
|
||||
langs = data_dict.keys()
|
||||
lang_embedding_matrix = dict()
|
||||
nn_lXtr = dict()
|
||||
nn_lytr = dict()
|
||||
|
||||
for lang in langs:
|
||||
lang_embedding_matrix[lang] = self._build_embedding_matrix(lang, data_dict[lang][0])
|
||||
nn_lXtr[lang] = pad_sequences(data_dict[lang][1], 100, padding='post')
|
||||
nn_lytr[lang] = [data_dict[lang][2]]
|
||||
|
||||
return nn_lXtr, nn_lytr, lang_embedding_matrix
|
||||
|
||||
def svm_vectorize(self, lX, prediction=False):
|
||||
langs = list(lX.keys())
|
||||
print(f'# tfidf-vectorizing docs')
|
||||
if prediction:
|
||||
for lang in langs:
|
||||
assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
|
||||
tfidf_vectorizer = self.lang_tfidf[lang]
|
||||
lX[lang] = tfidf_vectorizer.transform(lX[lang])
|
||||
return self
|
||||
for lang in langs:
|
||||
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
|
||||
self.languages.append(lang)
|
||||
tfidf_vectorizer.fit(lX[lang])
|
||||
lX[lang] = tfidf_vectorizer.transform(lX[lang])
|
||||
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
|
||||
self.lang_tfidf[lang] = tfidf_vectorizer
|
||||
return lX
|
||||
|
||||
def _get_zspace(self, lXtr, lYtr):
|
||||
print('\nfitting the projectors... {}'.format(list(lXtr.keys())))
|
||||
self.doc_projector.fit(lXtr, lYtr)
|
||||
|
||||
print('\nprojecting the documents')
|
||||
lZ = self._projection(self.doc_projector, lXtr)
|
||||
|
||||
return lZ, lYtr
|
||||
|
||||
def _projection(self, doc_projector, lX):
|
||||
"""
|
||||
Decides the projection function to be applied; predict_proba if the base classifiers are calibrated or
|
||||
decision_function if otherwise
|
||||
:param doc_projector: the document projector (a NaivePolylingualClassifier)
|
||||
:param lX: {lang:matrix} to train
|
||||
:return: the projection, applied with predict_proba or decision_function
|
||||
"""
|
||||
if self.calmode=='cal':
|
||||
return doc_projector.predict_proba(lX)
|
||||
else:
|
||||
l_decision_scores = doc_projector.decision_function(lX)
|
||||
if self.calmode=='sigmoid':
|
||||
def sigmoid(x): return 1 / (1 + np.exp(-x))
|
||||
for lang in l_decision_scores.keys():
|
||||
l_decision_scores[lang] = sigmoid(l_decision_scores[lang])
|
||||
return l_decision_scores
|
||||
|
||||
def fit(self):
|
||||
"""
|
||||
# 1. Fit SVM to generate posterior probabilities:
|
||||
# 1.1 Gather documents and vectorize them as in other SVM classifiers
|
||||
# 2. Fit NN
|
||||
# 2.1 Gather documents and build NN dataset by indexing wrt embedding matrix
|
||||
# 2.2 Fit NN first-layer to generate compositional doc embedding
|
||||
# 2.3 H-stack doc-embed and posterior P
|
||||
# 2.4 Feed stacked vector to output layer (sigmoid act): output Nc
|
||||
# 2.5 Train it...
|
||||
"""
|
||||
|
||||
# load pre-processed data
|
||||
data_dict = self.load_preprocessed()
|
||||
# build embedding matrices and neural network document training set
|
||||
nn_lXtr, nn_lytr, lang_embedding_matrix = self.get_data_and_embed(data_dict)
|
||||
# TF-IDF vectorizing documents for the SVM classifier
|
||||
svm_lX = self.svm_vectorize(self.lX)
|
||||
|
||||
# just testing on a smaller subset of data
|
||||
test_svm_lX = dict()
|
||||
test_svm_ly = dict()
|
||||
test_svm_lX['it'] = svm_lX['it'][:10, :]
|
||||
test_svm_ly['it'] = self.ly['it'][:10, :]
|
||||
test_nn_data = nn_lXtr['it'][:10]
|
||||
|
||||
# projecting document into Z space by SVM
|
||||
svm_Z, _ = self._get_zspace(test_svm_lX, test_svm_ly)
|
||||
|
||||
# initializing net and forward pass
|
||||
net = CNN_pdr(73, 1, 300, len(lang_embedding_matrix['it'].vectors), 300, lang_embedding_matrix['it'].vectors)
|
||||
out = net.forward(test_nn_data, svm_Z['it'])
|
||||
|
||||
print('TODO')
|
||||
|
||||
def net(self):
|
||||
pass
|
||||
|
|
@ -10,7 +10,7 @@ import time
|
|||
from sklearn.decomposition import PCA
|
||||
from joblib import Parallel, delayed
|
||||
from scipy.sparse import issparse, vstack, hstack
|
||||
from transformers.StandardizeTransformer import StandardizeTransformer
|
||||
from util_transformers.StandardizeTransformer import StandardizeTransformer
|
||||
from util.SIF_embed import remove_pc
|
||||
from sklearn.preprocessing import normalize
|
||||
from sklearn.svm import SVC
|
||||
|
|
@ -127,22 +127,26 @@ class PosteriorProbabilitiesEmbedder:
|
|||
print(f'generating posterior probabilities for the {sum([X.shape[0] for X in lX.values()])} documents')
|
||||
return self.doc_projector.predict_proba(lX)
|
||||
|
||||
def _get_output_dim(self):
|
||||
return len(self.doc_projector.model['da'].model.classes_)
|
||||
|
||||
|
||||
class MuseEmbedder:
|
||||
|
||||
def __init__(self, path, lV=None, l2=True, n_jobs=-1, featureweight=FeatureWeight()):
|
||||
def __init__(self, path, lV=None, l2=True, n_jobs=-1, featureweight=FeatureWeight(), sif=False):
|
||||
self.path=path
|
||||
self.lV = lV
|
||||
self.l2 = l2
|
||||
self.n_jobs = n_jobs
|
||||
self.featureweight = featureweight
|
||||
self.sif = sif
|
||||
|
||||
def fit(self, lX, ly, lV=None):
|
||||
assert lV is not None or self.lV is not None, 'lV not specified'
|
||||
self.langs = sorted(lX.keys())
|
||||
self.MUSE = load_muse_embeddings(self.path, self.langs, self.n_jobs)
|
||||
lWordList = {l:self._get_wordlist_from_word2index(lV[l]) for l in self.langs}
|
||||
self.MUSE = {l:Muse.extract(lWordList[l]).numpy() for l,Muse in self.MUSE}
|
||||
self.MUSE = {l:Muse.extract(lWordList[l]).numpy() for l,Muse in self.MUSE.items()}
|
||||
self.featureweight.fit(lX, ly)
|
||||
return self
|
||||
|
||||
|
|
@ -150,7 +154,7 @@ class MuseEmbedder:
|
|||
MUSE = self.MUSE
|
||||
lX = self.featureweight.transform(lX)
|
||||
XdotMUSE = Parallel(n_jobs=self.n_jobs)(
|
||||
delayed(XdotM)(lX[lang], MUSE[lang]) for lang in self.langs
|
||||
delayed(XdotM)(lX[lang], MUSE[lang], self.sif) for lang in self.langs
|
||||
)
|
||||
lMuse = {l: XdotMUSE[i] for i, l in enumerate(self.langs)}
|
||||
lMuse = _normalize(lMuse, self.l2)
|
||||
|
|
@ -162,14 +166,18 @@ class MuseEmbedder:
|
|||
def _get_wordlist_from_word2index(self, word2index):
|
||||
return list(zip(*sorted(word2index.items(), key=lambda x: x[1])))[0]
|
||||
|
||||
def _get_output_dim(self):
|
||||
return self.MUSE['da'].shape[1]
|
||||
|
||||
|
||||
class WordClassEmbedder:
|
||||
|
||||
def __init__(self, l2=True, n_jobs=-1, max_label_space=300, featureweight=FeatureWeight()):
|
||||
def __init__(self, l2=True, n_jobs=-1, max_label_space=300, featureweight=FeatureWeight(), sif=False):
|
||||
self.n_jobs = n_jobs
|
||||
self.l2 = l2
|
||||
self.max_label_space=max_label_space
|
||||
self.featureweight = featureweight
|
||||
self.sif = sif
|
||||
|
||||
def fit(self, lX, ly, lV=None):
|
||||
self.langs = sorted(lX.keys())
|
||||
|
|
@ -184,7 +192,7 @@ class WordClassEmbedder:
|
|||
lWCE = self.lWCE
|
||||
lX = self.featureweight.transform(lX)
|
||||
XdotWCE = Parallel(n_jobs=self.n_jobs)(
|
||||
delayed(XdotM)(lX[lang], lWCE[lang])for lang in self.langs
|
||||
delayed(XdotM)(lX[lang], lWCE[lang], self.sif)for lang in self.langs
|
||||
)
|
||||
lwce = {l: XdotWCE[i] for i, l in enumerate(self.langs)}
|
||||
lwce = _normalize(lwce, self.l2)
|
||||
|
|
@ -193,6 +201,9 @@ class WordClassEmbedder:
|
|||
def fit_transform(self, lX, ly, lV=None):
|
||||
return self.fit(lX, ly).transform(lX)
|
||||
|
||||
def _get_output_dim(self):
|
||||
return 73
|
||||
|
||||
|
||||
class DocEmbedderList:
|
||||
|
||||
|
|
@ -201,6 +212,7 @@ class DocEmbedderList:
|
|||
if len(embedder_list)==0: embedder_list=[]
|
||||
self.embedders = embedder_list
|
||||
self.aggregation = aggregation
|
||||
print(f'Aggregation mode: {self.aggregation}')
|
||||
|
||||
def fit(self, lX, ly, lV=None):
|
||||
for transformer in self.embedders:
|
||||
|
|
@ -238,16 +250,25 @@ class DocEmbedderList:
|
|||
langs = sorted(lX.keys())
|
||||
|
||||
lZparts = {l: None for l in langs}
|
||||
# min_dim = min([transformer._get_output_dim() for transformer in self.embedders])
|
||||
min_dim = 300
|
||||
for transformer in self.embedders:
|
||||
lZ = transformer.transform(lX)
|
||||
nC = min([lZ[lang].shape[1] for lang in langs])
|
||||
for l in langs:
|
||||
Z = lZ[l]
|
||||
if Z.shape[1] > min_dim:
|
||||
print(f'Space Z matrix has more dimensions ({Z.shape[1]}) than the smallest representation ({min_dim}). '
f'Applying PCA(n_components={min_dim})')
|
||||
pca = PCA(n_components=min_dim)
|
||||
Z = pca.fit(Z).transform(Z)
|
||||
if lZparts[l] is None:
|
||||
lZparts[l] = Z
|
||||
else:
|
||||
lZparts[l] += Z
|
||||
|
||||
n_transformers = len(self.embedders)
|
||||
nC = min([lZparts[lang].shape[1] for lang in langs])
|
||||
|
||||
return {l:lZparts[l] / n_transformers for l in langs}
|
||||
|
||||
|
|
@ -266,7 +287,7 @@ class FeatureSet2Posteriors:
|
|||
self.transformer = transformer
|
||||
self.l2=l2
|
||||
self.n_jobs = n_jobs
|
||||
self.prob_classifier = MetaClassifier(SVC(kernel='rbf', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
|
||||
self.prob_classifier = MetaClassifier(SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
|
||||
|
||||
def fit(self, lX, ly, lV=None):
|
||||
if lV is None and hasattr(self.transformer, 'lV'):
|
||||
|
|
@ -412,10 +433,12 @@ def word_class_embedding_matrix(X, Y, max_label_space=300):
|
|||
return WCE
|
||||
|
||||
|
||||
def XdotM(X,M):
|
||||
def XdotM(X,M, sif):
|
||||
# return X.dot(M)
|
||||
# print(f'X={X.shape}, M={M.shape}')
|
||||
print(f'X={X.shape}, M={M.shape}')
|
||||
E = X.dot(M)
|
||||
if sif:
|
||||
print("removing pc...")
|
||||
E = remove_pc(E, npc=1)
|
||||
return E
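# Hedged sketch of what a SIF-style remove_pc(E, npc=1) typically does
# (assumption: util.SIF_embed follows the standard "remove the first principal
# component" post-processing of Arora et al.; the actual implementation may differ).
def _remove_first_pc_sketch(E, npc=1):
    from sklearn.decomposition import TruncatedSVD
    svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    svd.fit(E)
    pc = svd.components_              # (npc, dim) common directions
    return E - E.dot(pc.T).dot(pc)    # project the common component out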
|
||||
|
||||
|
|
|
|||
|
|
@ -1,92 +0,0 @@
|
|||
from optparse import OptionParser
|
||||
from util.results import PolylingualClassificationResults
|
||||
from dataset_builder import MultilingualDataset
|
||||
from keras.preprocessing.text import Tokenizer
|
||||
from learning.learners import MonolingualNetSvm
|
||||
from sklearn.svm import SVC
|
||||
import pickle
|
||||
|
||||
parser = OptionParser()
|
||||
|
||||
parser.add_option("-d", "--dataset", dest="dataset",
|
||||
help="Path to the multilingual dataset processed and stored in .pickle format",
|
||||
default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
|
||||
|
||||
parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
|
||||
help="Optimize hyperparameters", default=False)
|
||||
|
||||
parser.add_option("-s", "--set_c", dest="set_c",type=float,
|
||||
help="Set the C parameter", default=1)
|
||||
|
||||
(op, args) = parser.parse_args()
|
||||
|
||||
|
||||
###################################################################################################################
|
||||
|
||||
def get_learner(calibrate=False, kernel='linear'):
|
||||
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto')
|
||||
|
||||
|
||||
def get_params(dense=False):
|
||||
if not op.optimc:
|
||||
return None
|
||||
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
|
||||
kernel = 'rbf' if dense else 'linear'
|
||||
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
|
||||
|
||||
|
||||
# PREPROCESS TEXT AND SAVE IT ... both for SVM and NN
|
||||
def preprocess_data(lXtr, lXte, lytr, lyte):
|
||||
tokenized_tr = dict()
|
||||
tokenized_te = dict()
|
||||
for lang in lXtr.keys():
|
||||
alltexts = ' '.join(lXtr[lang])
|
||||
tokenizer = Tokenizer()
|
||||
tokenizer.fit_on_texts(alltexts.split(' '))
|
||||
tokenizer.oov_token = len(tokenizer.word_index)+1
|
||||
# dumping train set
|
||||
sequences_tr = tokenizer.texts_to_sequences(lXtr[lang])
|
||||
tokenized_tr[lang] = (tokenizer.word_index, sequences_tr, lytr[lang])
|
||||
# dumping test set
|
||||
sequences_te = tokenizer.texts_to_sequences(lXte[lang])
|
||||
tokenized_te[lang] = (tokenizer.word_index, sequences_te, lyte[lang])
|
||||
|
||||
with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'wb') as f:
|
||||
pickle.dump(tokenized_tr, f)
|
||||
|
||||
with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_test.pickle', 'wb') as f:
|
||||
pickle.dump(tokenized_te, f)
|
||||
|
||||
print('Successfully dumped data')
|
||||
|
||||
# def load_preprocessed():
|
||||
# with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'rb') as f:
|
||||
# return pickle.load(f)
|
||||
#
|
||||
# def build_embedding_matrix(lang, word_index):
|
||||
# type = 'MUSE'
|
||||
# path = '/home/andreapdr/CLESA/'
|
||||
# MUSE = EmbeddingsAligned(type, path, lang, word_index.keys())
|
||||
# return MUSE
|
||||
|
||||
|
||||
########## MAIN #################################################################################################
|
||||
|
||||
if __name__ == '__main__':
|
||||
results = PolylingualClassificationResults('./results/NN_FPEC_results.csv')
|
||||
data = MultilingualDataset.load(op.dataset)
|
||||
lXtr, lytr = data.training()
|
||||
lXte, lyte = data.test()
|
||||
|
||||
if op.set_c != -1:
|
||||
meta_parameters = None
|
||||
else:
|
||||
meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}]
|
||||
|
||||
test_architecture = MonolingualNetSvm(lXtr,
|
||||
lytr,
|
||||
first_tier_learner=get_learner(calibrate=True),
|
||||
first_tier_parameters=None,
|
||||
n_jobs=1)
|
||||
|
||||
test_architecture.fit()
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
import argparse
|
||||
import torch.nn as nn
|
||||
from torch.optim.lr_scheduler import StepLR
|
||||
from torch.optim.lr_scheduler import StepLR, MultiStepLR
|
||||
from dataset_builder import MultilingualDataset
|
||||
from learning.transformers import load_muse_embeddings
|
||||
from models.lstm_class import RNNMultilingualClassifier
|
||||
|
|
@ -9,8 +9,6 @@ from util.early_stop import EarlyStopping
|
|||
from util.common import *
|
||||
from util.file import create_if_not_exist
|
||||
from time import time
|
||||
from embeddings.pretrained import *
|
||||
from os.path import join
|
||||
from tqdm import tqdm
|
||||
from util.evaluation import evaluate
|
||||
from util.file import get_file_name
|
||||
|
|
@ -100,7 +98,7 @@ def main():
|
|||
|
||||
# Loading the dataset
|
||||
data = MultilingualDataset.load(opt.dataset)
|
||||
# data.set_view(languages=['de', 'fr', 'sv', 'da', 'es', 'it'])
|
||||
data.set_view(languages=['de', 'fr']) #, 'it', 'en']) # 'sv', 'da', 'es', 'it'])
|
||||
data.show_dimensions()
|
||||
langs = data.langs()
|
||||
l_devel_raw, l_devel_target = data.training(target_as_csr=True)
|
||||
|
|
@ -108,6 +106,7 @@ def main():
|
|||
|
||||
# Loading the MUSE pretrained embeddings (only if requested)
|
||||
lpretrained, lpretrained_vocabulary = load_pretrained_embeddings(opt.we_path, langs)
|
||||
# lpretrained_vocabulary = none_dict(langs) # do not keep track of words known in pretrained embeddings vocabulary that are also present in test set
|
||||
|
||||
# Data preparation: indexing / splitting / embedding matrices (pretrained + supervised) / posterior probs
|
||||
multilingual_index = MultilingualIndex()
|
||||
|
|
@ -115,10 +114,26 @@ def main():
|
|||
multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=opt.seed)
|
||||
multilingual_index.embedding_matrices(lpretrained, opt.supervised)
|
||||
if opt.posteriors:
|
||||
lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=opt.svm_max_docs)
|
||||
lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=5000, store_posteriors=True) #stored_post=True) #opt.svm_max_docs)
|
||||
else:
|
||||
lPtr, lPva, lPte = None, None, None
|
||||
|
||||
# just_test = False
|
||||
# if just_test:
|
||||
#
|
||||
# model = torch.load(
|
||||
# '../checkpoint/rnn(H512)-Muse-WCE-Posteriors-(trainable)-jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle')
|
||||
# criterion = torch.nn.BCEWithLogitsLoss().cuda()
|
||||
#
|
||||
# # batcher_train = Batch(opt.batch_size, batches_per_epoch=10, languages=langs, lpad=multilingual_index.l_pad())
|
||||
#
|
||||
# batcher_eval = Batch(opt.batch_size, batches_per_epoch=-1, languages=langs, lpad=multilingual_index.l_pad())
|
||||
# l_test_index = multilingual_index.l_test_index()
|
||||
# epoch = 1
|
||||
# tinit = time()
|
||||
# test(model, batcher_eval, l_test_index, lPte, l_test_target, tinit, epoch, logfile, criterion, 'te')
|
||||
# exit('Loaded')
|
||||
|
||||
# Model initialization
|
||||
model = init_Net(data.num_categories(), multilingual_index)
|
||||
|
||||
|
|
@ -130,7 +145,7 @@ def main():
|
|||
|
||||
tinit = time()
|
||||
create_if_not_exist(opt.checkpoint_dir)
|
||||
early_stop = EarlyStopping(model, patience=opt.patience, checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}')
|
||||
early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience, checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}')
|
||||
|
||||
l_train_index, l_train_target = multilingual_index.l_train()
|
||||
l_val_index, l_val_target = multilingual_index.l_val()
|
||||
|
|
@ -155,7 +170,6 @@ def main():
|
|||
break
|
||||
|
||||
# training is over
|
||||
|
||||
# restores the best model according to the Mf1 of the validation set (only when plotmode==False)
|
||||
# stoptime = early_stop.stop_time - tinit
|
||||
# stopepoch = early_stop.best_epoch
|
||||
|
|
@ -164,6 +178,8 @@ def main():
|
|||
if opt.plotmode==False:
|
||||
print('-' * 80)
|
||||
print('Training over. Performing final evaluation')
|
||||
|
||||
# torch.cuda.empty_cache()
|
||||
model = early_stop.restore_checkpoint()
|
||||
|
||||
if opt.val_epochs>0:
|
||||
|
|
@ -183,10 +199,14 @@ def get_lr(optimizer):
|
|||
|
||||
|
||||
def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile, criterion, optim, epoch, method_name):
|
||||
_dataset_path = opt.dataset.split('/')[-1].split('_')
|
||||
dataset_id = _dataset_path[0] + _dataset_path[-1]
|
||||
|
||||
loss_history = []
|
||||
model.train()
|
||||
for idx, (batch, post, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, lytr)):
|
||||
optim.zero_grad()
|
||||
_out = model(batch,post, lang)
|
||||
loss = criterion(model(batch, post, lang), target)
|
||||
loss.backward()
|
||||
clip_gradient(model)
|
||||
|
|
@ -195,7 +215,7 @@ def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile,
|
|||
|
||||
if idx % opt.log_interval == 0:
|
||||
interval_loss = np.mean(loss_history[-opt.log_interval:])
|
||||
print(f'{opt.dataset} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
|
||||
print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
|
||||
|
||||
mean_loss = np.mean(interval_loss)
|
||||
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
|
||||
|
|
@ -203,6 +223,8 @@ def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile,
|
|||
|
||||
|
||||
def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logfile, criterion, measure_prefix):
|
||||
|
||||
loss_history = []
|
||||
model.eval()
|
||||
langs = sorted(ltest_index.keys())
|
||||
predictions = {l:[] for l in langs}
|
||||
|
|
@ -214,6 +236,7 @@ def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logf
|
|||
prediction = predict(logits)
|
||||
predictions[lang].append(prediction)
|
||||
yte_stacked[lang].append(target.detach().cpu().numpy())
|
||||
loss_history.append(loss)
|
||||
|
||||
ly = {l:np.vstack(yte_stacked[l]) for l in langs}
|
||||
ly_ = {l:np.vstack(predictions[l]) for l in langs}
|
||||
|
|
@ -224,17 +247,15 @@ def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logf
|
|||
metrics.append([macrof1, microf1, macrok, microk])
|
||||
if measure_prefix=='te':
|
||||
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
|
||||
# results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
|
||||
# (config['max_label_space'], classifier.best_components),
|
||||
# config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
|
||||
# lang, macrof1, microf1, macrok, microk, '')
|
||||
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
|
||||
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
|
||||
|
||||
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=tend)
|
||||
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mf1, timelapse=tend)
|
||||
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-accuracy', value=acc, timelapse=tend)
|
||||
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=loss, timelapse=tend)
|
||||
mean_loss = np.mean(loss_history)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit)
|
||||
|
||||
return Mf1
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
import os
|
||||
from dataset_builder import MultilingualDataset
|
||||
# from learning.learners import *
|
||||
from learning.learners import FunnellingMultimodal
|
||||
# from learning.learners import FunnellingMultimodal
|
||||
from learning.transformers import Funnelling, PosteriorProbabilitiesEmbedder, MetaClassifier, \
|
||||
TfidfVectorizerMultilingual, DocEmbedderList, WordClassEmbedder, MuseEmbedder, FeatureSet2Posteriors, Voting
|
||||
from util.evaluation import *
|
||||
|
|
@ -14,14 +14,14 @@ from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
|
|||
|
||||
parser = OptionParser()
|
||||
|
||||
parser.add_option("-d", "--dataset", dest="dataset",
|
||||
help="Path to the multilingual dataset processed and stored in .pickle format",
|
||||
default="../rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
|
||||
# parser.add_option("-d", "--dataset", dest="dataset",
|
||||
# help="Path to the multilingual dataset processed and stored in .pickle format",
|
||||
# default="../rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
|
||||
|
||||
parser.add_option("-o", "--output", dest="output",
|
||||
help="Result file", type=str, default='./results/results.csv')
|
||||
|
||||
parser.add_option("-P", "--probs", dest="probs", action='store_true',
|
||||
parser.add_option("-P", "--probs", dest="posteriors", action='store_true',
|
||||
help="Add posterior probabilities to the document embedding representation", default=False)
|
||||
|
||||
parser.add_option("-S", "--supervised", dest="supervised", action='store_true',
|
||||
|
|
@ -46,6 +46,9 @@ parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
|
|||
help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
|
||||
default=300)
|
||||
|
||||
parser.add_option("-r", "--remove-pc", dest="sif", action='store_true',
|
||||
help="Remove common component when computing dot product of word embedding matrices", default=False)
|
||||
|
||||
# parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
|
||||
# help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix."
|
||||
# " If set to 0 it will automatically search for the best number of components", default=300)
|
||||
|
|
@ -72,15 +75,18 @@ def get_params(dense=False):
|
|||
if __name__ == '__main__':
|
||||
(op, args) = parser.parse_args()
|
||||
|
||||
assert exists(op.dataset), 'Unable to find file '+str(op.dataset)
|
||||
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
|
||||
assert op.probs or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
|
||||
assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)'
|
||||
dataset = args[0]
|
||||
|
||||
dataset_file = os.path.basename(op.dataset)
|
||||
assert exists(dataset), 'Unable to find file '+str(dataset)
|
||||
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
|
||||
assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
|
||||
|
||||
dataset_file = os.path.basename(dataset)
|
||||
|
||||
results = PolylingualClassificationResults(op.output)
|
||||
|
||||
data = MultilingualDataset.load(op.dataset)
|
||||
data = MultilingualDataset.load(dataset)
|
||||
data.show_dimensions()
|
||||
|
||||
lXtr, lytr = data.training()
|
||||
|
|
@ -88,8 +94,9 @@ if __name__ == '__main__':
|
|||
|
||||
meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
|
||||
|
||||
result_id = f'{dataset_file}_Prob{op.probs}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}'
|
||||
|
||||
# result_id = f'{dataset_file}_Prob{op.posteriors}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}'
|
||||
result_id = f'{dataset_file}_ProbPost={op.posteriors}_WCE={op.supervised}(PCA={op.max_labels_S})_' \
|
||||
f'MUSE={op.pretrained}_weight={"todo"}_l2={"todo"}_zscore={"todo"}{"_optimC" if op.optimc else ""}'
|
||||
print(f'{result_id}')
|
||||
|
||||
# text preprocessing
|
||||
|
|
@ -100,7 +107,7 @@ if __name__ == '__main__':
|
|||
lV = tfidfvectorizer.vocabulary()
|
||||
|
||||
classifiers = []
|
||||
if op.probs:
|
||||
if op.posteriors:
|
||||
classifiers.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None))
|
||||
if op.supervised:
|
||||
classifiers.append(FeatureSet2Posteriors(WordClassEmbedder(max_label_space=op.max_labels_S)))
|
||||
|
|
@ -115,13 +122,37 @@ if __name__ == '__main__':
|
|||
print('\n# Evaluating ...')
|
||||
l_eval = evaluate_method(classifier, lXte, lyte)
|
||||
|
||||
# renaming arguments to be printed on log
|
||||
_id = ''
|
||||
_id_conf = [op.posteriors, op.supervised, op.pretrained]
|
||||
_id_name = ['+P', '+W', '+M']
|
||||
for i, conf in enumerate(_id_conf):
|
||||
if conf:
|
||||
_id += _id_name[i]
|
||||
_id = _id.lstrip('+')
|
||||
_dataset_path = dataset.split('/')[-1].split('_')
|
||||
dataset_id = _dataset_path[0] + _dataset_path[-1]
|
||||
|
||||
metrics = []
|
||||
for lang in lXte.keys():
|
||||
macrof1, microf1, macrok, microk = l_eval[lang]
|
||||
metrics.append([macrof1, microf1, macrok, microk])
|
||||
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
|
||||
# results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
|
||||
# (config['max_label_space'], classifier.best_components),
|
||||
# config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
|
||||
# lang, macrof1, microf1, macrok, microk, '')
|
||||
results.add_row(method='Voting',
|
||||
learner='svm',
|
||||
optimp=op.optimc,
|
||||
sif=op.sif,
|
||||
zscore='todo',
|
||||
l2='todo',
|
||||
wescaler='todo',
|
||||
pca=op.max_labels_S,
|
||||
id=_id,
|
||||
dataset=dataset_id,
|
||||
time='todo',
|
||||
lang=lang,
|
||||
macrof1=macrof1,
|
||||
microf1=microf1,
|
||||
macrok=macrok,
|
||||
microk=microk,
|
||||
notes='')
|
||||
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ from sklearn.svm import SVC
|
|||
parser = OptionParser(usage="usage: %prog datapath [options]")
|
||||
|
||||
parser.add_option("-o", "--output", dest="output",
|
||||
help="Result file", type=str, default='./results/results.csv')
|
||||
help="Result file", type=str, default='multiModal_log.csv')
|
||||
|
||||
parser.add_option("-P", "--posteriors", dest="posteriors", action='store_true',
|
||||
help="Add posterior probabilities to the document embedding representation", default=False)
|
||||
|
|
@ -22,8 +22,8 @@ parser.add_option("-S", "--supervised", dest="supervised", action='store_true',
|
|||
parser.add_option("-U", "--pretrained", dest="pretrained", action='store_true',
|
||||
help="Add pretrained MUSE embeddings to the document embedding representation", default=False)
|
||||
|
||||
parser.add_option("--nol2", dest="nol2", action='store_true',
|
||||
help="Deactivates l2 normalization as a post-processing for the document embedding views", default=False)
|
||||
parser.add_option("--l2", dest="l2", action='store_true',
|
||||
help="Activates l2 normalization as a post-processing for the document embedding views", default=False)
|
||||
|
||||
parser.add_option("--allprob", dest="allprob", action='store_true',
|
||||
help="All views are generated as posterior probabilities. This affects the supervised and pretrained "
|
||||
|
|
@ -48,11 +48,28 @@ parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
|
|||
help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
|
||||
default=300)
|
||||
|
||||
parser.add_option("-r", "--remove-pc", dest="sif", action='store_true',
|
||||
help="Remove common component when computing dot product of word embedding matrices", default=False)
|
||||
|
||||
parser.add_option("-z", "--zscore", dest="zscore", action='store_true',
|
||||
help="Z-score normalize matrices (WCE and MUSE)", default=False)
|
||||
|
||||
parser.add_option("-a", "--agg", dest="agg", action='store_true',
|
||||
help="Set aggregation function of the common Z-space to average (Default: concatenation)", default=False)
|
||||
|
||||
|
||||
|
||||
def get_learner(calibrate=False, kernel='linear'):
|
||||
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, gamma='auto')
|
||||
|
||||
def get_params():
|
||||
if not op.optimc:
|
||||
return None
|
||||
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
|
||||
kernel = 'rbf'
|
||||
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
|
||||
|
||||
|
||||
#######################################################################################################################
|
||||
|
||||
|
||||
|
|
@ -64,17 +81,23 @@ if __name__ == '__main__':
|
|||
assert exists(dataset), 'Unable to find file '+str(dataset)
|
||||
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
|
||||
assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
|
||||
l2=(op.nol2==False)
|
||||
l2=op.l2
|
||||
|
||||
dataset_file = os.path.basename(dataset)
|
||||
|
||||
results = PolylingualClassificationResults(op.output)
|
||||
results = PolylingualClassificationResults('../log/' + op.output)
|
||||
allprob='Prob' if op.allprob else ''
|
||||
result_id = f'{dataset_file}_ProbPost={op.posteriors}_{allprob}WCE={op.supervised}(PCA={op.max_labels_S})_{allprob}' \
|
||||
f'MUSE={op.pretrained}_weight={op.feat_weight}_l2={l2}{"_optimC" if op.optimc else ""}'
|
||||
f'MUSE={op.pretrained}_weight={op.feat_weight}_l2={l2}_zscore={op.zscore}{"_optimC" if op.optimc else ""}'
|
||||
print(f'{result_id}')
|
||||
|
||||
# set z-score range - if slice(0,0), the mean will be set to 0 and the std to 1, so standardization has no effect
|
||||
standardize_range = slice(0,0)
|
||||
if op.zscore:
|
||||
standardize_range = None
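# Hedged sketch of the behaviour assumed above (the actual logic lives in
# StandardizeTransformer and may differ): statistics are computed on the rows
# selected by standardize_range (None = all rows); an empty selection falls
# back to mean 0 / std 1, i.e. a no-op.
def _standardize_sketch(Z, rows=None):
    import numpy as np
    sel = Z if rows is None else Z[rows]
    if sel.shape[0] == 0:
        mean, std = 0.0, 1.0          # no-op standardization
    else:
        mean, std = sel.mean(axis=0), sel.std(axis=0) + 1e-8
    return (Z - mean) / std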
|
||||
|
||||
data = MultilingualDataset.load(dataset)
|
||||
# data.set_view(languages=['fr', 'it'])
|
||||
data.show_dimensions()
|
||||
lXtr, lytr = data.training()
|
||||
lXte, lyte = data.test()
|
||||
|
|
@ -86,23 +109,23 @@ if __name__ == '__main__':
|
|||
feat_weighting = FeatureWeight(op.feat_weight, agg='mean')
|
||||
|
||||
# # document embedding modules
|
||||
doc_embedder = DocEmbedderList(aggregation='concat')
|
||||
doc_embedder = DocEmbedderList(aggregation='mean' if op.agg else 'concat')
|
||||
if op.posteriors:
|
||||
doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True, kernel='linear'), l2=l2))
|
||||
if op.supervised:
|
||||
wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting)
|
||||
wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, sif=op.sif)
|
||||
if op.allprob:
|
||||
wce = FeatureSet2Posteriors(wce, l2=l2)
|
||||
doc_embedder.append(wce)
|
||||
if op.pretrained:
|
||||
muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting)
|
||||
muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif)
|
||||
if op.allprob:
|
||||
muse = FeatureSet2Posteriors(muse, l2=l2)
|
||||
doc_embedder.append(muse)
|
||||
|
||||
# metaclassifier
|
||||
meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
|
||||
meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'), meta_parameters=meta_parameters)
|
||||
meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'), meta_parameters=get_params(), standardize_range=standardize_range)
|
||||
|
||||
# ensembling the modules
|
||||
classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)
|
||||
|
|
@ -113,13 +136,40 @@ if __name__ == '__main__':
|
|||
print('\n# Evaluating ...')
|
||||
l_eval = evaluate_method(classifier, lXte, lyte)
|
||||
|
||||
# renaming arguments to be printed on log
|
||||
_id = ''
|
||||
_id_conf = [op.posteriors, op.supervised, op.pretrained]
|
||||
_id_name = ['+P', '+W', '+M']
|
||||
for i, conf in enumerate(_id_conf):
|
||||
if conf:
|
||||
_id += _id_name[i]
|
||||
_id = _id.lstrip('+')
|
||||
_id = _id if not op.agg else _id + '_mean'
|
||||
_id = _id if not op.allprob else _id + '_allprob'
|
||||
|
||||
_dataset_path = dataset.split('/')[-1].split('_')
|
||||
dataset_id = _dataset_path[0] + _dataset_path[-1]
|
||||
|
||||
metrics = []
|
||||
for lang in lXte.keys():
|
||||
macrof1, microf1, macrok, microk = l_eval[lang]
|
||||
metrics.append([macrof1, microf1, macrok, microk])
|
||||
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
|
||||
# results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
|
||||
# (config['max_label_space'], classifier.best_components),
|
||||
# config['dim_reduction_unsupervised'], op.optimc, dataset.split('/')[-1], classifier.time,
|
||||
# lang, macrof1, microf1, macrok, microk, '')
|
||||
results.add_row(method='MultiModal',
|
||||
learner='svm',
|
||||
optimp=op.optimc,
|
||||
sif= op.sif,
|
||||
zscore=op.zscore,
|
||||
l2= op.l2,
|
||||
wescaler= op.feat_weight,
|
||||
pca=op.max_labels_S,
|
||||
id=_id,
|
||||
dataset=dataset_id,
|
||||
time='todo',
|
||||
lang=lang,
|
||||
macrof1=macrof1,
|
||||
microf1=microf1,
|
||||
macrok=macrok,
|
||||
microk=microk,
|
||||
notes='')
|
||||
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ class RNNMultilingualClassifier(nn.Module):
|
|||
self.n_layers = 1
|
||||
self.n_directions = 1
|
||||
|
||||
self.dropout = nn.Dropout(0.2)
|
||||
self.dropout = nn.Dropout(0.6)
|
||||
|
||||
lstm_out = 256
|
||||
ff1 = 512
|
||||
|
|
@ -45,7 +45,7 @@ class RNNMultilingualClassifier(nn.Module):
|
|||
llearnable_embeddings[l] = learnable_embeddings
|
||||
self.embedding_length = embedding_length
|
||||
|
||||
# self.rnn = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2))
|
||||
# self.lstm = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2))
|
||||
self.rnn = nn.GRU(self.embedding_length, hidden_size)
|
||||
self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out)
|
||||
self.lpretrained_embeddings.update(lpretrained_embeddings)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,355 @@
|
|||
"""
|
||||
Test with smaller subset of languages.
|
||||
|
||||
1. Load doc (RCV1/2)
|
||||
2. Tokenize texts via bertTokenizer (I should already have these dumps)
|
||||
3. Construct better Dataloader/Datasets. NB: languages need to be tracked only for the testing phase, but since
   they are required there anyway, it is simpler to track them in the training phase as well.
|
||||
4. ...
|
||||
5. Understand whether the pooled hidden state of the last layer is much worse than its averaged version
   (in BertForSequenceClassification the pooled output is the one passed through the output linear layer
   to get the prediction scores); a comparison sketch follows right after this docstring.
|
||||
6. At the same time, I have to build also an end-to-end model in order to fine-tune it. The previous step
|
||||
would be useful when deploying mBert as a View Generator. (Refactor gFun code with view generators?)
|
||||
7. ...
|
||||
8. Profits
|
||||
|
||||
"""
|
||||
from dataset_builder import MultilingualDataset
|
||||
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
import numpy as np
|
||||
import torch
|
||||
from util.common import clip_gradient, predict
|
||||
from time import time
|
||||
from util.csv_log import CSVLog
|
||||
from util.evaluation import evaluate
|
||||
from util.early_stop import EarlyStopping
|
||||
from torch.optim.lr_scheduler import StepLR
|
||||
from sklearn.model_selection import train_test_split
|
||||
import argparse
|
||||
|
||||
|
||||
def get_model(n_out):
|
||||
print('# Initializing model ...')
|
||||
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=n_out)
|
||||
return model
|
||||
|
||||
def set_method_name():
|
||||
return 'mBERT'
|
||||
|
||||
def init_optimizer(model, lr):
|
||||
# return AdamW(model.parameters(), lr=lr, weight_decay=opt.weight_decay)
|
||||
no_decay = ['bias', 'LayerNorm.weight']
|
||||
optimizer_grouped_parameters = [
|
||||
{'params': [p for n, p in model.named_parameters()
|
||||
if not any(nd in n for nd in no_decay)],
|
||||
'weight_decay': opt.weight_decay},
|
||||
{'params': [p for n, p in model.named_parameters()
|
||||
if any(nd in n for nd in no_decay)],
|
||||
'weight_decay': 0.0}  # bias and LayerNorm parameters should not be regularized
|
||||
]
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
|
||||
return optimizer
|
||||
|
||||
def init_logfile(method_name, opt):
|
||||
logfile = CSVLog(opt.log_file, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse'])
|
||||
logfile.set_default('dataset', opt.dataset)
|
||||
logfile.set_default('run', opt.seed)
|
||||
logfile.set_default('method', method_name)
|
||||
assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} and run {opt.seed} already calculated'
|
||||
return logfile
|
||||
|
||||
def get_lr(optimizer):
|
||||
for param_group in optimizer.param_groups:
|
||||
return param_group['lr']
|
||||
|
||||
def get_dataset_name(datapath):
|
||||
possible_splits = [str(i) for i in range(10)]
|
||||
splitted = datapath.split('_')
|
||||
id_split = splitted[-1].split('.')[0][-1]
|
||||
if id_split in possible_splits:
|
||||
dataset_name = splitted[0].split('/')[-1]
|
||||
return f'{dataset_name}_run{id_split}'
|
||||
|
||||
def load_datasets(datapath):
|
||||
data = MultilingualDataset.load(datapath)
|
||||
data.set_view(languages=['nl']) # Testing with just two langs
|
||||
data.show_dimensions()
|
||||
|
||||
l_devel_raw, l_devel_target = data.training(target_as_csr=False)
|
||||
l_test_raw, l_test_target = data.test(target_as_csr=False)
|
||||
|
||||
return l_devel_raw, l_devel_target, l_test_raw, l_test_target
|
||||
|
||||
|
||||
def do_tokenization(l_dataset, max_len=512):
|
||||
print('# Starting Tokenization ...')
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
|
||||
langs = l_dataset.keys()
|
||||
l_tokenized = {}
|
||||
for lang in langs:
|
||||
l_tokenized[lang] = tokenizer(l_dataset[lang],
|
||||
truncation=True,
|
||||
max_length=max_len,
|
||||
add_special_tokens=True,
|
||||
padding='max_length')
|
||||
return l_tokenized
|
||||
|
||||
|
||||
class TrainingDataset(Dataset):
    """
    data: dict of lang specific tokenized data
    labels: dict of lang specific targets
    """
    def __init__(self, data, labels):
        self.langs = data.keys()
        self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)}

        # stack the per-language input_ids and targets into flat arrays,
        # keeping track of the language id of every sample
        for i, lang in enumerate(self.langs):
            # print(lang)
            _data = data[lang]['input_ids']
            _data = np.array(_data)
            _labels = labels[lang]
            _lang_value = np.full(len(_data), self.lang_ids[lang])

            if i == 0:
                self.data = _data
                self.labels = _labels
                self.lang_index = _lang_value
            else:
                self.data = np.vstack((self.data, _data))
                self.labels = np.vstack((self.labels, _labels))
                self.lang_index = np.concatenate((self.lang_index, _lang_value))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x = self.data[idx]
        y = self.labels[idx]
        lang = self.lang_index[idx]

        return x, torch.tensor(y, dtype=torch.float), lang
        # return x, y, lang

    def get_lang_ids(self):
        return self.lang_ids


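# Hedged sketch, not part of the commit: the tokenizer output above also carries an
# 'attention_mask' that TrainingDataset discards, so the model later attends to [PAD]
# positions as well. A hypothetical variant that keeps the mask could look like this
# (the class name and the 4-tuple returned by __getitem__ are illustrative assumptions):
class TrainingDatasetWithMask(TrainingDataset):
    def __init__(self, data, labels):
        super().__init__(data, labels)
        # stack the attention masks in the same per-language order used by the parent class
        self.masks = np.vstack([np.array(data[lang]['attention_mask']) for lang in self.langs])

    def __getitem__(self, idx):
        x, y, lang = super().__getitem__(idx)
        return x, self.masks[idx], y, lang
# A training loop using it would unpack (batch, mask, target, lang_idx) and call
# model(input_ids=batch.cuda(), attention_mask=mask.cuda()).

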
def freeze_encoder(model):
    for param in model.base_model.parameters():
        param.requires_grad = False
    return model


def check_param_grad_status(model):
    print('#' * 50)
    print('Model parameter status')
    for name, child in model.named_children():
        trainable = False
        for param in child.parameters():
            if param.requires_grad:
                trainable = True
        if not trainable:
            print(f'{name} is frozen')
        else:
            print(f'{name} is not frozen')
    print('#' * 50)


def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile):
    _dataset_path = opt.dataset.split('/')[-1].split('_')
    # dataset_id = 'RCV1/2_run0_newBert'
    dataset_id = _dataset_path[0] + _dataset_path[-1]

    loss_history = []
    model.train()

    for idx, (batch, target, lang_idx) in enumerate(train_dataloader):
        optim.zero_grad()  # clear the gradients accumulated in the previous step
        out = model(batch.cuda())
        loss = criterion(out[0], target.cuda())
        loss.backward()
        clip_gradient(model)
        optim.step()
        loss_history.append(loss.item())

        if idx % opt.log_interval == 0:
            interval_loss = np.mean(loss_history[-opt.log_interval:])
            print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')

    mean_loss = np.mean(loss_history)  # average loss over the whole epoch
    logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
    return mean_loss


def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, measure_prefix):
    print('# Validating model ...')
    loss_history = []
    model.eval()
    langs = lang_ids.keys()
    id_2_lang = {v: k for k, v in lang_ids.items()}
    predictions = {l: [] for l in langs}
    yte_stacked = {l: [] for l in langs}

    for batch, target, lang_idx in test_dataloader:
        out = model(batch.cuda())
        logits = out[0]
        loss = criterion(logits, target.cuda()).item()
        prediction = predict(logits)
        loss_history.append(loss)

        # assign each prediction and its target to the bucket of the corresponding language
        for i, pred in enumerate(prediction):
            lang_pred = id_2_lang[lang_idx.numpy()[i]]
            predictions[lang_pred].append(pred)
            yte_stacked[lang_pred].append(target[i].detach().cpu().numpy())

    ly = {l: np.vstack(yte_stacked[l]) for l in langs}
    ly_ = {l: np.vstack(predictions[l]) for l in langs}
    l_eval = evaluate(ly, ly_)
    metrics = []
    for lang in langs:
        macrof1, microf1, macrok, microk = l_eval[lang]
        metrics.append([macrof1, microf1, macrok, microk])
        if measure_prefix == 'te':
            print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
    Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
    print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')

    mean_loss = np.mean(loss_history)
    logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
    logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit)
    logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit)
    logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit)
    logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit)

    return Mf1


def get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop, max_val, seed):
    # use fresh dicts for the two splits: aliasing l_tokenized_tr for both would make
    # the second assignment below overwrite the first one
    l_split_tr = {l: {} for l in l_tokenized_tr.keys()}
    l_split_tr_target = {l: [] for l in l_tokenized_tr.keys()}
    l_split_va = {l: {} for l in l_tokenized_tr.keys()}
    l_split_val_target = {l: [] for l in l_tokenized_tr.keys()}

    for lang in l_tokenized_tr.keys():
        val_size = int(min(len(l_tokenized_tr[lang]['input_ids']) * val_prop, max_val))

        l_split_tr[lang]['input_ids'], l_split_va[lang]['input_ids'], l_split_tr_target[lang], l_split_val_target[lang] = \
            train_test_split(l_tokenized_tr[lang]['input_ids'], l_devel_target[lang], test_size=val_size, random_state=seed, shuffle=True)

    return l_split_tr, l_split_tr_target, l_split_va, l_split_val_target


def main():
    print('Running main ...')

    DATAPATH = opt.dataset
    method_name = set_method_name()
    logfile = init_logfile(method_name, opt)

    l_devel_raw, l_devel_target, l_test_raw, l_test_target = load_datasets(DATAPATH)
    l_tokenized_tr = do_tokenization(l_devel_raw, max_len=512)

    l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = \
        get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop=0.2, max_val=2000, seed=opt.seed)

    l_tokenized_te = do_tokenization(l_test_raw, max_len=512)

    tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target)
    va_dataset = TrainingDataset(l_split_va, l_split_val_target)
    te_dataset = TrainingDataset(l_tokenized_te, l_test_target)

    tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True)
    va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=False)
    te_dataloader = DataLoader(te_dataset, batch_size=2, shuffle=False)

    # Initializing model (the number of target classes is hard-coded here)
    model = get_model(73)
    model = model.cuda()
    criterion = torch.nn.BCEWithLogitsLoss().cuda()
    optim = init_optimizer(model, lr=opt.lr)
    # lr_scheduler = StepLR(optim, step_size=25, gamma=0.5)
    early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience,
                               checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_dataset_name(opt.dataset)}')
    # lr_scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optim, num_warmup_steps= , num_training_steps=)
    # print(model)

    # Freezing encoder
    # model = freeze_encoder(model)
    check_param_grad_status(model)

    # Training loop
    tinit = time()
    lang_ids = va_dataset.lang_ids
    for epoch in range(1, opt.nepochs + 1):
        print('# Start Training ...')
        train(model, tr_dataloader, epoch, criterion, optim, 'TestingBert', tinit, logfile)
        # lr_scheduler.step(epoch=None)  # reduces the learning rate

        # validation
        macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va')
        early_stop(macrof1, epoch)
        if opt.test_each > 0:
            if (opt.plotmode and (epoch == 1 or epoch % opt.test_each == 0)) or \
                    (not opt.plotmode and epoch % opt.test_each == 0 and epoch < opt.nepochs):
                test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te')

        if early_stop.STOP:
            print('[early-stop] STOP')
            if not opt.plotmode:
                break

    if not opt.plotmode:
        print('-' * 80)
        print('Training over. Performing final evaluation')

        model = early_stop.restore_checkpoint()

        if opt.val_epochs > 0:
            print(f'running last {opt.val_epochs} training epochs on the validation set')
            for val_epoch in range(1, opt.val_epochs + 1):
                train(model, va_dataloader, epoch + val_epoch, criterion, optim, 'TestingBert', tinit, logfile)

        # final test
        print('Training complete: testing')
        test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te')

    exit('Code Executed!')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Neural text classification with Word-Class Embeddings - mBert model')

    parser.add_argument('--dataset', type=str, default='/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle',
                        metavar='datasetpath', help='path to the pickled dataset')
    parser.add_argument('--nepochs', type=int, default=200, metavar='int',
                        help='number of epochs (default: 200)')
    parser.add_argument('--lr', type=float, default=2e-5, metavar='float',
                        help='learning rate (default: 2e-5)')
    parser.add_argument('--weight_decay', type=float, default=0, metavar='float',
                        help='weight decay (default: 0)')
    parser.add_argument('--patience', type=int, default=10, metavar='int',
                        help='patience for early-stop (default: 10)')
    parser.add_argument('--log-interval', type=int, default=20, metavar='int',
                        help='how many batches to wait before printing training status')
    parser.add_argument('--log-file', type=str, default='../log/log_mBert.csv', metavar='str',
                        help='path to the log csv file')
    parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)')
    parser.add_argument('--force', action='store_true', default=False,
                        help='do not check if this experiment has already been run')
    parser.add_argument('--checkpoint-dir', type=str, default='../checkpoint', metavar='str',
                        help='path to the directory containing checkpoints')
    parser.add_argument('--plotmode', action='store_true', default=False,
                        help='in plot mode, executes a long run in order to generate enough data to produce '
                             'trend plots (test-each should be >0); this mode does not perform a final '
                             'evaluation on the test set')
    parser.add_argument('--test-each', type=int, default=0, metavar='int',
                        help='how many epochs to wait before invoking test (default: 0, only at the end)')
    parser.add_argument('--val-epochs', type=int, default=1, metavar='int',
                        help='number of training epochs to perform on the validation set once training is over (default 1)')
    opt = parser.parse_args()

    # Testing different parameters: note that these overrides take precedence over the CLI values
    opt.weight_decay = 0.01
    opt.patience = 5

    main()
    # TODO: refactor .cuda() -> .to(device) in order to check if the process is faster on CPU given the bigger batch size
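
# Hedged sketch for the TODO above (not part of the commit): a device-agnostic variant
# would resolve the device once and replace the .cuda() calls with .to(device), e.g.
#
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     model = get_model(n_out).to(device)
#     ...
#     out = model(batch.to(device))
#     loss = criterion(out[0], target.to(device))
#
# where n_out stands for the number of target classes (hard-coded as 73 above).
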
@@ -1,7 +1,11 @@
import pandas as pd
import numpy as np

df = pd.read_csv("/home/andreapdr/funneling_pdr/src/results/results.csv", delimiter='\t')
pivot = pd.pivot_table(df, values=['time', 'macrof1', 'microf1', 'macrok', 'microk'], index=['method', 'embed'], aggfunc=[np.mean, np.std])
print(pivot)
# df = pd.read_csv("/home/andreapdr/funneling_pdr/src/results/final_results.csv", delimiter='\t')
df = pd.read_csv("10run_rcv_final_results.csv", delimiter='\t')
pivot = pd.pivot_table(df, values=['macrof1', 'microf1', 'macrok', 'microk'], index=['method', 'id', 'optimp', 'zscore', 'l2', 'wescaler', 'pca', 'sif'], aggfunc=[np.mean, np.std])
with pd.option_context('display.max_rows', None):
    print(pivot.round(3))
print('Finished ...')

@@ -0,0 +1,11 @@
#!/usr/bin/env bash

dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
logfile=../log/log_Mbert_rcv.csv

runs='0 1 2 3 4 5 6 7 8 9'
for run in $runs
do
  dataset=$dataset_path$run.pickle
  python new_mbert.py --dataset $dataset --log-file $logfile --test-each 20
done

@@ -17,7 +17,7 @@ def get_weighted_average(We, x, w):

def compute_pc(X, npc=1):
    """
    Compute the principal components. DO NOT MAKE THE DATA ZERO MEAN!
    Compute the principal components.
    :param X: X[i,:] is a data point
    :param npc: number of principal components to remove
    :return: component_[i,:] is the i-th pc
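
# Hedged sketch, not the repository's code: SIF-style principal-component removal is
# commonly implemented with TruncatedSVD (fitted without centering the data) and by
# subtracting the projection of X onto the first `npc` components. The function name
# below is illustrative.
import numpy as np
from sklearn.decomposition import TruncatedSVD

def remove_pc_sketch(X, npc=1):
    svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    svd.fit(X)
    pc = svd.components_                 # shape (npc, dim)
    if npc == 1:
        return X - X.dot(pc.T) * pc      # rank-1 removal
    return X - X.dot(pc.T).dot(pc)       # remove the first npc components
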
@@ -1,4 +1,5 @@
import warnings
import time
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

@@ -143,6 +144,15 @@ class Index:

        embedding_parts.append(F)

        make_dumps = False
        if make_dumps:
            print('Dumping Embedding Matrices ...')
            import pickle
            with open(f'../dumps/dump_{self.lang}_rcv.pkl', 'wb') as outfile:
                pickle.dump((self.lang, embedding_parts, self.word2index), outfile)
            with open(f'../dumps/corpus_{self.lang}_rcv.pkl', 'wb') as outfile2:
                pickle.dump((self.lang, self.devel_raw, self.devel_target), outfile2)

        self.embedding_matrix = torch.cat(embedding_parts, dim=1)

        print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]')

@@ -155,6 +165,7 @@ class MultilingualIndex:
    def __init__(self):  # , add_language_trace=False):
        self.l_index = {}
        self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
        # self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True, max_features=25000)
        # self.add_language_trace=add_language_trace

    def index(self, l_devel_raw, l_devel_target, l_test_raw, l_pretrained_vocabulary):

@@ -189,10 +200,12 @@ class MultilingualIndex:
        # pretrained_embeddings = torch.cat([pretrained_embeddings, lang_trace], dim=1)

    def posterior_probabilities(self, max_training_docs_by_lang=5000):
    def posterior_probabilities(self, max_training_docs_by_lang=5000, store_posteriors=False, stored_post=False):
        # choose a maximum of "max_training_docs_by_lang" for training the calibrated SVMs
        timeit = time.time()
        lXtr = {l: Xtr for l, Xtr in self.get_lXtr().items()}
        lYtr = {l: Ytr for l, Ytr in self.l_train_target().items()}
        if not stored_post:
            for l in self.langs:
                n_elements = lXtr[l].shape[0]
                if n_elements > max_training_docs_by_lang:

@@ -211,8 +224,18 @@ class MultilingualIndex:
        lPtr = prob_embedder.transform(self.get_lXtr())
        lPva = prob_embedder.transform(self.get_lXva())
        lPte = prob_embedder.transform(self.get_lXte())

        print('[posteriors] done')
        # NB: check splits indices!
        if store_posteriors:
            import pickle
            with open('../dumps/posteriors_fulljrc.pkl', 'wb') as outfile:
                pickle.dump([lPtr, lPva, lPte], outfile)
            print('Successfully dumped posteriors!')
        else:
            import pickle
            with open('../dumps/posteriors_fulljrc.pkl', 'rb') as infile:
                lPtr, lPva, lPte = pickle.load(infile)
            print('Successfully loaded stored posteriors!')
        print(f'[posteriors] done in {time.time() - timeit}')
        return lPtr, lPva, lPte

    def get_lXtr(self):

@@ -6,7 +6,7 @@ from util.file import create_if_not_exist

class EarlyStopping:

    def __init__(self, model, patience=20, verbose=True, checkpoint='./checkpoint.pt'):
    def __init__(self, model, optimizer, patience=20, verbose=True, checkpoint='./checkpoint.pt'):
        # set patience to 0 or -1 to avoid stopping, but still keeping track of the best value and model parameters
        self.patience_limit = patience
        self.patience = patience

@@ -16,9 +16,10 @@ class EarlyStopping:
        self.stop_time = None
        self.checkpoint = checkpoint
        self.model = model
        self.optimizer = optimizer
        self.STOP = False

    def __call__(self, watch_score, epoch):
    def __call__(self, watch_score, epoch):  # model
        if self.STOP: return  # done

@@ -29,6 +30,9 @@ class EarlyStopping:
            if self.checkpoint:
                self.print(f'[early-stop] improved, saving model in {self.checkpoint}')
                torch.save(self.model, self.checkpoint)
                # with open(self.checkpoint)
                # torch.save({'state_dict': self.model.state_dict(),
                #             'optimizer_state_dict': self.optimizer.state_dict()}, self.checkpoint)
            else:
                self.print('[early-stop] improved')
            self.patience = self.patience_limit

@@ -46,6 +50,7 @@ class EarlyStopping:
        self.patience = self.patience_limit

    def restore_checkpoint(self):
        print(f'restoring best model from epoch {self.best_epoch}...')
        return torch.load(self.checkpoint)

    def print(self, msg):
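
# Hedged sketch, not the repository's code: the commented-out lines above suggest saving
# state_dicts instead of the whole model object. A matching save/restore pair could look
# like this (the helper names are illustrative; torch is already used in this module):
def save_checkpoint_sketch(model, optimizer, path):
    torch.save({'state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict()}, path)

def restore_checkpoint_sketch(model, optimizer, path):
    ckpt = torch.load(path)
    model.load_state_dict(ckpt['state_dict'])
    optimizer.load_state_dict(ckpt['optimizer_state_dict'])
    return model
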
@@ -5,8 +5,23 @@ import numpy as np

class PolylingualClassificationResults:
    def __init__(self, file, autoflush=True, verbose=False):
        self.file = file
        self.columns = ['method', 'learner', 'embed', 'embed_type', 'pca_s', 'pca_u', 'optimp', 'dataset', 'time',
                        'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
        self.columns = ['method',
                        'learner',
                        'optimp',
                        'sif',
                        'zscore',
                        'l2',
                        'wescaler',
                        'pca',
                        'id',
                        'dataset',
                        'time',
                        'lang',
                        'macrof1',
                        'microf1',
                        'macrok',
                        'microk',
                        'notes']
        self.autoflush = autoflush
        self.verbose = verbose
        if os.path.exists(file):

@@ -21,8 +36,8 @@ class PolylingualClassificationResults:
    def already_calculated(self, id):
        return (self.df['id'] == id).any()

    def add_row(self, method, learner, embed, embed_type, pca_s, pca_u, optimp, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
        s = pd.Series([method, learner, embed, embed_type, pca_s, pca_u, optimp, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
    def add_row(self, method, learner, optimp, sif, zscore, l2, wescaler, pca, id, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
        s = pd.Series([method, learner, optimp, sif, zscore, l2, wescaler, pca, id, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
        self.df = self.df.append(s, ignore_index=True)
        if self.autoflush: self.flush()
        self.tell(s.to_string())
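
# Hedged usage sketch (not part of the commit; all values are placeholders): logging one
# per-language result row with the updated schema defined above.
results = PolylingualClassificationResults('../log/results.csv')
results.add_row(method='funnelling', learner='svm', optimp=True, sif=True, zscore=True,
                l2=True, wescaler='none', pca=0, id='funnelling-rcv-run0',
                dataset='rcv1-2_run0', time=123.4, lang='nl',
                macrof1=0.81, microf1=0.85)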