baseline multilingual Bert
parent 22b7ea7e66
commit d1fdad5f6e
@@ -1,10 +1,7 @@
 import os
-import pickle
 from torchtext.vocab import Vectors
 import torch
 from abc import ABC, abstractmethod
-from embeddings.supervised import get_supervised_embeddings
-from util.decompositions import *
 from util.SIF_embed import *


@@ -35,122 +32,10 @@ class PretrainedEmbeddings(ABC):
         return source_idx, target_idx


-class WordEmbeddings:
-
-    def __init__(self, lang, we, worddim):
-        self.lang = lang
-        self.we = we
-        self.worddim = worddim
-        self.dimword = {v:k for k,v in self.worddim.items()}
-
-    @classmethod
-    def load(cls, basedir, lang, word_preprocessor=None, dopickle=True):
-        filename = 'wiki.multi.{}.vec'.format(lang)
-        we_path = os.path.join(basedir, filename)
-
-        if dopickle and os.path.exists(we_path + '.pkl'):
-            print('loading pkl in {}'.format(we_path + '.pkl'))
-            (worddim, we) = pickle.load(open(we_path + '.pkl', 'rb'))
-        else:
-            word_registry = set()
-            lines = open(we_path).readlines()
-            nwords, dims = [int(x) for x in lines[0].split()]
-            print('reading we of {} dimensions'.format(dims))
-            we = np.zeros((nwords, dims), dtype=float)
-            worddim = {}
-            index = 0
-            for i, line in enumerate(lines[1:]):
-                if (i + 1) % 100 == 0:
-                    print('\r{}/{}'.format(i + 1, len(lines)), end='')
-                word, *vals = line.split()
-                wordp = word_preprocessor(word) if word_preprocessor is not None else word
-                if wordp:
-                    wordp = wordp[0]
-                    if wordp in word_registry:
-                        print('warning: word <{}> generates a duplicate <{}> after preprocessing'.format(word,wordp))
-                    elif len(vals) == dims:
-                        worddim[wordp] = index
-                        we[index, :] = np.array(vals).astype(float)
-                        index += 1
-                # else:
-                #     print('warning: word <{}> generates an empty string after preprocessing'.format(word))
-            we = we[:index]
-            print('load {} words'.format(index))
-            if dopickle:
-                print('saving...')
-                pickle.dump((worddim, we), open(we_path + '.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
-
-        return WordEmbeddings(lang, we, worddim)
-
-    def vocabulary(self):
-        return set(self.worddim.keys())
-
-    def __getitem__(self, key):
-        return self.we[self.worddim[key]]
-
-    def dim(self):
-        return self.we.shape[1]
-
-    def __contains__(self, key):
-        return key in self.worddim
-
-    def most_similar(self, word_vect, k):
-        if word_vect.ndim == 1:
-            word_vect = word_vect.reshape(1,-1)
-        assert word_vect.shape[1] == self.dim(), 'inconsistent dimensions'
-
-        sim = np.dot(word_vect,self.we.T)
-        order = np.argsort(-1*sim, axis=1)[:,:k]
-
-        similar_words = [[self.dimword[order[vi,ki]] for ki in range(k)] for vi in range(word_vect.shape[0])]
-        sim_scores = sim[:,order]
-        return similar_words, sim_scores
-
-    def get_vectors(self, wordlist):
-        indexes = np.array([self.worddim[w] for w in wordlist])
-        return self.we[indexes]
-
-    def restrict(self, vocabulary):
-        # vocabulary is a set of terms to be kept
-        active_vocabulary = sorted([w for w in vocabulary if w in self.worddim])
-        lost = len(vocabulary)-len(active_vocabulary)
-        if lost > 0: # some terms are missing, so it will be replaced by UNK
-            print('warning: missing {} terms for lang {}'.format(lost, self.lang))
-        self.we = self.get_vectors(active_vocabulary)
-        assert self.we.shape[0] == len(active_vocabulary)
-        self.dimword={i:w for i,w in enumerate(active_vocabulary)}
-        self.worddim={w:i for i,w in enumerate(active_vocabulary)}
-        return self
-
-    @classmethod
-    def load_poly(cls, basedir, langs, lang_vocabularies=None, word_preprocessor=None):
-        if lang_vocabularies is None:
-            return cls.merge([cls.load(basedir,lang, word_preprocessor) for lang in langs])
-        else:
-            # assert all([l in lang_vocabularies for l in langs]), 'missing vocabulary for some languages'
-            return cls.merge([cls.load(basedir, lang, word_preprocessor).restrict(lang_vocabularies[lang]) for lang in langs])
-
-    @classmethod
-    def merge(cls, we_list):
-        assert all([isinstance(we, WordEmbeddings) for we in we_list]), \
-            'instances of {} expected'.format(WordEmbeddings.__name__)
-
-        polywe = []
-        worddim = {}
-        offset = 0
-        for we in we_list:
-            polywe.append(we.we)
-            worddim.update({'{}::{}'.format(we.lang, w):d+offset for w,d in we.worddim.items()})
-            offset = len(worddim)
-        polywe = np.vstack(polywe)
-
-        return WordEmbeddings(lang='poly', we=polywe, worddim=worddim)
-
-
 class FastTextWikiNews(Vectors):
 
     url_base = 'Cant auto-download MUSE embeddings'
-    path = '/storage/andrea/FUNNELING/embeddings/wiki.multi.{}.vec'
+    path = '../embeddings/wiki.multi.{}.vec'
     _name = '/wiki.multi.{}.vec'
 
     def __init__(self, cache, language="en", **kwargs):

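For context, the WordEmbeddings class removed above exposed load/restrict/load_poly/merge to build a single multilingual ('poly') space out of per-language MUSE vectors. A minimal usage sketch, assuming the class as defined in the removed lines; the base directory, languages, and vocabularies are hypothetical placeholders, not values from the repository:

    # Hypothetical driver for the removed WordEmbeddings class; paths, languages and
    # vocabularies below are placeholders, not values taken from the repository.
    basedir = '../embeddings'                                  # assumed location of wiki.multi.<lang>.vec files
    langs = ['en', 'it']
    vocab = {'en': {'house', 'dog'}, 'it': {'casa', 'cane'}}   # toy per-language vocabularies

    poly = WordEmbeddings.load_poly(basedir, langs, lang_vocabularies=vocab)
    vec = poly['en::house']            # rows of the merged matrix are keyed as 'lang::word'
    print(poly.dim(), len(poly.vocabulary()))
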
@@ -159,42 +44,13 @@ class FastTextWikiNews(Vectors):
         super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)


-class EmbeddingsAligned(Vectors):
-
-    def __init__(self, type, path, lang, voc):
-        # todo - rewrite as relative path
-        self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec'
-        self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT'
-        self.path = path + self.name.format(lang)
-        assert os.path.exists(path), f'pre-trained vectors not found in {path}'
-        super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path)
-        self.vectors = self.extract(voc)
-
-    def vocabulary(self):
-        return set(self.stoi.keys())
-
-    def extract(self, words):
-        source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi)
-        extraction = torch.zeros((len(words), self.dim))
-        extraction[source_idx] = self.vectors[target_idx]
-        return extraction
-
-    def reduce(self, dim):
-        pca = PCA(n_components=dim)
-        self.vectors = pca.fit_transform(self.vectors)
-        return
-
-
 class FastTextMUSE(PretrainedEmbeddings):
 
     def __init__(self, path, lang, limit=None):
         super().__init__()
         print(f'Loading fastText pretrained vectors for language {lang} from {path}')
         assert os.path.exists(path), print(f'pre-trained vectors not found in {path}')
         self.embed = FastTextWikiNews(path, lang, max_vectors=limit)
 
     def vocabulary(self):
         return set(self.embed.stoi.keys())

@@ -204,114 +60,8 @@ class FastTextMUSE(PretrainedEmbeddings):
     def extract(self, words):
         source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
         extraction = torch.zeros((len(words), self.dim()))
-        # extraction = torch.empty(len(words), self.dim()).normal_(0, 1)
         extraction[source_idx] = self.embed.vectors[target_idx]
         return extraction
-
-
-class StorageEmbeddings:
-
-    def __init__(self, path):
-        self.path = path
-        self.lang_U = dict()
-        self.lang_S = dict()
-
-    def _add_embeddings_unsupervised(self, type, docs, vocs, max_label_space=300):
-        for lang in docs.keys():
-            print(f'# [unsupervised-matrix {type}] for {lang}')
-            voc = np.asarray(list(zip(*sorted(vocs[lang].items(), key=lambda x: x[1])))[0])
-            self.lang_U[lang] = EmbeddingsAligned(type, self.path, lang, voc).vectors
-            print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n')
-            nC = self.lang_U[lang].shape[1]
-        if max_label_space == 0:
-            print(f'Computing optimal number of PCA components along matrices U')
-            optimal_n = get_optimal_dim(self.lang_U, 'U')
-            self.lang_U = run_pca(optimal_n, self.lang_U)
-        elif max_label_space < nC:
-            print(f'Applying PCA to unsupervised matrix U')
-            self.lang_U = run_pca(max_label_space, self.lang_U)
-
-        return
-
-    def _add_embeddings_supervised(self, docs, labels, reduction, max_label_space, voc):
-        only_well_represented_C = False # TODO testing
-        if only_well_represented_C:
-            labels = labels.copy()
-            min_prevalence = 0
-            print(f'# REDUCING LABELS TO min_prevalence = {min_prevalence} in order to compute WCE Matrix ...')
-            langs = list(docs.keys())
-            well_repr_cats = np.logical_and.reduce([labels[lang].sum(axis=0)>min_prevalence for lang in langs])
-            for lang in langs:
-                labels[lang] = labels[lang][:, well_repr_cats]
-                print(f'Target number reduced to: {labels[lang].shape[1]}\n')
-
-        for lang in docs.keys(): # compute supervised matrices S - then apply PCA
-            print(f'# [supervised-matrix] for {lang}')
-            self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang],
-                                                          reduction, max_label_space, voc[lang], lang)
-            nC = self.lang_S[lang].shape[1]
-            print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')
-
-        if max_label_space == 0: # looking for best n_components analyzing explained_variance_ratio
-            print(f'Computing optimal number of PCA components along matrices S')
-            optimal_n = get_optimal_dim(self.lang_S, 'S')
-            print(f'Applying PCA(n_components={optimal_n})')
-            self.lang_S = run_pca(optimal_n, self.lang_S)
-        elif max_label_space == -1: # applying pca to the verticals stacked matrix of WCE embeddings
-            print(f'Computing PCA on vertical stacked WCE embeddings')
-            languages = self.lang_S.keys()
-            _temp_stack = np.vstack([self.lang_S[lang] for lang in languages]) # stacking WCE vertically
-            stacked_pca = PCA(n_components=_temp_stack.shape[1])
-            stacked_pca.fit(_temp_stack)
-            best_n = None
-            _r = stacked_pca.explained_variance_ratio_
-            _r = np.cumsum(_r)
-            plt.plot(_r, label='Stacked Supervised')
-            for i in range(len(_r) - 1, 1, -1):
-                delta = _r[i] - _r[i - 1]
-                if delta > 0:
-                    best_n = i
-                    break
-            plt.show()
-            stacked_pca = PCA(n_components=best_n)
-            stacked_pca.fit(_temp_stack)
-            print(f'Applying PCA(n_components={i}')
-            for lang in languages:
-                self.lang_S[lang] = stacked_pca.transform(self.lang_S[lang])
-        elif max_label_space <= nC: # less or equal in order to reduce it to the same initial dimension
-            print(f'Computing PCA on Supervised Matrix PCA(n_components:{max_label_space})')
-            self.lang_S = run_pca(max_label_space, self.lang_S)
-
-        return
-
-    def SIF_embeddings(self):
-        print('todo') # TODO
-
-    def _concatenate_embeddings(self, docs):
-        _r = dict()
-        for lang in self.lang_U.keys():
-            _r[lang] = np.hstack((docs[lang].dot(self.lang_U[lang]), docs[lang].dot(self.lang_S[lang])))
-        return _r
-
-    def fit(self, config, docs, vocs, labels):
-        if config['unsupervised']:
-            self._add_embeddings_unsupervised(config['we_type'], docs, vocs, config['dim_reduction_unsupervised'])
-        if config['supervised']:
-            self._add_embeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs)
-        return self
-
-    def predict(self, config, docs):
-        if config['supervised'] and config['unsupervised']:
-            return self._concatenate_embeddings(docs)
-            # todo testing applying pca to hstack muse + wce
-            # _reduced = self._concatenate_embeddings(docs)
-            # return run_pca(300, _reduced)
-        elif config['supervised']:
-            _r = dict()
-            for lang in docs.keys():
-                _r[lang] = docs[lang].dot(self.lang_S[lang])
-        else:
-            _r = dict()
-            for lang in docs.keys():
-                _r[lang] = docs[lang].dot(self.lang_U[lang])
-
-        return _r

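The removed StorageEmbeddings class was driven by a plain configuration dictionary; the keys it reads can be inferred from its fit() method above. A sketch of that flow with illustrative values only (docs, vocs and labels stand for the per-language {lang: ...} structures the class expects; nothing below is taken verbatim from the repository):

    # Hypothetical configuration for the removed StorageEmbeddings class.
    config = {
        'unsupervised': True,                # build the language-aligned matrices U
        'we_type': 'MUSE',
        'dim_reduction_unsupervised': 300,   # 0 = search an optimal PCA size, otherwise PCA(n) when n < dims
        'supervised': True,                  # build the WCE matrices S
        'reduction': 'PCA',
        'max_label_space': 300,              # 0 = optimal n, -1 = PCA on the vertically stacked WCE matrices
    }
    storage = StorageEmbeddings(path='../embeddings').fit(config, docs, vocs, labels)
    doc_embeddings = storage.predict(config, docs)   # {lang: document-level embedding matrix}
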
@@ -1,103 +1,102 @@
 from abc import ABC, abstractmethod
 import torch, torchtext
-import gensim
-import os
+# import gensim
+# import os
 import numpy as np


-class KeyedVectors:
-
-    def __init__(self, word2index, weights):
-        assert len(word2index)==weights.shape[0], 'wrong number of dimensions'
-        index2word = {i:w for w,i in word2index.items()}
-        assert len([i for i in range(len(index2word)) if i not in index2word])==0, 'gaps in indexing not allowed'
-        self.word2index = word2index
-        self.index2word = index2word
-        self.weights = weights
-
-    def extract(self, words):
-        dim = self.weights.shape[1]
-        v_size = len(words)
-
-        source_idx, target_idx = [], []
-        for i,word in enumerate(words):
-            if word not in self.word2index: continue
-            j = self.word2index[word]
-            source_idx.append(i)
-            target_idx.append(j)
-
-        extraction = np.zeros((v_size, dim))
-        extraction[np.asarray(source_idx)] = self.weights[np.asarray(target_idx)]
-
-        return extraction
+# class KeyedVectors:
+#
+#     def __init__(self, word2index, weights):
+#         assert len(word2index)==weights.shape[0], 'wrong number of dimensions'
+#         index2word = {i:w for w,i in word2index.items()}
+#         assert len([i for i in range(len(index2word)) if i not in index2word])==0, 'gaps in indexing not allowed'
+#         self.word2index = word2index
+#         self.index2word = index2word
+#         self.weights = weights
+#
+#     def extract(self, words):
+#         dim = self.weights.shape[1]
+#         v_size = len(words)
+#
+#         source_idx, target_idx = [], []
+#         for i,word in enumerate(words):
+#             if word not in self.word2index: continue
+#             j = self.word2index[word]
+#             source_idx.append(i)
+#             target_idx.append(j)
+#
+#         extraction = np.zeros((v_size, dim))
+#         extraction[np.asarray(source_idx)] = self.weights[np.asarray(target_idx)]
+#
+#         return extraction


-class PretrainedEmbeddings(ABC):
-
-    def __init__(self):
-        super().__init__()
-
-    @abstractmethod
-    def vocabulary(self): pass
-
-    @abstractmethod
-    def dim(self): pass
-
-    @classmethod
-    def reindex(cls, words, word2index):
-        source_idx, target_idx = [], []
-        for i, word in enumerate(words):
-            if word not in word2index: continue
-            j = word2index[word]
-            source_idx.append(i)
-            target_idx.append(j)
-        source_idx = np.asarray(source_idx)
-        target_idx = np.asarray(target_idx)
-        return source_idx, target_idx
+# class PretrainedEmbeddings(ABC):
+#
+#     def __init__(self):
+#         super().__init__()
+#
+#     @abstractmethod
+#     def vocabulary(self): pass
+#
+#     @abstractmethod
+#     def dim(self): pass
+#
+#     @classmethod
+#     def reindex(cls, words, word2index):
+#         source_idx, target_idx = [], []
+#         for i, word in enumerate(words):
+#             if word not in word2index: continue
+#             j = word2index[word]
+#             source_idx.append(i)
+#             target_idx.append(j)
+#         source_idx = np.asarray(source_idx)
+#         target_idx = np.asarray(target_idx)
+#         return source_idx, target_idx


-class GloVe(PretrainedEmbeddings):
-
-    def __init__(self, setname='840B', path='./vectors_cache', max_vectors=None):
-        super().__init__()
-        print(f'Loading GloVe pretrained vectors from torchtext')
-        self.embed = torchtext.vocab.GloVe(setname, cache=path, max_vectors=max_vectors)
-        print('Done')
-
-    def vocabulary(self):
-        return set(self.embed.stoi.keys())
-
-    def dim(self):
-        return self.embed.dim
-
-    def extract(self, words):
-        source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
-        extraction = torch.zeros((len(words), self.dim()))
-        extraction[source_idx] = self.embed.vectors[target_idx]
-        return extraction
+# class GloVe(PretrainedEmbeddings):
+#
+#     def __init__(self, setname='840B', path='./vectors_cache', max_vectors=None):
+#         super().__init__()
+#         print(f'Loading GloVe pretrained vectors from torchtext')
+#         self.embed = torchtext.vocab.GloVe(setname, cache=path, max_vectors=max_vectors)
+#         print('Done')
+#
+#     def vocabulary(self):
+#         return set(self.embed.stoi.keys())
+#
+#     def dim(self):
+#         return self.embed.dim
+#
+#     def extract(self, words):
+#         source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
+#         extraction = torch.zeros((len(words), self.dim()))
+#         extraction[source_idx] = self.embed.vectors[target_idx]
+#         return extraction


-class Word2Vec(PretrainedEmbeddings):
-
-    def __init__(self, path, limit=None):
-        super().__init__()
-        print(f'Loading word2vec pretrained vectors from {path}')
-        assert os.path.exists(path), print(f'pre-trained keyed vectors not found in {path}')
-        self.embed = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True, limit=limit)
-        self.word2index={w:i for i,w in enumerate(self.embed.index2word)}
-        print('Done')
-
-    def vocabulary(self):
-        return set(self.word2index.keys())
-
-    def dim(self):
-        return self.embed.vector_size
-
-    def extract(self, words):
-        source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.word2index)
-        extraction = np.zeros((len(words), self.dim()))
-        extraction[source_idx] = self.embed.vectors[target_idx]
-        extraction = torch.from_numpy(extraction).float()
-        return extraction
+# class Word2Vec(PretrainedEmbeddings):
+#
+#     def __init__(self, path, limit=None):
+#         super().__init__()
+#         print(f'Loading word2vec pretrained vectors from {path}')
+#         assert os.path.exists(path), print(f'pre-trained keyed vectors not found in {path}')
+#         self.embed = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True, limit=limit)
+#         self.word2index={w:i for i,w in enumerate(self.embed.index2word)}
+#         print('Done')
+#
+#     def vocabulary(self):
+#         return set(self.word2index.keys())
+#
+#     def dim(self):
+#         return self.embed.vector_size
+#
+#     def extract(self, words):
+#         source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.word2index)
+#         extraction = np.zeros((len(words), self.dim()))
+#         extraction[source_idx] = self.embed.vectors[target_idx]
+#         extraction = torch.from_numpy(extraction).float()
+#         return extraction

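The reindex/extract pair commented out above is the same lookup pattern kept alive in FastTextMUSE: find which requested words exist in the pretrained vocabulary, copy those rows, and leave all-zero rows for out-of-vocabulary words. A self-contained illustration with a made-up three-word vocabulary:

    import numpy as np
    import torch

    # Toy pretrained store (made-up words and 4-dimensional vectors).
    stoi = {'dog': 0, 'cat': 1, 'house': 2}
    vectors = torch.tensor([[1., 0., 0., 0.],
                            [0., 1., 0., 0.],
                            [0., 0., 1., 0.]])

    def reindex(words, word2index):
        # Same logic as PretrainedEmbeddings.reindex: positions of the known words
        # in `words`, and the matching rows in the pretrained matrix.
        source_idx, target_idx = [], []
        for i, word in enumerate(words):
            if word not in word2index:
                continue
            source_idx.append(i)
            target_idx.append(word2index[word])
        return np.asarray(source_idx), np.asarray(target_idx)

    words = ['cat', 'unicorn', 'dog']                 # 'unicorn' is out of vocabulary
    source_idx, target_idx = reindex(words, stoi)
    extraction = torch.zeros((len(words), vectors.shape[1]))
    extraction[source_idx] = vectors[target_idx]      # the OOV row stays all-zero
    print(extraction)
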
@@ -1,7 +1,5 @@
 from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square
 import numpy as np
-# from sklearn.decomposition import PCA
-# from sklearn.manifold import TSNE


 def zscores(x, axis=0): #scipy.stats.zscores does not avoid division by 0, which can indeed occur

@@ -69,31 +67,6 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, la

     return F
-
-    # if nC >= max_label_space:
-    #     if reduction == 'PCA':
-    #         if max_label_space == 0:
-    #             pca = PCA(n_components=Y.shape[1])
-    #             pca = pca.fit(F)
-    #             return pca.explained_variance_ratio_
-    #
-    #         print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
-    #               f'Applying PCA(n_components={max_label_space})')
-    #         pca = PCA(n_components=max_label_space)
-    #         F = pca.fit_transform(F)
-    #     elif reduction == 'TSNE':
-    #         print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
-    #               f'Applying t-SNE(n_components={max_label_space})')
-    #         tsne = TSNE(n_components=max_label_space)
-    #         F = tsne.fit_transform(F)
-    #     elif reduction == 'tSVD':
-    #         print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
-    #               f'Applying truncatedSVD(n_components={max_label_space})')
-    #         tSVD = TruncatedSVD(n_components=max_label_space)
-    #         F = tSVD.fit_transform(F)
-    #
-    # return F

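The comment on zscores above points out that scipy's z-score divides by zero on constant columns; the function body itself lies outside this hunk, so the snippet below is only one common way to realize that intent (an assumption, not the repository's code):

    import numpy as np

    def zscores_safe(x, axis=0):
        # Standardize along `axis`, leaving constant columns (std == 0) at zero
        # instead of propagating NaN/inf.
        x = np.asarray(x, dtype=float)
        mean = x.mean(axis=axis, keepdims=True)
        std = x.std(axis=axis, keepdims=True)
        std[std == 0] = 1.0
        return (x - mean) / std

    print(zscores_safe([[1., 5.], [3., 5.]]))   # the constant second column maps to zeros
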
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
+logfile=../log/log10run_dl_jrc.csv
+
+runs='0 1 2 3 4 5 6 7 8 9'
+for run in $runs
+do
+dataset=$dataset_path$run.pickle
+python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
+done

@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
+logfile=../log/log10run_dl_rcv.csv
+
+runs='0 1 2 3 4 5 6 7 8 9'
+for run in $runs
+do
+dataset=$dataset_path$run.pickle
+python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
+done

@@ -0,0 +1,12 @@
+dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
+logfile=./results/10run_jrc_final_results.csv
+
+runs='0 1 2 3 4 5 6 7 8 9'
+for run in $runs
+do
+dataset=$dataset_path$run.pickle
+python main_multimodal_cls.py $dataset -o $logfile -P -z -c --l2
+python main_multimodal_cls.py $dataset -o $logfile -S -z -c --l2
+python main_multimodal_cls.py $dataset -o $logfile -U -z -c --l2
+
+done

@@ -0,0 +1,16 @@
+dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
+logfile=./results/funnelling_10run_jrc_CIKM.csv
+
+runs='6 7 8 9' #0 1 2 3 4 5
+for run in $runs
+do
+dataset=$dataset_path$run.pickle
+#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 Pr(views) concatenated (done up to run5)
+python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 views concatenated
+#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -a -z --l2 --allprob
+#python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -a -z --l2 --allprob
+#python main_multimodal_cls.py $dataset -o $logfile -P -S -c -r -z --l2 --allprob
+#python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -z --l2 --allprob
+#python main_multimodal_cls.py $dataset -o $logfile -c -P -U -r -z --l2
+#python main_multimodal_cls.py $dataset -o $logfile -c -P -U -S -r -z --l2
+done

@@ -0,0 +1,15 @@
+dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
+logfile=./results/10run_rcv_final_results.csv
+
+runs='0 1 2 3 4 5 6 7 8 9'
+
+for run in $runs
+do
+dataset=$dataset_path$run.pickle
+python main_multimodal_cls.py $dataset -o $logfile -P -z -c --l2
+python main_multimodal_cls.py $dataset -o $logfile -S -z -c --l2
+python main_multimodal_cls.py $dataset -o $logfile -U -z -c --l2
+
+done

@@ -0,0 +1,16 @@
+dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
+logfile=./results/funnelling_10run_rcv_CIKM_allprob_concatenated.csv
+
+runs='0 1 2 3 4 5 6 7 8 9'
+for run in $runs
+do
+dataset=$dataset_path$run.pickle
+#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 Pr(views) concatenated
+python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 views concatenated
+#python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -a -z --l2 --allprob
+#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -a -z --l2 --allprob
+#python main_multimodal_cls.py $dataset -o $logfile -P -S -c -r -z --l2 --allprob
+#python main_multimodal_cls.py $dataset -o $logfile -P -U -c -r -z --l2 --allprob
+#python main_multimodal_cls.py $dataset -o $logfile -c -P -U -r -z --l2
+#python main_multimodal_cls.py $dataset -o $logfile -c -P -U -S -r -z --l2
+done

@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+
+dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
+logfile=./results/final_combinations_jrc.csv
+#A.2: ensembling feature sets (combinations of posteriors, wce, muse):
+# - exploring different ways of putting different feature sets together: concatenation, FeatureSetToPosteriors, averaging, voting, etc...
+#   (no one seems to improve over standard funnelling [the improved version after A.1] with posteriors probabilities...)
+
+# aggregation=concatenation
+#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2
+#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2
+#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2
+#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2
+#
+
+##FeatureSetToPosteriors (aggregation mean)
+python main_multimodal_cls.py $dataset -o $logfile -P -U -r -a -z --l2 --allprob
+python main_multimodal_cls.py $dataset -o $logfile -P -S -r -a -z --l2 --allprob
+python main_multimodal_cls.py $dataset -o $logfile -U -S -r -a -z --l2 --allprob
+python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -a -z --l2 --allprob
+
+##FeatureSetToPosteriors
+#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2 --allprob
+#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2 --allprob
+#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2 --allprob
+#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2 --allprob
+
+#MajorityVoting
+#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -r
+#python main_majorityvoting_cls.py $dataset -o $logfile -P -S -r
+#python main_majorityvoting_cls.py $dataset -o $logfile -U -S -r
+#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -S -r
+
+

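The comments in this script distinguish two ways of combining the per-view representations (posteriors -P, MUSE -U, WCE -S): concatenating the views into one feature space versus averaging them once every view has been mapped to class posteriors. A toy illustration of the two aggregations (shapes and values are arbitrary, not the experiment's data):

    import numpy as np

    rng = np.random.default_rng(0)
    n_docs, n_classes = 4, 3
    # Made-up posterior matrices produced by three views for the same documents.
    view_P = rng.random((n_docs, n_classes))
    view_U = rng.random((n_docs, n_classes))
    view_S = rng.random((n_docs, n_classes))

    concatenated = np.hstack([view_P, view_U, view_S])    # (4, 9): one wide feature space
    averaged = np.mean([view_P, view_U, view_S], axis=0)  # (4, 3): mean of the per-view posteriors
    print(concatenated.shape, averaged.shape)
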
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+
+dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
+logfile=./results/final_combinations_rcv.csv
+#A.2: ensembling feature sets (combinations of posteriors, wce, muse):
+# - exploring different ways of putting different feature sets together: concatenation, FeatureSetToPosteriors, averaging, voting, etc...
+#   (no one seems to improve over standard funnelling [the improved version after A.1] with posteriors probabilities...)
+
+# aggregation=concatenation
+#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2
+#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2
+#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2
+#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2
+#
+
+##FeatureSetToPosteriors (aggregation mean)
+python main_multimodal_cls.py $dataset -o $logfile -P -U -r -a -z --l2 --allprob
+python main_multimodal_cls.py $dataset -o $logfile -P -S -r -a -z --l2 --allprob
+python main_multimodal_cls.py $dataset -o $logfile -U -S -r -a -z --l2 --allprob
+python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -a -z --l2 --allprob
+
+##FeatureSetToPosteriors
+#python main_multimodal_cls.py $dataset -o $logfile -P -U -r -z --l2 --allprob
+#python main_multimodal_cls.py $dataset -o $logfile -P -S -r -z --l2 --allprob
+#python main_multimodal_cls.py $dataset -o $logfile -U -S -r -z --l2 --allprob
+#python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -z --l2 --allprob
+
+#MajorityVoting
+#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -r
+#python main_majorityvoting_cls.py $dataset -o $logfile -P -S -r
+#python main_majorityvoting_cls.py $dataset -o $logfile -U -S -r
+#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -S -r

@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+
+logfile=../log/log_pre_jrc.csv
+dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
+python main_deep_learning.py $dataset --log-file $logfile --pretrained --plotmode --test-each 20
+python main_deep_learning.py $dataset --log-file $logfile --pretrained --tunable --plotmode --test-each 20
+python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 128 --plotmode --test-each 20
+python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 128 --tunable --plotmode --test-each 20
+python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 256 --plotmode --test-each 20
+python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 256 --tunable --plotmode --test-each 20
+
+python main_deep_learning.py $dataset --log-file $logfile --supervised --plotmode --test-each 20
+python main_deep_learning.py $dataset --log-file $logfile --supervised --tunable --plotmode --test-each 20
+python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 128 --plotmode --test-each 20
+python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 128 --tunable --plotmode --test-each 20
+python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 256 --plotmode --test-each 20
+python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 256 --tunable --plotmode --test-each 20
+
+python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --plotmode --test-each 20
+python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --tunable --plotmode --test-each 20
+python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 128 --plotmode --test-each 20
+python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 128 --tunable --plotmode --test-each 20
+python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 256 --plotmode --test-each 20
+python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 256 --tunable --plotmode --test-each 20
+
+python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --plotmode --test-each 20
+python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
+python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 128 --plotmode --test-each 20
+python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 128 --tunable --plotmode --test-each 20
+python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 256 --plotmode --test-each 20
+python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 256 --tunable --plotmode --test-each 20

@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+
+dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
+python main_deep_learning.py $dataset --pretrained --plotmode --test-each 20
+python main_deep_learning.py $dataset --pretrained --tunable --plotmode --test-each 20
+python main_deep_learning.py $dataset --pretrained --hidden 128 --plotmode --test-each 20
+python main_deep_learning.py $dataset --pretrained --hidden 128 --tunable --plotmode --test-each 20
+python main_deep_learning.py $dataset --pretrained --hidden 256 --plotmode --test-each 20
+python main_deep_learning.py $dataset --pretrained --hidden 256 --tunable --plotmode --test-each 20
+
+python main_deep_learning.py $dataset --supervised --plotmode --test-each 20
+python main_deep_learning.py $dataset --supervised --tunable --plotmode --test-each 20
+python main_deep_learning.py $dataset --supervised --hidden 128 --plotmode --test-each 20
+python main_deep_learning.py $dataset --supervised --hidden 128 --tunable --plotmode --test-each 20
+python main_deep_learning.py $dataset --supervised --hidden 256 --plotmode --test-each 20
+python main_deep_learning.py $dataset --supervised --hidden 256 --tunable --plotmode --test-each 20
+
+python main_deep_learning.py $dataset --pretrained --supervised --plotmode --test-each 20
+python main_deep_learning.py $dataset --pretrained --supervised --tunable --plotmode --test-each 20
+python main_deep_learning.py $dataset --pretrained --supervised --hidden 128 --plotmode --test-each 20
+python main_deep_learning.py $dataset --pretrained --supervised --hidden 128 --tunable --plotmode --test-each 20
+python main_deep_learning.py $dataset --pretrained --supervised --hidden 256 --plotmode --test-each 20
+python main_deep_learning.py $dataset --pretrained --supervised --hidden 256 --tunable --plotmode --test-each 20
+
+python main_deep_learning.py $dataset --pretrained --supervised --posteriors --plotmode --test-each 20
+python main_deep_learning.py $dataset --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
+python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 128 --plotmode --test-each 20
+python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 128 --tunable --plotmode --test-each 20
+python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 256 --plotmode --test-each 20
+python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 256 --tunable --plotmode --test-each 20

@@ -0,0 +1,16 @@
+dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle
+seeds='5' #2 3 4 5 6 7 8 9 10'
+for seed in $seeds
+do
+#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce.csv --supervised --seed $seed
+#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_trainable.csv --supervised --tunable --seed $seed
+python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_post_wce_muse_static.csv --posteriors --supervised --pretrained --seed $seed --force
+
+#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_muse.csv --pretrained --seed $seed
+#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_muse_trainable.csv --pretrained --tunable --seed $seed
+
+#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_muse.csv --supervised --pretrained --seed $seed
+#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_muse_trainable40000.csv --supervised --pretrained --tunable --seed $seed
+#python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_post_wce_muse_trainable.csv --posteriors --supervised --pretrained --tunable --seed $seed --force
+
+done

@@ -0,0 +1,20 @@
+dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
+seeds='1 ' #2 3 4 5' # 6 7 8 9 10'
+for seed in $seeds
+do
+#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce.csv --supervised --seed $seed
+#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_trainable.csv --supervised --tunable --seed $seed
+python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_static_plotmode.csv --posteriors --supervised --pretrained --seed $seed --plotmode --test-each 200
+
+
+
+#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_muse.csv --pretrained --seed $seed
+#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_muse_trainable.csv --pretrained --tunable --seed $seed
+
+#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_muse.csv --supervised --pretrained --seed $seed
+#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_muse_trainable.csv --supervised --pretrained --tunable --seed $seed
+
+# python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_static.csv --posteriors --supervised --pretrained --seed $seed
+# python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_trainable_plotmode.csv --posteriors --supervised --pretrained --tunable --seed $seed --plotmode --test-each 200
+#python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_trainable.csv --posteriors --supervised --pretrained --tunable --seed $seed
+done

@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+
+dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
+
+######################################## POSTERIORS
+# Posteriors
+python main_multimodal_cls.py $dataset -P # + zscore
+python main_multimodal_cls.py $dataset -P -z # +l2norm
+python main_multimodal_cls.py $dataset -P -z --l2 # +feature weight
+
+
+######################################### WCE
+#WCE supervised
+python main_multimodal_cls.py $dataset -S # + zscore
+python main_multimodal_cls.py $dataset -S -z # +l2norm
+python main_multimodal_cls.py $dataset -S -z --l2 # +feature weight
+python main_multimodal_cls.py $dataset -S -z -r --l2 # + SIF - PCA
+
+python main_multimodal_cls.py $dataset -S -z -p 250 --l2 # +feature weight + pca
+python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 # + SIF
+
+python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight ig # -feature weight
+python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight ig
+python main_multimodal_cls.py $dataset -S -z -p 250 --l2 --feat-weight ig # + pca
+python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 --feat-weight ig
+
+
+python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight pmi
+python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight pmi
+python main_multimodal_cls.py $dataset -S -z -p 250 --l2 --feat-weight pmi
+python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 --feat-weight pmi
+
+################################# MUSE
+
+# MUSE unsupervised
+python main_multimodal_cls.py $dataset -U # + zscore
+python main_multimodal_cls.py $dataset -U -z # +l2norm
+python main_multimodal_cls.py $dataset -U -z --l2 # +feature weight
+python main_multimodal_cls.py $dataset -U -z -r --l2 # + SIF - PCA
+
+python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight ig # -feature weight + pca
+python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight ig
+
+python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight pmi
+python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight pmi

@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+
+dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
+
+######################################## POSTERIORS
+# Posteriors
+python main_multimodal_cls.py $dataset -P # + zscore
+python main_multimodal_cls.py $dataset -P -z # +l2norm
+python main_multimodal_cls.py $dataset -P -z --l2 # +feature weight
+
+
+######################################### WCE
+#WCE supervised
+python main_multimodal_cls.py $dataset -S # + zscore
+python main_multimodal_cls.py $dataset -S -z # +l2norm
+python main_multimodal_cls.py $dataset -S -z --l2 # +feature weight
+python main_multimodal_cls.py $dataset -S -z -r --l2 # + SIF - PCA
+
+python main_multimodal_cls.py $dataset -S -z -p 50 --l2 # +feature weight + pca
+python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 # + SIF
+
+python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight ig # -feature weight
+python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight ig
+python main_multimodal_cls.py $dataset -S -z -p 50 --l2 --feat-weight ig # + pca
+python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 --feat-weight ig
+
+
+python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight pmi
+python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight pmi
+python main_multimodal_cls.py $dataset -S -z -p 50 --l2 --feat-weight pmi
+python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 --feat-weight pmi
+
+################################# MUSE
+
+# MUSE unsupervised
+python main_multimodal_cls.py $dataset -U # + zscore
+python main_multimodal_cls.py $dataset -U -z # +l2norm
+python main_multimodal_cls.py $dataset -U -z --l2 # +feature weight
+python main_multimodal_cls.py $dataset -U -z -r --l2 # + SIF - PCA
+
+python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight ig # -feature weight + pca
+python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight ig
+
+python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight pmi
+python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight pmi

@@ -0,0 +1,6 @@
+dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
+seeds='1 2 3 4 5 6 7 8 9 10'
+for seed in $seeds
+do
+python main_deep_learning.py $dataset --log-file ../log/time_GRU.csv --supervised --nepochs 50 --seed $seed
+done

@@ -1,15 +1,15 @@
 import numpy as np
 import time
-from embeddings.embeddings import WordEmbeddings, StorageEmbeddings
+# from embeddings.embeddings import WordEmbeddings, StorageEmbeddings
 from scipy.sparse import issparse
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.model_selection import GridSearchCV
-from sklearn.model_selection import KFold
+# from sklearn.model_selection import KFold
 from joblib import Parallel, delayed
-from sklearn.feature_extraction.text import TfidfVectorizer
-from transformers.StandardizeTransformer import StandardizeTransformer
-from sklearn.decomposition import PCA
-from models.cnn_class_bu import CNN_pdr
+# from sklearn.feature_extraction.text import TfidfVectorizer
+# from util_transformers.StandardizeTransformer import StandardizeTransformer
+# from sklearn.decomposition import PCA
+# from models.cnn_class_bu import CNN_pdr


 def _sort_if_sparse(X):

@@ -40,154 +40,154 @@ class TrivialRejector:
     def best_params(self): return {}


-class FunnellingPolylingualClassifier:
-    """
-    This classifier projects each document d into a language-independent feature space where each dimension fi is the
-    decision score phi_l(d,ci) of an auxiliary classifier phi_l trained on category ci for documents in language l;
-    then trains one single classifier for all documents in this space, irrespective of their originary language
-    """
-    def __init__(self, first_tier_learner, meta_learner, first_tier_parameters=None, meta_parameters=None, folded_projections=1,
-                 calmode='cal', n_jobs=-1):
-        """
-        :param first_tier_learner: the learner used in the first-tier level
-        :param meta_learner: the learner used in the second-tier level
-        :param first_tier_parameters: parameters for the learner in the doc_projector
-        :param meta_parameters: parameters for the learner in the z-space
-        :param folded_projections: if 1 then the model trains the auxiliar classifiers with all training data and
-        :param calmode: 'cal' to calibrate the base classifiers, 'nocal' to use the decision_function instead, or
-        :param n_jobs: number of parallel threads
-        'sigmoid' to use the sigmoid of the decision_function
-        projects the data before training the final classifier; if greater than one, the training set is split in as
-        many folds as indicated, and the projected space is composed by concatenating each fold prediction based on
-        models trained on the remaining folds. This should increase the generality of the space to unseen data.
-        """
-        assert folded_projections>0, "positive number of folds expected"
-        assert calmode in ['cal','nocal','sigmoid'], 'unknown calmode'
-        assert calmode!='cal' or first_tier_learner.probability, 'calmode=cal requires the learner to have probability=True'
-
-        self.fist_tier_learner = first_tier_learner
-        self.meta_learner = meta_learner
-        self.fist_tier_parameters=first_tier_parameters
-        self.meta_parameters = meta_parameters
-        self.doc_projector = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs)
-        self.doc_projector_bu = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs)
-        self.folded_projections = folded_projections
-        self.n_jobs = n_jobs
-        self.calmode = calmode
-
-    def _projection(self, doc_projector, lX):
-        """
-        Decides the projection function to be applied; predict_proba if the base classifiers are calibrated or
-        decision_function if otherwise
-        :param doc_projector: the document projector (a NaivePolylingualClassifier)
-        :param lX: {lang:matrix} to train
-        :return: the projection, applied with predict_proba or decision_function
-        """
-        if self.calmode=='cal':
-            return doc_projector.predict_proba(lX)
-        else:
-            l_decision_scores = doc_projector.decision_function(lX)
-            if self.calmode=='sigmoid':
-                def sigmoid(x): return 1 / (1 + np.exp(-x))
-                for lang in l_decision_scores.keys():
-                    l_decision_scores[lang] = sigmoid(l_decision_scores[lang])
-            return l_decision_scores
-
-    def _get_zspace(self, lXtr, lYtr, lXproj=None, lYproj=None):
-        """
-        Produces the vector space of posterior probabilities (in case the first-tier is calibrated) or of
-        decision scores (if otherwise). This space is here named zspace.
-        :param lXtr: {lang:matrix} to train
-        :param lYtr: {lang:labels} to train
-        :param lXproj: {lang:matrix} to project (if None, then projects the lXtr)
-        :param lYproj: {lang:labels} to stack in the same order (if None, then lYtr will be stacked)
-        :return: the projection of lXproj documents into the Z-space defined by the confidence scores of language-specific
-        models trained on lXtr, and the lYproj labels stacked consistently
-        """
-        repair_empty_folds = True
-        if lXproj is None and lYproj is None:
-            lXproj, lYproj = lXtr, lYtr
-            repair_empty_folds = False
-
-        print('fitting the projectors... {}'.format(lXtr.keys()))
-        self.doc_projector.fit(lXtr, lYtr)
-
-        print('projecting the documents')
-        langs = list(lXtr.keys())
-        lZ = self._projection(self.doc_projector, lXproj)
-
-        # if repair_empty_folds: #empty folds are replaced by the posterior probabilities generated by the non-folded version
-        empty_categories = self.doc_projector.empty_categories
-        lZ_bu = self._projection(self.doc_projector_bu, lXproj)
-
-        for lang in langs:
-            repair = empty_categories[lang]
-            lZ[lang][:,repair] = lZ_bu[lang][:,repair]
-
-        Z = np.vstack([lZ[lang] for lang in langs]) # Z is the language independent space
-        zy = np.vstack([lYproj[lang] for lang in langs])
-        return Z, zy
-
-    def _get_zspace_folds(self, lX, ly):
-        self.doc_projector_bu.fit(lX, ly)
-
-        print('split of {} folds'.format(self.folded_projections))
-        skf = KFold(n_splits=self.folded_projections, shuffle=True)
-
-        Z, zy = [], []
-        lfold = {lang: list(skf.split(lX[lang], ly[lang])) for lang in lX.keys()}
-        for fold in range(self.folded_projections):
-            print('fitting the projectors ({}/{})...'.format(fold + 1, self.folded_projections))
-            lfoldXtr, lfoldYtr = {}, {}
-            lfoldXte, lfoldYte = {}, {}
-            for lang in lX.keys():
-                train, test = lfold[lang][fold]
-                lfoldXtr[lang] = lX[lang][train]
-                lfoldYtr[lang] = ly[lang][train]
-                lfoldXte[lang] = lX[lang][test]
-                lfoldYte[lang] = ly[lang][test]
-            Zfold, zYfold = self._get_zspace(lfoldXtr, lfoldYtr, lfoldXte, lfoldYte)
-            Z.append(Zfold)
-            zy.append(zYfold)
-        # compose the Z-space as the union of all folded predictions
-        Z = np.vstack(Z)
-        zy = np.vstack(zy)
-        # refit the document projector with all examples to have a more reliable projector for test data
+# class FunnellingPolylingualClassifier:
+#     """
+#     This classifier projects each document d into a language-independent feature space where each dimension fi is the
+#     decision score phi_l(d,ci) of an auxiliary classifier phi_l trained on category ci for documents in language l;
+#     then trains one single classifier for all documents in this space, irrespective of their originary language
+#     """
+#     def __init__(self, first_tier_learner, meta_learner, first_tier_parameters=None, meta_parameters=None, folded_projections=1,
+#                  calmode='cal', n_jobs=-1):
+#         """
+#         :param first_tier_learner: the learner used in the first-tier level
+#         :param meta_learner: the learner used in the second-tier level
+#         :param first_tier_parameters: parameters for the learner in the doc_projector
+#         :param meta_parameters: parameters for the learner in the z-space
+#         :param folded_projections: if 1 then the model trains the auxiliar classifiers with all training data and
+#         :param calmode: 'cal' to calibrate the base classifiers, 'nocal' to use the decision_function instead, or
+#         :param n_jobs: number of parallel threads
+#         'sigmoid' to use the sigmoid of the decision_function
+#         projects the data before training the final classifier; if greater than one, the training set is split in as
+#         many folds as indicated, and the projected space is composed by concatenating each fold prediction based on
+#         models trained on the remaining folds. This should increase the generality of the space to unseen data.
+#         """
+#         assert folded_projections>0, "positive number of folds expected"
+#         assert calmode in ['cal','nocal','sigmoid'], 'unknown calmode'
+#         assert calmode!='cal' or first_tier_learner.probability, 'calmode=cal requires the learner to have probability=True'
+#
+#         self.fist_tier_learner = first_tier_learner
+#         self.meta_learner = meta_learner
+#         self.fist_tier_parameters=first_tier_parameters
+#         self.meta_parameters = meta_parameters
+#         self.doc_projector = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs)
+#         self.doc_projector_bu = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs)
+#         self.folded_projections = folded_projections
+#         self.n_jobs = n_jobs
+#         self.calmode = calmode
+#
+#     def _projection(self, doc_projector, lX):
+#         """
+#         Decides the projection function to be applied; predict_proba if the base classifiers are calibrated or
+#         decision_function if otherwise
+#         :param doc_projector: the document projector (a NaivePolylingualClassifier)
+#         :param lX: {lang:matrix} to train
+#         :return: the projection, applied with predict_proba or decision_function
+#         """
+#         if self.calmode=='cal':
+#             return doc_projector.predict_proba(lX)
+#         else:
+#             l_decision_scores = doc_projector.decision_function(lX)
+#             if self.calmode=='sigmoid':
+#                 def sigmoid(x): return 1 / (1 + np.exp(-x))
+#                 for lang in l_decision_scores.keys():
+#                     l_decision_scores[lang] = sigmoid(l_decision_scores[lang])
+#             return l_decision_scores
+#
+#     def _get_zspace(self, lXtr, lYtr, lXproj=None, lYproj=None):
+#         """
+#         Produces the vector space of posterior probabilities (in case the first-tier is calibrated) or of
+#         decision scores (if otherwise). This space is here named zspace.
+#         :param lXtr: {lang:matrix} to train
+#         :param lYtr: {lang:labels} to train
+#         :param lXproj: {lang:matrix} to project (if None, then projects the lXtr)
+#         :param lYproj: {lang:labels} to stack in the same order (if None, then lYtr will be stacked)
+#         :return: the projection of lXproj documents into the Z-space defined by the confidence scores of language-specific
+#         models trained on lXtr, and the lYproj labels stacked consistently
+#         """
+#         repair_empty_folds = True
+#         if lXproj is None and lYproj is None:
+#             lXproj, lYproj = lXtr, lYtr
+#             repair_empty_folds = False
+#
+#         print('fitting the projectors... {}'.format(lXtr.keys()))
+#         self.doc_projector.fit(lXtr, lYtr)
+#
+#         print('projecting the documents')
+#         langs = list(lXtr.keys())
+#         lZ = self._projection(self.doc_projector, lXproj)
+#
+#         # if repair_empty_folds: #empty folds are replaced by the posterior probabilities generated by the non-folded version
+#         empty_categories = self.doc_projector.empty_categories
+#         lZ_bu = self._projection(self.doc_projector_bu, lXproj)
+#
+#         for lang in langs:
+#             repair = empty_categories[lang]
+#             lZ[lang][:,repair] = lZ_bu[lang][:,repair]
+#
+#         Z = np.vstack([lZ[lang] for lang in langs]) # Z is the language independent space
+#         zy = np.vstack([lYproj[lang] for lang in langs])
+#         return Z, zy
+#
+#     def _get_zspace_folds(self, lX, ly):
+#         self.doc_projector_bu.fit(lX, ly)
+#
+#         print('split of {} folds'.format(self.folded_projections))
+#         skf = KFold(n_splits=self.folded_projections, shuffle=True)
+#
+#         Z, zy = [], []
+#         lfold = {lang: list(skf.split(lX[lang], ly[lang])) for lang in lX.keys()}
+#         for fold in range(self.folded_projections):
+#             print('fitting the projectors ({}/{})...'.format(fold + 1, self.folded_projections))
+#             lfoldXtr, lfoldYtr = {}, {}
+#             lfoldXte, lfoldYte = {}, {}
+#             for lang in lX.keys():
+#                 train, test = lfold[lang][fold]
+#                 lfoldXtr[lang] = lX[lang][train]
+#                 lfoldYtr[lang] = ly[lang][train]
+#                 lfoldXte[lang] = lX[lang][test]
+#                 lfoldYte[lang] = ly[lang][test]
+#             Zfold, zYfold = self._get_zspace(lfoldXtr, lfoldYtr, lfoldXte, lfoldYte)
+#             Z.append(Zfold)
+#             zy.append(zYfold)
+#         # compose the Z-space as the union of all folded predictions
+#         Z = np.vstack(Z)
+#         zy = np.vstack(zy)
|
# # refit the document projector with all examples to have a more reliable projector for test data
|
||||||
self.doc_projector = self.doc_projector_bu
|
# self.doc_projector = self.doc_projector_bu
|
||||||
return Z, zy
|
# return Z, zy
|
||||||
|
#
|
||||||
def fit(self, lX, ly, lZ=None, lzy=None):
|
# def fit(self, lX, ly, lZ=None, lzy=None):
|
||||||
tinit = time.time()
|
# tinit = time.time()
|
||||||
Z, zy = self._get_zspace(lX, ly) if self.folded_projections == 1 else self._get_zspace_folds(lX, ly)
|
# Z, zy = self._get_zspace(lX, ly) if self.folded_projections == 1 else self._get_zspace_folds(lX, ly)
|
||||||
|
#
|
||||||
#experimental: adds the posterior probabilities (computed outside) to the meta-classifier
|
# #experimental: adds the posterior probabilities (computed outside) to the meta-classifier
|
||||||
if lZ is not None and lzy is not None:
|
# if lZ is not None and lzy is not None:
|
||||||
zlangs = list(lZ.keys())
|
# zlangs = list(lZ.keys())
|
||||||
Z = np.vstack((Z, *[lZ[l] for l in zlangs]))
|
# Z = np.vstack((Z, *[lZ[l] for l in zlangs]))
|
||||||
zy = np.vstack((zy, *[lzy[l] for l in zlangs]))
|
# zy = np.vstack((zy, *[lzy[l] for l in zlangs]))
|
||||||
|
#
|
||||||
print('fitting the Z-space of shape={}'.format(Z.shape))
|
# print('fitting the Z-space of shape={}'.format(Z.shape))
|
||||||
self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters, n_jobs=self.n_jobs)
|
# self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters, n_jobs=self.n_jobs)
|
||||||
self.model.fit(Z, zy)
|
# self.model.fit(Z, zy)
|
||||||
self.time = time.time() - tinit
|
# self.time = time.time() - tinit
|
||||||
|
#
|
||||||
return self
|
# return self
|
||||||
|
#
|
||||||
def predict(self, lX, lZ=None):
|
# def predict(self, lX, lZ=None):
|
||||||
"""
|
# """
|
||||||
:param lX: a dictionary {language_label: X csr-matrix}
|
# :param lX: a dictionary {language_label: X csr-matrix}
|
||||||
:param lZ: a dictionary {language_label: Z matrix}; if specified, concats this representation
|
# :param lZ: a dictionary {language_label: Z matrix}; if specified, concats this representation
|
||||||
:return: a dictionary of predictions
|
# :return: a dictionary of predictions
|
||||||
"""
|
# """
|
||||||
lZ_ = self._projection(self.doc_projector, lX)
|
# lZ_ = self._projection(self.doc_projector, lX)
|
||||||
if lZ is not None:
|
# if lZ is not None:
|
||||||
lZ_ = {**lZ_, **lZ}
|
# lZ_ = {**lZ_, **lZ}
|
||||||
return _joblib_transform_multiling(self.model.predict, lZ_, n_jobs=self.n_jobs)
|
# return _joblib_transform_multiling(self.model.predict, lZ_, n_jobs=self.n_jobs)
|
||||||
|
#
|
||||||
def best_params(self):
|
# def best_params(self):
|
||||||
params = self.doc_projector.best_params()
|
# params = self.doc_projector.best_params()
|
||||||
params['meta'] = self.model.best_params()
|
# params['meta'] = self.model.best_params()
|
||||||
return params
|
# return params
|
||||||
|
|
||||||
|
|
||||||
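# --- Editor's sketch (not part of the original file): the funnelling scheme documented above,
# reduced to plain scikit-learn so the Z-space construction is concrete. All names and the toy
# data are hypothetical; the repository classes add calibration modes, k-fold projections and
# the repair of empty categories on top of this basic idea.
import numpy as np
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier

rng = np.random.RandomState(0)
lX = {'en': rng.rand(40, 50), 'it': rng.rand(40, 60)}         # language-specific feature spaces
ly = {l: (rng.rand(40, 3) > 0.5).astype(int) for l in lX}     # multi-label targets (3 classes)

# first tier: one calibrated (probability=True) classifier per language
first_tier = {l: OneVsRestClassifier(SVC(probability=True)).fit(lX[l], ly[l]) for l in lX}

# Z-space: posterior probabilities are comparable across languages, so they can be stacked
Z = np.vstack([first_tier[l].predict_proba(lX[l]) for l in sorted(lX)])
zy = np.vstack([ly[l] for l in sorted(lX)])

# meta classifier trained on the language-independent Z-space
meta = OneVsRestClassifier(SVC(probability=True)).fit(Z, zy)
pred_it = meta.predict(first_tier['it'].predict_proba(lX['it']))
# --- end of editor's sketch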
class NaivePolylingualClassifier:


@@ -323,410 +323,3 @@ class MonolingualClassifier:

    def best_params(self):
        return self.best_params_

class FunnellingMultimodal(FunnellingPolylingualClassifier):

    def __init__(self,
                 we_path,
                 config,
                 first_tier_learner,
                 meta_learner,
                 first_tier_parameters=None,
                 meta_parameters=None,
                 folded_projections=1,
                 calmode='cal',
                 n_jobs=-1):

        super().__init__(first_tier_learner,
                         meta_learner,
                         first_tier_parameters,
                         meta_parameters,
                         folded_projections,
                         calmode,
                         n_jobs)

        self.pca_independent_space = PCA(n_components=50)
        self.we_path = we_path
        self.config = config
        self.lang_word2idx = dict()
        self.languages = []
        self.lang_tfidf = {}
        self.embedding_space = None
        self.model = None
        self.time = None
        self.best_components = 'not set'  # if auto-optimizing the PCA, it will store the optimal number of components

    def vectorize(self, lX, prediction=False):
        langs = list(lX.keys())
        print(f'# tfidf-vectorizing docs')
        if prediction:
            for lang in langs:
                assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
                tfidf_vectorizer = self.lang_tfidf[lang]
                lX[lang] = tfidf_vectorizer.transform(lX[lang])
            return self

        for lang in langs:
            tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
            self.languages.append(lang)
            tfidf_vectorizer.fit(lX[lang])
            lX[lang] = tfidf_vectorizer.transform(lX[lang])
            self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
            self.lang_tfidf[lang] = tfidf_vectorizer
        return self

    def _get_zspace(self, lXtr, lYtr):
        print('\nfitting the projectors... {}'.format(list(lXtr.keys())))
        self.doc_projector.fit(lXtr, lYtr)

        print('\nprojecting the documents')
        lZ = self._projection(self.doc_projector, lXtr)

        return lZ, lYtr

    def fit(self, lX, ly):
        tinit = time.time()
        print('Vectorizing documents...')
        self.vectorize(lX)

        for lang in self.languages:
            print(f'{lang}->{lX[lang].shape}')

        Z, zy = self._get_zspace(lX, ly)

        if self.config['supervised'] or self.config['unsupervised']:
            self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly)
            _embedding_space = self.embedding_space.transform(self.config, lX)
            if self.config['max_label_space'] == 0:
                _cum_dimension = _embedding_space[list(_embedding_space.keys())[0]].shape[1]
                if _cum_dimension - 300 > 0:
                    _temp = _cum_dimension - 300
                else:
                    _temp = _cum_dimension
                self.best_components = _temp
            # h-stacking posterior probabilities with the (U) and/or (S) matrices
            for lang in self.languages:
                Z[lang] = np.hstack((Z[lang], _embedding_space[lang]))

        # stacking the Z space vertically
        _vertical_Z = np.vstack([Z[lang] for lang in self.languages])
        _vertical_Zy = np.vstack([zy[lang] for lang in self.languages])

        self.standardizer = StandardizeTransformer()
        _vertical_Z = self.standardizer.fit_transform(_vertical_Z)

        # todo testing ...
        # if self.config['post_pca']:
        #     print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
        #     self.pca_independent_space.fit(_vertical_Z)
        #     _vertical_Z = self.pca_independent_space.transform(_vertical_Z)

        print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
        self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
                                           n_jobs=self.n_jobs)
        self.model.fit(_vertical_Z, _vertical_Zy)
        self.time = time.time() - tinit
        print(f'\nTotal training time elapsed: {round((self.time/60), 2)} min')

    def predict(self, lX, ly):
        print('Vectorizing documents')
        self.vectorize(lX, prediction=True)
        lZ = self._projection(self.doc_projector, lX)

        if self.config['supervised'] or self.config['unsupervised']:
            _embedding_space = self.embedding_space.transform(self.config, lX)
            for lang in lX.keys():
                lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang]))

        for lang in lZ.keys():
            print(lZ[lang].shape)
            # todo testing
            lZ[lang] = self.standardizer.transform(lZ[lang])
            # if self.config['post_pca']:
            #     print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
            #     lZ[lang] = self.pca_independent_space.transform(lZ[lang])

        return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
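# --- Editor's sketch (not part of the original file): the view-stacking step used by the class
# above, in isolation. `Z_posteriors` and `Z_embeddings` are hypothetical per-language matrices
# with the same number of rows; the real code obtains them from the first tier and from
# StorageEmbeddings, and the z-scoring is roughly what StandardizeTransformer is used for here.
import numpy as np
from sklearn.preprocessing import StandardScaler

Z_posteriors = np.random.rand(100, 73)    # posterior probabilities (one column per class)
Z_embeddings = np.random.rand(100, 300)   # document-embedding view (MUSE and/or WCE)

Z = np.hstack([Z_posteriors, Z_embeddings])   # horizontal stack of the views
Z = StandardScaler().fit_transform(Z)         # standardize before fitting the meta classifier
# --- end of editor's sketch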
class PolylingualEmbeddingsClassifier:
    """
    This classifier creates document embeddings as a tfidf-weighted average of the polylingual word embeddings
    described in:
    @article{conneau2017word,
        title={Word translation without parallel data},
        author={Conneau, Alexis and Lample, Guillaume and Ranzato, Marc'Aurelio and Denoyer, Ludovic and J{\'e}gou, Herv{\'e}},
        journal={arXiv preprint arXiv:1710.04087},
        year={2017}
    }
    url: https://github.com/facebookresearch/MUSE
    """
    def __init__(self, wordembeddings_path, config, learner, c_parameters=None, n_jobs=-1):
        """
        :param wordembeddings_path: the path to the directory containing the polylingual embeddings
        :param learner: the learner
        :param c_parameters: parameters for the learner
        :param n_jobs: the number of concurrent threads
        """
        self.wordembeddings_path = wordembeddings_path
        self.config = config
        self.learner = learner
        self.c_parameters = c_parameters
        self.n_jobs = n_jobs
        self.lang_tfidf = {}
        self.model = None
        self.languages = []
        self.lang_word2idx = dict()
        self.embedding_space = None

    def fit_vectorizers(self, lX):
        for lang in lX.keys():
            if lang not in self.lang_tfidf:
                tfidf = TfidfVectorizer(sublinear_tf=True, use_idf=True)  # text is already processed
                docs = lX[lang]
                tfidf.fit(docs)
                self.lang_tfidf[lang] = tfidf

    def vectorize(self, lX, prediction=False):
        langs = list(lX.keys())
        print(f'# tfidf-vectorizing docs')
        if prediction:
            for lang in langs:
                assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
                tfidf_vectorizer = self.lang_tfidf[lang]
                lX[lang] = tfidf_vectorizer.transform(lX[lang])
            return self

        for lang in langs:
            tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
            self.languages.append(lang)
            tfidf_vectorizer.fit(lX[lang])
            lX[lang] = tfidf_vectorizer.transform(lX[lang])
            self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
            self.lang_tfidf[lang] = tfidf_vectorizer
        return self

    def embed(self, docs, lang):
        assert lang in self.lang_tfidf, 'unknown language'
        tfidf_vectorizer = self.lang_tfidf[lang]
        V = tfidf_vectorizer.vocabulary_
        Xweights = tfidf_vectorizer.transform(docs)

        print('loading word embeddings for ' + lang)
        we = WordEmbeddings.load(self.wordembeddings_path, lang)

        nD = len(docs)
        doc_vecs = np.zeros((nD, we.dim()))

        for i, doc in enumerate(docs):
            print('\r\tcomplete {:.3f}%'.format(100 * (i + 1) / nD), end='')
            # averaging with tfidf (summing each word only once, since the frequency is already controlled)
            for w in set(doc.split()):
                if w in we and w in V:
                    doc_vecs[i] += (we[w] * Xweights[i, V[w]])
        # works much worse with idf; works much worse with document l2-normalization
        print()

        return doc_vecs

    def fit(self, lX, ly):
        """
        :param lX: a dictionary {language_label: [list of preprocessed documents]}
        :param ly: a dictionary {language_label: ndarray of shape (ndocs, ncats) binary labels}
        :return: self
        """
        tinit = time.time()
        langs = list(lX.keys())
        WEtr, Ytr = [], []
        # self.fit_vectorizers(lX)  # if already fit, does nothing
        self.vectorize(lX)
        # config = {'unsupervised': False, 'supervised': True}
        self.embedding_space = StorageEmbeddings(self.wordembeddings_path).fit(self.config, lX, self.lang_word2idx, ly)
        WEtr = self.embedding_space.transform(self.config, lX)
        # for lang in langs:
        #     WEtr.append(self.embed(lX[lang], lang))  # todo embed with other matrices
        #     Ytr.append(ly[lang])

        WEtr = np.vstack([WEtr[lang] for lang in langs])
        Ytr = np.vstack([ly[lang] for lang in langs])
        self.embed_time = time.time() - tinit

        print('fitting the WE-space of shape={}'.format(WEtr.shape))
        self.model = MonolingualClassifier(base_learner=self.learner, parameters=self.c_parameters, n_jobs=self.n_jobs)
        self.model.fit(WEtr, Ytr)
        self.time = time.time() - tinit
        return self

    def predict(self, lX, lY):
        """
        :param lX: a dictionary {language_label: [list of preprocessed documents]}
        """
        assert self.model is not None, 'predict called before fit'
        self.vectorize(lX, prediction=True)
        langs = list(lX.keys())
        lWEte = self.embedding_space.transform(self.config, lX)
        # lWEte = {lang: self.embed(lX[lang], lang) for lang in langs}  # parallelizing this may consume too much memory
        return _joblib_transform_multiling(self.model.transform, lWEte, n_jobs=self.n_jobs)

    def predict_proba(self, lX):
        """
        :param lX: a dictionary {language_label: [list of preprocessed documents]}
        """
        assert self.model is not None, 'predict called before fit'
        langs = list(lX.keys())
        lWEte = {lang: self.embed(lX[lang], lang) for lang in langs}  # parallelizing this may consume too much memory
        return _joblib_transform_multiling(self.model.predict_proba, lWEte, n_jobs=self.n_jobs)

    def best_params(self):
        return self.model.best_params()
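# --- Editor's sketch (not part of the original file): the tfidf-weighted averaging performed by
# embed() above reduces to a sparse-dense dot product once the vocabulary indices are aligned.
# `X_tfidf` (docs x vocabulary) and `E` (vocabulary x embedding dim) are hypothetical stand-ins
# for the vectorizer output and the MUSE matrix restricted to the tfidf vocabulary.
import numpy as np
from scipy.sparse import random as sparse_random

X_tfidf = sparse_random(10, 500, density=0.05, format='csr')  # tfidf weights per document
E = np.random.rand(500, 300)                                   # one embedding row per vocabulary term
doc_vecs = X_tfidf.dot(E)                                      # (10, 300) weighted-average doc embeddings
# --- end of editor's sketch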
class MonolingualNetSvm:
    """
    testing: funnelling with a NN managing word-embedding compositionality. An ensemble of n SVMs (n equal to the
    number of training languages) is first fit on the data, generating the document projections in the Z-space.
    Next, the projections are fed to a single NN together with their respective document embeddings. The documents
    are projected into the embedding space while preserving their dimensionality (output dim is 300). These
    projections are horizontally concatenated with the respective Z-space projections and passed through a fully
    connected layer with sigmoid activation and output dim equal to the number of target classes.
    # TODO ATM testing with only 1 language
    """
    def __init__(self, lX, ly, first_tier_learner, first_tier_parameters, n_jobs):
        self.lX = lX
        self.ly = ly
        # SVM attributes
        self.doc_projector = NaivePolylingualClassifier(first_tier_learner, first_tier_parameters,
                                                        n_jobs=n_jobs)
        self.calmode = 'cal'
        self.languages = []
        self.lang_word2idx = dict()
        self.lang_tfidf = {}
        self.base_learner = 'TODO'
        self.parameters = 'TODO'
        # NN attributes
        self.NN = 'TODO'

    def load_preprocessed(self):
        """
        In order to speed up the process, documents are first tokenized in the "main". Here, the tokenized docs,
        word_index, and targets are loaded.
        :return: dict[lang] = (word_index, tokenized_docs, targets)
        """
        import pickle
        with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'rb') as f:
            return pickle.load(f)

    def _build_embedding_matrix(self, lang, word_index):
        """
        Builds the embedding matrix by filtering out OOV embeddings.
        :param lang:
        :param word_index:
        :return: filtered embedding matrix
        """
        from embeddings.embeddings import EmbeddingsAligned
        type = 'MUSE'
        path = '/home/andreapdr/CLESA/'
        MUSE = EmbeddingsAligned(type, path, lang, word_index.keys())
        return MUSE

    def get_data_and_embed(self, data_dict):
        from keras.preprocessing.sequence import pad_sequences

        langs = data_dict.keys()
        lang_embedding_matrix = dict()
        nn_lXtr = dict()
        nn_lytr = dict()

        for lang in langs:
            lang_embedding_matrix[lang] = self._build_embedding_matrix(lang, data_dict[lang][0])
            nn_lXtr[lang] = pad_sequences(data_dict[lang][1], 100, padding='post')
            nn_lytr[lang] = [data_dict[lang][2]]

        return nn_lXtr, nn_lytr, lang_embedding_matrix

    def svm_vectorize(self, lX, prediction=False):
        langs = list(lX.keys())
        print(f'# tfidf-vectorizing docs')
        if prediction:
            for lang in langs:
                assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
                tfidf_vectorizer = self.lang_tfidf[lang]
                lX[lang] = tfidf_vectorizer.transform(lX[lang])
            return self
        for lang in langs:
            tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
            self.languages.append(lang)
            tfidf_vectorizer.fit(lX[lang])
            lX[lang] = tfidf_vectorizer.transform(lX[lang])
            self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
            self.lang_tfidf[lang] = tfidf_vectorizer
        return lX

    def _get_zspace(self, lXtr, lYtr):
        print('\nfitting the projectors... {}'.format(list(lXtr.keys())))
        self.doc_projector.fit(lXtr, lYtr)

        print('\nprojecting the documents')
        lZ = self._projection(self.doc_projector, lXtr)

        return lZ, lYtr

    def _projection(self, doc_projector, lX):
        """
        Decides the projection function to be applied: predict_proba if the base classifiers are calibrated,
        decision_function otherwise
        :param doc_projector: the document projector (a NaivePolylingualClassifier)
        :param lX: {lang:matrix} to train
        :return: the projection, applied with predict_proba or decision_function
        """
        if self.calmode == 'cal':
            return doc_projector.predict_proba(lX)
        else:
            l_decision_scores = doc_projector.decision_function(lX)
            if self.calmode == 'sigmoid':
                def sigmoid(x): return 1 / (1 + np.exp(-x))
                for lang in l_decision_scores.keys():
                    l_decision_scores[lang] = sigmoid(l_decision_scores[lang])
            return l_decision_scores

    def fit(self):
        """
        # 1. Fit the SVMs to generate posterior probabilities:
        #    1.1 Gather documents and vectorize them as in the other SVM classifiers
        # 2. Fit the NN:
        #    2.1 Gather documents and build the NN dataset by indexing w.r.t. the embedding matrix
        #    2.2 Fit the NN first layer to generate compositional doc embeddings
        #    2.3 H-stack doc embeddings and posterior P
        #    2.4 Feed the stacked vector to the output layer (sigmoid act): output Nc
        #    2.5 Train it...
        """
        # load pre-processed data
        data_dict = self.load_preprocessed()
        # build embedding matrices and the neural-network document training set
        nn_lXtr, nn_lytr, lang_embedding_matrix = self.get_data_and_embed(data_dict)
        # TF-IDF vectorizing documents for the SVM classifier
        svm_lX = self.svm_vectorize(self.lX)

        # just testing on a smaller subset of data
        test_svm_lX = dict()
        test_svm_ly = dict()
        test_svm_lX['it'] = svm_lX['it'][:10, :]
        test_svm_ly['it'] = self.ly['it'][:10, :]
        test_nn_data = nn_lXtr['it'][:10]

        # projecting documents into the Z-space through the SVMs
        svm_Z, _ = self._get_zspace(test_svm_lX, test_svm_ly)

        # initializing the net and running a forward pass
        net = CNN_pdr(73, 1, 300, len(lang_embedding_matrix['it'].vectors), 300, lang_embedding_matrix['it'].vectors)
        out = net.forward(test_nn_data, svm_Z['it'])

        print('TODO')

    def net(self):
        pass
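# --- Editor's sketch (not part of the original file): a minimal torch module implementing the
# forward pass described in the MonolingualNetSvm docstring (word embeddings composed into a doc
# vector, concatenated with the SVM posteriors, then a fully connected sigmoid output). It is NOT
# the CNN_pdr model referenced above; all sizes are illustrative.
import torch
import torch.nn as nn

class DocPosteriorNet(nn.Module):
    def __init__(self, vocab_size=1000, emb_dim=300, n_classes=73):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.out = nn.Linear(emb_dim + n_classes, n_classes)

    def forward(self, token_ids, posteriors):
        doc_vec = self.emb(token_ids).mean(dim=1)     # compose word embeddings into a doc vector
        h = torch.cat([doc_vec, posteriors], dim=1)   # h-stack doc embedding with posterior P
        return torch.sigmoid(self.out(h))             # output dim = number of target classes

# toy forward pass: a batch of 8 padded documents and their 73-dim posterior vectors
net = DocPosteriorNet()
probs = net(torch.randint(0, 1000, (8, 100)), torch.rand(8, 73))
# --- end of editor's sketch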
@@ -10,7 +10,7 @@ import time
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
from scipy.sparse import issparse, vstack, hstack
-from transformers.StandardizeTransformer import StandardizeTransformer
+from util_transformers.StandardizeTransformer import StandardizeTransformer
from util.SIF_embed import remove_pc
from sklearn.preprocessing import normalize
from sklearn.svm import SVC

@@ -127,22 +127,26 @@ class PosteriorProbabilitiesEmbedder:
        print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} the documents')
        return self.doc_projector.predict_proba(lX)

+    def _get_output_dim(self):
+        return len(self.doc_projector.model['da'].model.classes_)
+

class MuseEmbedder:

-    def __init__(self, path, lV=None, l2=True, n_jobs=-1, featureweight=FeatureWeight()):
+    def __init__(self, path, lV=None, l2=True, n_jobs=-1, featureweight=FeatureWeight(), sif=False):
        self.path=path
        self.lV = lV
        self.l2 = l2
        self.n_jobs = n_jobs
        self.featureweight = featureweight
+        self.sif = sif

    def fit(self, lX, ly, lV=None):
        assert lV is not None or self.lV is not None, 'lV not specified'
        self.langs = sorted(lX.keys())
        self.MUSE = load_muse_embeddings(self.path, self.langs, self.n_jobs)
        lWordList = {l:self._get_wordlist_from_word2index(lV[l]) for l in self.langs}
-        self.MUSE = {l:Muse.extract(lWordList[l]).numpy() for l,Muse in self.MUSE}
+        self.MUSE = {l:Muse.extract(lWordList[l]).numpy() for l,Muse in self.MUSE.items()}
        self.featureweight.fit(lX, ly)
        return self

@@ -150,7 +154,7 @@ class MuseEmbedder:
        MUSE = self.MUSE
        lX = self.featureweight.transform(lX)
        XdotMUSE = Parallel(n_jobs=self.n_jobs)(
-            delayed(XdotM)(lX[lang], MUSE[lang]) for lang in self.langs
+            delayed(XdotM)(lX[lang], MUSE[lang], self.sif) for lang in self.langs
        )
        lMuse = {l: XdotMUSE[i] for i, l in enumerate(self.langs)}
        lMuse = _normalize(lMuse, self.l2)

@@ -162,14 +166,18 @@ class MuseEmbedder:
    def _get_wordlist_from_word2index(self, word2index):
        return list(zip(*sorted(word2index.items(), key=lambda x: x[1])))[0]

+    def _get_output_dim(self):
+        return self.MUSE['da'].shape[1]
+

class WordClassEmbedder:

-    def __init__(self, l2=True, n_jobs=-1, max_label_space=300, featureweight=FeatureWeight()):
+    def __init__(self, l2=True, n_jobs=-1, max_label_space=300, featureweight=FeatureWeight(), sif=False):
        self.n_jobs = n_jobs
        self.l2 = l2
        self.max_label_space=max_label_space
        self.featureweight = featureweight
+        self.sif = sif

    def fit(self, lX, ly, lV=None):
        self.langs = sorted(lX.keys())

@@ -184,7 +192,7 @@ class WordClassEmbedder:
        lWCE = self.lWCE
        lX = self.featureweight.transform(lX)
        XdotWCE = Parallel(n_jobs=self.n_jobs)(
-            delayed(XdotM)(lX[lang], lWCE[lang])for lang in self.langs
+            delayed(XdotM)(lX[lang], lWCE[lang], self.sif)for lang in self.langs
        )
        lwce = {l: XdotWCE[i] for i, l in enumerate(self.langs)}
        lwce = _normalize(lwce, self.l2)

@@ -193,6 +201,9 @@ class WordClassEmbedder:
    def fit_transform(self, lX, ly, lV=None):
        return self.fit(lX, ly).transform(lX)

+    def _get_output_dim(self):
+        return 73
+

class DocEmbedderList:

@@ -201,6 +212,7 @@ class DocEmbedderList:
        if len(embedder_list)==0: embedder_list=[]
        self.embedders = embedder_list
        self.aggregation = aggregation
+        print(f'Aggregation mode: {self.aggregation}')

    def fit(self, lX, ly, lV=None):
        for transformer in self.embedders:

@@ -238,16 +250,25 @@ class DocEmbedderList:
        langs = sorted(lX.keys())

        lZparts = {l: None for l in langs}
+        # min_dim = min([transformer._get_output_dim() for transformer in self.embedders])
+        min_dim = 300
        for transformer in self.embedders:
            lZ = transformer.transform(lX)
+            nC = min([lZ[lang].shape[1] for lang in langs])
            for l in langs:
                Z = lZ[l]
+                if Z.shape[1] > min_dim:
+                    print(f'Space Z matrix has more dimensions ({Z.shape[1]}) than the smallest representation {min_dim}.'
+                          f'Applying PCA(n_components={min_dim})')
+                    pca = PCA(n_components=min_dim)
+                    Z = pca.fit(Z).transform(Z)
                if lZparts[l] is None:
                    lZparts[l] = Z
                else:
                    lZparts[l] += Z

        n_transformers = len(self.embedders)
+        nC = min([lZparts[lang].shape[1] for lang in langs])

        return {l:lZparts[l] / n_transformers for l in langs}

@@ -266,7 +287,7 @@ class FeatureSet2Posteriors:
        self.transformer = transformer
        self.l2=l2
        self.n_jobs = n_jobs
-        self.prob_classifier = MetaClassifier(SVC(kernel='rbf', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
+        self.prob_classifier = MetaClassifier(SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)

    def fit(self, lX, ly, lV=None):
        if lV is None and hasattr(self.transformer, 'lV'):

@@ -412,11 +433,13 @@ def word_class_embedding_matrix(X, Y, max_label_space=300):
    return WCE


-def XdotM(X,M):
+def XdotM(X,M, sif):
    # return X.dot(M)
-    # print(f'X={X.shape}, M={M.shape}')
+    print(f'X={X.shape}, M={M.shape}')
    E = X.dot(M)
-    E = remove_pc(E, npc=1)
+    if sif:
+        print("removing pc...")
+        E = remove_pc(E, npc=1)
    return E


@@ -1,92 +0,0 @@
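# --- Editor's sketch (not part of the original file): what the sif/remove_pc option above amounts
# to, using sklearn instead of util.SIF_embed. It removes the projection of every row onto the
# first principal component of the embedding matrix (the SIF post-processing of Arora et al.);
# the function name and the toy matrix are illustrative only.
import numpy as np
from sklearn.decomposition import TruncatedSVD

def remove_first_pc(E):
    svd = TruncatedSVD(n_components=1, n_iter=7)
    svd.fit(E)
    pc = svd.components_              # shape (1, dim): the common component
    return E - E.dot(pc.T).dot(pc)    # subtract its projection from every embedding

E = np.random.rand(200, 300)
E_clean = remove_first_pc(E)
# --- end of editor's sketch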
from optparse import OptionParser
from util.results import PolylingualClassificationResults
from dataset_builder import MultilingualDataset
from keras.preprocessing.text import Tokenizer
from learning.learners import MonolingualNetSvm
from sklearn.svm import SVC
import pickle

parser = OptionParser()

parser.add_option("-d", "--dataset", dest="dataset",
                  help="Path to the multilingual dataset processed and stored in .pickle format",
                  default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")

parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
                  help="Optimize hyperparameters", default=False)

parser.add_option("-s", "--set_c", dest="set_c", type=float,
                  help="Set the C parameter", default=1)

(op, args) = parser.parse_args()


###################################################################################################################

def get_learner(calibrate=False, kernel='linear'):
    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto')


def get_params(dense=False):
    if not op.optimc:
        return None
    c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
    kernel = 'rbf' if dense else 'linear'
    return [{'kernel': [kernel], 'C': c_range, 'gamma': ['auto']}]


# PREPROCESS TEXT AND SAVE IT ... both for SVM and NN
def preprocess_data(lXtr, lXte, lytr, lyte):
    tokenized_tr = dict()
    tokenized_te = dict()
    for lang in lXtr.keys():
        alltexts = ' '.join(lXtr[lang])
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(alltexts.split(' '))
        tokenizer.oov_token = len(tokenizer.word_index) + 1
        # dumping train set
        sequences_tr = tokenizer.texts_to_sequences(lXtr[lang])
        tokenized_tr[lang] = (tokenizer.word_index, sequences_tr, lytr[lang])
        # dumping test set
        sequences_te = tokenizer.texts_to_sequences(lXte[lang])
        tokenized_te[lang] = (tokenizer.word_index, sequences_te, lyte[lang])

    with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'wb') as f:
        pickle.dump(tokenized_tr, f)

    with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_test.pickle', 'wb') as f:
        pickle.dump(tokenized_tr, f)

    print('Successfully dumped data')

# def load_preprocessed():
#     with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'rb') as f:
#         return pickle.load(f)
#
# def build_embedding_matrix(lang, word_index):
#     type = 'MUSE'
#     path = '/home/andreapdr/CLESA/'
#     MUSE = EmbeddingsAligned(type, path, lang, word_index.keys())
#     return MUSE


########## MAIN #################################################################################################

if __name__ == '__main__':
    results = PolylingualClassificationResults('./results/NN_FPEC_results.csv')
    data = MultilingualDataset.load(op.dataset)
    lXtr, lytr = data.training()
    lXte, lyte = data.test()

    if op.set_c != -1:
        meta_parameters = None
    else:
        meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}]

    test_architecture = MonolingualNetSvm(lXtr,
                                          lytr,
                                          first_tier_learner=get_learner(calibrate=True),
                                          first_tier_parameters=None,
                                          n_jobs=1)

    test_architecture.fit()

@@ -1,6 +1,6 @@
import argparse
import torch.nn as nn
-from torch.optim.lr_scheduler import StepLR
+from torch.optim.lr_scheduler import StepLR, MultiStepLR
from dataset_builder import MultilingualDataset
from learning.transformers import load_muse_embeddings
from models.lstm_class import RNNMultilingualClassifier

@@ -9,8 +9,6 @@ from util.early_stop import EarlyStopping
from util.common import *
from util.file import create_if_not_exist
from time import time
-from embeddings.pretrained import *
-from os.path import join
from tqdm import tqdm
from util.evaluation import evaluate
from util.file import get_file_name

@@ -100,7 +98,7 @@ def main():

    # Loading the dataset
    data = MultilingualDataset.load(opt.dataset)
-    # data.set_view(languages=['de', 'fr', 'sv', 'da', 'es', 'it'])
+    data.set_view(languages=['de', 'fr']) #, 'it', 'en']) # 'sv', 'da', 'es', 'it'])
    data.show_dimensions()
    langs = data.langs()
    l_devel_raw, l_devel_target = data.training(target_as_csr=True)

@@ -108,6 +106,7 @@ def main():

    # Loading the MUSE pretrained embeddings (only if requested)
    lpretrained, lpretrained_vocabulary = load_pretrained_embeddings(opt.we_path, langs)
+    # lpretrained_vocabulary = none_dict(langs) # do not keep track of words known in pretrained embeddings vocabulary that are also present in test set

    # Data preparation: indexing / splitting / embedding matrices (pretrained + supervised) / posterior probs
    multilingual_index = MultilingualIndex()

@@ -115,10 +114,26 @@ def main():
    multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=opt.seed)
    multilingual_index.embedding_matrices(lpretrained, opt.supervised)
    if opt.posteriors:
-        lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=opt.svm_max_docs)
+        lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=5000, store_posteriors=True) #stored_post=True) #opt.svm_max_docs)
    else:
        lPtr, lPva, lPte = None, None, None

+    # just_test = False
+    # if just_test:
+    #
+    #     model = torch.load(
+    #         '../checkpoint/rnn(H512)-Muse-WCE-Posteriors-(trainable)-jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle')
+    #     criterion = torch.nn.BCEWithLogitsLoss().cuda()
+    #
+    #     # batcher_train = Batch(opt.batch_size, batches_per_epoch=10, languages=langs, lpad=multilingual_index.l_pad())
+    #
+    #     batcher_eval = Batch(opt.batch_size, batches_per_epoch=-1, languages=langs, lpad=multilingual_index.l_pad())
+    #     l_test_index = multilingual_index.l_test_index()
+    #     epoch = 1
+    #     tinit = time()
+    #     test(model, batcher_eval, l_test_index, lPte, l_test_target, tinit, epoch, logfile, criterion, 'te')
+    #     exit('Loaded')

    # Model initialization
    model = init_Net(data.num_categories(), multilingual_index)

@@ -130,7 +145,7 @@ def main():

    tinit = time()
    create_if_not_exist(opt.checkpoint_dir)
-    early_stop = EarlyStopping(model, patience=opt.patience, checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}')
+    early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience, checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}')

    l_train_index, l_train_target = multilingual_index.l_train()
    l_val_index, l_val_target = multilingual_index.l_val()

@@ -155,7 +170,6 @@ def main():
            break

    # training is over

    # restores the best model according to the Mf1 of the validation set (only when plotmode==False)
    # stoptime = early_stop.stop_time - tinit
    # stopepoch = early_stop.best_epoch

@@ -164,6 +178,8 @@ def main():
    if opt.plotmode==False:
        print('-' * 80)
        print('Training over. Performing final evaluation')
+        # torch.cuda.empty_cache()
        model = early_stop.restore_checkpoint()

        if opt.val_epochs>0:

@@ -183,10 +199,14 @@ def get_lr(optimizer):


def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile, criterion, optim, epoch, method_name):
+    _dataset_path = opt.dataset.split('/')[-1].split('_')
+    dataset_id = _dataset_path[0] + _dataset_path[-1]
+
    loss_history = []
    model.train()
    for idx, (batch, post, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, lytr)):
        optim.zero_grad()
+        _out = model(batch,post, lang)
        loss = criterion(model(batch, post, lang), target)
        loss.backward()
        clip_gradient(model)

@@ -195,7 +215,7 @@ def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile,

        if idx % opt.log_interval == 0:
            interval_loss = np.mean(loss_history[-opt.log_interval:])
-            print(f'{opt.dataset} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
+            print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')

    mean_loss = np.mean(interval_loss)
    logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)

@@ -203,6 +223,8 @@ def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile,


def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logfile, criterion, measure_prefix):

+    loss_history = []
    model.eval()
    langs = sorted(ltest_index.keys())
    predictions = {l:[] for l in langs}

@@ -214,6 +236,7 @@ def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logf
        prediction = predict(logits)
        predictions[lang].append(prediction)
        yte_stacked[lang].append(target.detach().cpu().numpy())
+        loss_history.append(loss)

    ly = {l:np.vstack(yte_stacked[l]) for l in langs}
    ly_ = {l:np.vstack(predictions[l]) for l in langs}

@@ -224,17 +247,15 @@ def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logf
        metrics.append([macrof1, microf1, macrok, microk])
        if measure_prefix=='te':
            print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
-        # results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
-        #                 (config['max_label_space'], classifier.best_components),
-        #                 config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
-        #                 lang, macrof1, microf1, macrok, microk, '')
    Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
    print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')

-    # logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=tend)
-    # logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mf1, timelapse=tend)
-    # logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-accuracy', value=acc, timelapse=tend)
-    # logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=loss, timelapse=tend)
+    mean_loss = np.mean(loss_history)
+    logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
+    logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit)
+    logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit)
+    logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit)
+    logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit)

    return Mf1


@@ -1,7 +1,7 @@
import os
|
import os
|
||||||
from dataset_builder import MultilingualDataset
|
from dataset_builder import MultilingualDataset
|
||||||
# from learning.learners import *
|
# from learning.learners import *
|
||||||
from learning.learners import FunnellingMultimodal
|
# from learning.learners import FunnellingMultimodal
|
||||||
from learning.transformers import Funnelling, PosteriorProbabilitiesEmbedder, MetaClassifier, \
|
from learning.transformers import Funnelling, PosteriorProbabilitiesEmbedder, MetaClassifier, \
|
||||||
TfidfVectorizerMultilingual, DocEmbedderList, WordClassEmbedder, MuseEmbedder, FeatureSet2Posteriors, Voting
|
TfidfVectorizerMultilingual, DocEmbedderList, WordClassEmbedder, MuseEmbedder, FeatureSet2Posteriors, Voting
|
||||||
from util.evaluation import *
|
from util.evaluation import *
|
||||||
|
|
@ -14,14 +14,14 @@ from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
|
||||||
|
|
||||||
parser = OptionParser()
|
parser = OptionParser()
|
||||||
|
|
||||||
parser.add_option("-d", "--dataset", dest="dataset",
|
# parser.add_option("-d", "--dataset", dest="dataset",
|
||||||
help="Path to the multilingual dataset processed and stored in .pickle format",
|
# help="Path to the multilingual dataset processed and stored in .pickle format",
|
||||||
default="../rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
|
# default="../rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
|
||||||
|
|
||||||
parser.add_option("-o", "--output", dest="output",
|
parser.add_option("-o", "--output", dest="output",
|
||||||
help="Result file", type=str, default='./results/results.csv')
|
help="Result file", type=str, default='./results/results.csv')
|
||||||
|
|
||||||
parser.add_option("-P", "--probs", dest="probs", action='store_true',
|
parser.add_option("-P", "--probs", dest="posteriors", action='store_true',
|
||||||
help="Add posterior probabilities to the document embedding representation", default=False)
|
help="Add posterior probabilities to the document embedding representation", default=False)
|
||||||
|
|
||||||
parser.add_option("-S", "--supervised", dest="supervised", action='store_true',
|
parser.add_option("-S", "--supervised", dest="supervised", action='store_true',
|
||||||
|
|
@ -46,6 +46,9 @@ parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
|
||||||
help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
|
help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
|
||||||
default=300)
|
default=300)
|
||||||
|
|
||||||
|
parser.add_option("-r", "--remove-pc", dest="sif", action='store_true',
|
||||||
|
help="Remove common component when computing dot product of word embedding matrices", default=False)
|
||||||
|
|
||||||
# parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
|
# parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
|
||||||
# help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix."
|
# help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix."
|
||||||
# " If set to 0 it will automatically search for the best number of components", default=300)
|
# " If set to 0 it will automatically search for the best number of components", default=300)
|
||||||
|
|
@ -72,15 +75,18 @@ def get_params(dense=False):
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
(op, args) = parser.parse_args()
|
(op, args) = parser.parse_args()
|
||||||
|
|
||||||
assert exists(op.dataset), 'Unable to find file '+str(op.dataset)
|
assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)'
|
||||||
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
|
dataset = args[0]
|
||||||
assert op.probs or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
|
|
||||||
|
|
||||||
dataset_file = os.path.basename(op.dataset)
|
assert exists(dataset), 'Unable to find file '+str(dataset)
|
||||||
|
 assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
+assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'

+dataset_file = os.path.basename(dataset)

 results = PolylingualClassificationResults(op.output)

-data = MultilingualDataset.load(op.dataset)
+data = MultilingualDataset.load(dataset)
 data.show_dimensions()

 lXtr, lytr = data.training()

@@ -88,8 +94,9 @@ if __name__ == '__main__':

 meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]

-result_id = f'{dataset_file}_Prob{op.probs}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}'
+# result_id = f'{dataset_file}_Prob{op.posteriors}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}'
+result_id = f'{dataset_file}_ProbPost={op.posteriors}_WCE={op.supervised}(PCA={op.max_labels_S})_' \
+            f'MUSE={op.pretrained}_weight={"todo"}_l2={"todo"}_zscore={"todo"}{"_optimC" if op.optimc else ""}'
 print(f'{result_id}')

 # text preprocessing

@@ -100,7 +107,7 @@ if __name__ == '__main__':
 lV = tfidfvectorizer.vocabulary()

 classifiers = []
-if op.probs:
+if op.posteriors:
     classifiers.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None))
 if op.supervised:
     classifiers.append(FeatureSet2Posteriors(WordClassEmbedder(max_label_space=op.max_labels_S)))

@@ -115,13 +122,37 @@ if __name__ == '__main__':
 print('\n# Evaluating ...')
 l_eval = evaluate_method(classifier, lXte, lyte)

+# renaming arguments to be printed on log
+_id = ''
+_id_conf = [op.posteriors, op.supervised, op.pretrained]
+_id_name = ['+P', '+W', '+M']
+for i, conf in enumerate(_id_conf):
+    if conf:
+        _id += _id_name[i]
+_id = _id.lstrip('+')
+_dataset_path = dataset.split('/')[-1].split('_')
+dataset_id = _dataset_path[0] + _dataset_path[-1]

 metrics = []
 for lang in lXte.keys():
     macrof1, microf1, macrok, microk = l_eval[lang]
     metrics.append([macrof1, microf1, macrok, microk])
     print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
-    # results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
-    # (config['max_label_space'], classifier.best_components),
-    # config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
-    # lang, macrof1, microf1, macrok, microk, '')
+    results.add_row(method='Voting',
+                    learner='svm',
+                    optimp=op.optimc,
+                    sif=op.sif,
+                    zscore='todo',
+                    l2='todo',
+                    wescaler='todo',
+                    pca=op.max_labels_S,
+                    id=_id,
+                    dataset=dataset_id,
+                    time='todo',
+                    lang=lang,
+                    macrof1=macrof1,
+                    microf1=microf1,
+                    macrok=macrok,
+                    microk=microk,
+                    notes='')
 print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
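Note: the _id string built above is just a compact tag of which views are active (P = posteriors, W = WCE, M = MUSE). A minimal sketch of the same composition with hypothetical flag values:

    # illustrative only: mirrors the _id construction above with assumed flag values
    posteriors, supervised, pretrained = True, True, False
    _id = ''
    for name, flag in zip(['+P', '+W', '+M'], [posteriors, supervised, pretrained]):
        if flag:
            _id += name
    print(_id.lstrip('+'))  # -> 'P+W'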
@@ -11,7 +11,7 @@ from sklearn.svm import SVC
 parser = OptionParser(usage="usage: %prog datapath [options]")

 parser.add_option("-o", "--output", dest="output",
-                  help="Result file", type=str, default='./results/results.csv')
+                  help="Result file", type=str, default='multiModal_log.csv')

 parser.add_option("-P", "--posteriors", dest="posteriors", action='store_true',
                   help="Add posterior probabilities to the document embedding representation", default=False)

@@ -22,8 +22,8 @@ parser.add_option("-S", "--supervised", dest="supervised", action='store_true',
 parser.add_option("-U", "--pretrained", dest="pretrained", action='store_true',
                   help="Add pretrained MUSE embeddings to the document embedding representation", default=False)

-parser.add_option("--nol2", dest="nol2", action='store_true',
-                  help="Deactivates l2 normalization as a post-processing for the document embedding views", default=False)
+parser.add_option("--l2", dest="l2", action='store_true',
+                  help="Activates l2 normalization as a post-processing for the document embedding views", default=False)

 parser.add_option("--allprob", dest="allprob", action='store_true',
                   help="All views are generated as posterior probabilities. This affects the supervised and pretrained "

@@ -48,11 +48,28 @@ parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
                   help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
                   default=300)

+parser.add_option("-r", "--remove-pc", dest="sif", action='store_true',
+                  help="Remove common component when computing dot product of word embedding matrices", default=False)

+parser.add_option("-z", "--zscore", dest="zscore", action='store_true',
+                  help="Z-score normalize matrices (WCE and MUSE)", default=False)

+parser.add_option("-a", "--agg", dest="agg", action='store_true',
+                  help="Set aggregation function of the common Z-space to average (Default: concatenation)", default=False)

 def get_learner(calibrate=False, kernel='linear'):
     return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, gamma='auto')

+def get_params():
+    if not op.optimc:
+        return None
+    c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
+    kernel = 'rbf'
+    return [{'kernel': [kernel], 'C': c_range, 'gamma': ['auto']}]

 #######################################################################################################################
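Note: get_params() returns a scikit-learn style parameter grid (RBF kernel, a range of C values) only when --optimc is set; otherwise the meta-learner keeps its fixed C. A sketch of how such a grid is typically consumed (the repo's MetaClassifier may run its own search; this is only the generic scikit-learn pattern, with hypothetical meta-level features Z_tr, y_tr):

    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC

    param_grid = [{'kernel': ['rbf'], 'C': [1e4, 1e3, 1e2, 1e1, 1, 1e-1], 'gamma': ['auto']}]
    search = GridSearchCV(SVC(), param_grid, cv=5, n_jobs=-1)
    # search.fit(Z_tr, y_tr)  # Z_tr, y_tr are hypothetical placeholders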
@@ -64,17 +81,23 @@ if __name__ == '__main__':
 assert exists(dataset), 'Unable to find file '+str(dataset)
 assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
 assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
-l2=(op.nol2==False)
+l2=op.l2

 dataset_file = os.path.basename(dataset)

-results = PolylingualClassificationResults(op.output)
+results = PolylingualClassificationResults('../log/' + op.output)
 allprob='Prob' if op.allprob else ''
 result_id = f'{dataset_file}_ProbPost={op.posteriors}_{allprob}WCE={op.supervised}(PCA={op.max_labels_S})_{allprob}' \
-            f'MUSE={op.pretrained}_weight={op.feat_weight}_l2={l2}{"_optimC" if op.optimc else ""}'
+            f'MUSE={op.pretrained}_weight={op.feat_weight}_l2={l2}_zscore={op.zscore}{"_optimC" if op.optimc else ""}'
 print(f'{result_id}')

+# set zscore range - is slice(0,0) mean will be equal to 0 and std to 1, thus normalization will have no effect
+standardize_range = slice(0,0)
+if op.zscore:
+    standardize_range = None

 data = MultilingualDataset.load(dataset)
+# data.set_view(languages=['fr', 'it'])
 data.show_dimensions()
 lXtr, lytr = data.training()
 lXte, lyte = data.test()
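Note: the standardize_range trick above appears to assume that the slice selects which rows are used to estimate the mean and standard deviation for z-scoring, so an empty slice (slice(0,0)) yields mean 0 and std 1, i.e. a no-op. A minimal sketch under that assumption:

    import numpy as np

    def zscore(X, fit_range=None):
        # fit_range selects the rows used to estimate mean/std (assumed semantics)
        fit_rows = X[fit_range] if fit_range is not None else X
        mean = fit_rows.mean(axis=0) if fit_rows.size else 0.
        std = fit_rows.std(axis=0) if fit_rows.size else 1.
        return (X - mean) / std

    X = np.random.rand(10, 3)
    assert np.allclose(zscore(X, fit_range=slice(0, 0)), X)  # empty slice -> identity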
@@ -86,23 +109,23 @@ if __name__ == '__main__':
 feat_weighting = FeatureWeight(op.feat_weight, agg='mean')

 # # document embedding modules
-doc_embedder = DocEmbedderList(aggregation='concat')
+doc_embedder = DocEmbedderList(aggregation='mean' if op.agg else 'concat')
 if op.posteriors:
     doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True, kernel='linear'), l2=l2))
 if op.supervised:
-    wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting)
+    wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, sif=op.sif)
     if op.allprob:
         wce = FeatureSet2Posteriors(wce, l2=l2)
     doc_embedder.append(wce)
 if op.pretrained:
-    muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting)
+    muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif)
     if op.allprob:
         muse = FeatureSet2Posteriors(muse, l2=l2)
     doc_embedder.append(muse)

 # metaclassifier
 meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
-meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'), meta_parameters=meta_parameters)
+meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'), meta_parameters=get_params(), standardize_range=standardize_range)

 # ensembling the modules
 classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)
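Note: with --agg the document views are averaged into the common Z-space instead of concatenated; averaging only makes sense when every view has the same dimensionality (for instance when all views are mapped to posterior probabilities via --allprob). A rough sketch with toy arrays (the shapes are illustrative, not taken from the repo):

    import numpy as np

    view_post = np.random.rand(100, 73)  # e.g. posterior-probability view
    view_wce = np.random.rand(100, 73)   # e.g. WCE view projected to posteriors

    z_concat = np.hstack([view_post, view_wce])      # shape (100, 146)
    z_mean = np.mean([view_post, view_wce], axis=0)  # shape (100, 73)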
@@ -113,13 +136,40 @@ if __name__ == '__main__':
 print('\n# Evaluating ...')
 l_eval = evaluate_method(classifier, lXte, lyte)

+# renaming arguments to be printed on log
+_id = ''
+_id_conf = [op.posteriors, op.supervised, op.pretrained]
+_id_name = ['+P', '+W', '+M']
+for i, conf in enumerate(_id_conf):
+    if conf:
+        _id += _id_name[i]
+_id = _id.lstrip('+')
+_id = _id if not op.agg else _id + '_mean'
+_id = _id if not op.allprob else _id + '_allprob'

+_dataset_path = dataset.split('/')[-1].split('_')
+dataset_id = _dataset_path[0] + _dataset_path[-1]

 metrics = []
 for lang in lXte.keys():
     macrof1, microf1, macrok, microk = l_eval[lang]
     metrics.append([macrof1, microf1, macrok, microk])
     print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
-    # results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
-    # (config['max_label_space'], classifier.best_components),
-    # config['dim_reduction_unsupervised'], op.optimc, dataset.split('/')[-1], classifier.time,
-    # lang, macrof1, microf1, macrok, microk, '')
+    results.add_row(method='MultiModal',
+                    learner='svm',
+                    optimp=op.optimc,
+                    sif=op.sif,
+                    zscore=op.zscore,
+                    l2=op.l2,
+                    wescaler=op.feat_weight,
+                    pca=op.max_labels_S,
+                    id=_id,
+                    dataset=dataset_id,
+                    time='todo',
+                    lang=lang,
+                    macrof1=macrof1,
+                    microf1=microf1,
+                    macrok=macrok,
+                    microk=microk,
+                    notes='')
 print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
@@ -27,7 +27,7 @@ class RNNMultilingualClassifier(nn.Module):
         self.n_layers = 1
         self.n_directions = 1

-        self.dropout = nn.Dropout(0.2)
+        self.dropout = nn.Dropout(0.6)

         lstm_out = 256
         ff1 = 512

@@ -45,7 +45,7 @@ class RNNMultilingualClassifier(nn.Module):
         llearnable_embeddings[l] = learnable_embeddings
         self.embedding_length = embedding_length

-        # self.rnn = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2))
+        # self.lstm = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2))
         self.rnn = nn.GRU(self.embedding_length, hidden_size)
         self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out)
         self.lpretrained_embeddings.update(lpretrained_embeddings)
@@ -0,0 +1,355 @@
+"""
+Test with smaller subset of languages.
+
+1. Load doc (RCV1/2)
+2. Tokenize texts via bertTokenizer (I should already have these dumps)
+3. Construct better Dataloader/Datasets. NB: I need to keep track of the languages only for
+   the testing phase (but who cares actually? If I have to do it for the testing phase, I think
+   it is better to deploy it also in the training phase...)
+4. ...
+5. I have to understand if the pooled hidden state of the last layer is way worse than its averaged
+   version (However, in BertForSeqClassification I guess that the pooled version is passed through
+   the output linear layer in order to get the prediction scores?)
+6. At the same time, I have to build also an end-to-end model in order to fine-tune it. The previous step
+   would be useful when deploying mBert as a View Generator. (Refactor gFun code with view generators?)
+7. ...
+8. Profits
+"""
+from dataset_builder import MultilingualDataset
+from transformers import BertTokenizer, BertForSequenceClassification, AdamW
+from torch.utils.data import Dataset, DataLoader
+import numpy as np
+import torch
+from util.common import clip_gradient, predict
+from time import time
+from util.csv_log import CSVLog
+from util.evaluation import evaluate
+from util.early_stop import EarlyStopping
+from torch.optim.lr_scheduler import StepLR
+from sklearn.model_selection import train_test_split
+import argparse
+
+def get_model(n_out):
+    print('# Initializing model ...')
+    model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=n_out)
+    return model
+
+
+def set_method_name():
+    return 'mBERT'
+
+
+def init_optimizer(model, lr):
+    # return AdamW(model.parameters(), lr=lr, weight_decay=opt.weight_decay)
+    no_decay = ['bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in model.named_parameters()
+                    if not any(nd in n for nd in no_decay)],
+         'weight_decay': opt.weight_decay},
+        {'params': [p for n, p in model.named_parameters()
+                    if any(nd in n for nd in no_decay)],
+         'weight_decay': opt.weight_decay}
+    ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
+    return optimizer
+
+
+def init_logfile(method_name, opt):
+    logfile = CSVLog(opt.log_file, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse'])
+    logfile.set_default('dataset', opt.dataset)
+    logfile.set_default('run', opt.seed)
+    logfile.set_default('method', method_name)
+    assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} and run {opt.seed} already calculated'
+    return logfile
+
+
+def get_lr(optimizer):
+    for param_group in optimizer.param_groups:
+        return param_group['lr']
+
+
+def get_dataset_name(datapath):
+    possible_splits = [str(i) for i in range(10)]
+    splitted = datapath.split('_')
+    id_split = splitted[-1].split('.')[0][-1]
+    if id_split in possible_splits:
+        dataset_name = splitted[0].split('/')[-1]
+        return f'{dataset_name}_run{id_split}'
+
+
+def load_datasets(datapath):
+    data = MultilingualDataset.load(datapath)
+    data.set_view(languages=['nl'])  # Testing with just two langs
+    data.show_dimensions()
+
+    l_devel_raw, l_devel_target = data.training(target_as_csr=False)
+    l_test_raw, l_test_target = data.test(target_as_csr=False)
+
+    return l_devel_raw, l_devel_target, l_test_raw, l_test_target
+
+
+def do_tokenization(l_dataset, max_len=512):
+    print('# Starting Tokenization ...')
+    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
+    langs = l_dataset.keys()
+    l_tokenized = {}
+    for lang in langs:
+        l_tokenized[lang] = tokenizer(l_dataset[lang],
+                                      truncation=True,
+                                      max_length=max_len,
+                                      add_special_tokens=True,
+                                      padding='max_length')
+    return l_tokenized
+
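Note: do_tokenization returns, per language, the standard HuggingFace batch encoding, but only the input_ids are used downstream (the attention mask is never passed to the model). A small sketch of what one entry looks like, with a toy input:

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    batch = tokenizer(['a short english document', 'another one'],
                      truncation=True, max_length=512,
                      add_special_tokens=True, padding='max_length')
    print(sorted(batch.keys()))        # typically ['attention_mask', 'input_ids', 'token_type_ids']
    print(len(batch['input_ids'][0]))  # 512, padded/truncated to max_length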
+class TrainingDataset(Dataset):
+    """
+    data: dict of lang specific tokenized data
+    labels: dict of lang specific targets
+    """
+    def __init__(self, data, labels):
+        self.langs = data.keys()
+        self.lang_ids = {lang:identifier for identifier, lang in enumerate(self.langs)}
+
+        for i, lang in enumerate(self.langs):
+            # print(lang)
+            _data = data[lang]['input_ids']
+            _data = np.array(_data)
+            _labels = labels[lang]
+            _lang_value = np.full(len(_data), self.lang_ids[lang])
+
+            if i == 0:
+                self.data = _data
+                self.labels = _labels
+                self.lang_index = _lang_value
+            else:
+                self.data = np.vstack((self.data, _data))
+                self.labels = np.vstack((self.labels, _labels))
+                self.lang_index = np.concatenate((self.lang_index, _lang_value))
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        x = self.data[idx]
+        y = self.labels[idx]
+        lang = self.lang_index[idx]
+
+        return x, torch.tensor(y, dtype=torch.float), lang
+        # return x, y, lang
+
+    def get_lang_ids(self):
+        return self.lang_ids
+
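Note: TrainingDataset flattens all languages into a single matrix and keeps a parallel lang_index, which is what lets the test routine below regroup predictions per language. A toy usage sketch, assuming the class above is in scope (shapes are hypothetical):

    import numpy as np

    data = {'en': {'input_ids': [[101, 7, 102], [101, 8, 102], [101, 9, 102]]},
            'it': {'input_ids': [[101, 5, 102], [101, 6, 102]]}}
    labels = {'en': np.zeros((3, 4)), 'it': np.ones((2, 4))}

    ds = TrainingDataset(data, labels)
    print(len(ds), ds.lang_index)  # 5 [0 0 0 1 1]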
+def freeze_encoder(model):
+    for param in model.base_model.parameters():
+        param.requires_grad = False
+    return model
+
+
+def check_param_grad_status(model):
+    print('#'*50)
+    print('Model paramater status')
+    for name, child in model.named_children():
+        trainable = False
+        for param in child.parameters():
+            if param.requires_grad:
+                trainable = True
+        if not trainable:
+            print(f'{name} is frozen')
+        else:
+            print(f'{name} is not frozen')
+    print('#'*50)
+
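Note: freeze_encoder freezes the whole base_model (the BERT encoder), leaving only the classification head trainable. A hedged sketch of a finer-grained variant, assuming the HuggingFace BertForSequenceClassification layout where the transformer layers live in model.base_model.encoder.layer:

    def freeze_all_but_top(model, keep_last=2):
        # freeze the encoder, then re-enable the top `keep_last` transformer layers
        for param in model.base_model.parameters():
            param.requires_grad = False
        for layer in model.base_model.encoder.layer[-keep_last:]:
            for param in layer.parameters():
                param.requires_grad = True
        return model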
+def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile):
+    _dataset_path = opt.dataset.split('/')[-1].split('_')
+    # dataset_id = 'RCV1/2_run0_newBert'
+    dataset_id = _dataset_path[0] + _dataset_path[-1]
+
+    loss_history = []
+    model.train()
+
+    for idx, (batch, target, lang_idx) in enumerate(train_dataloader):
+        # optim.zero_grad()
+        out = model(batch.cuda())
+        loss = criterion(out[0], target.cuda())
+        loss.backward()
+        clip_gradient(model)
+        optim.step()
+        loss_history.append(loss.item())
+
+        if idx % opt.log_interval == 0:
+            interval_loss = np.mean(loss_history[-opt.log_interval:])
+            print(
+                f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
+
+    mean_loss = np.mean(interval_loss)
+    logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
+    return mean_loss
+
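Note: in the loop above optim.zero_grad() is commented out, so gradients accumulate across batches between optim.step() calls; if that accumulation is not intended, the call should be restored at the top of each iteration. Also, interval_loss is already a scalar, so mean_loss = np.mean(interval_loss) logs the last printed interval rather than an epoch-level average of loss_history.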
+def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, measure_prefix):
+    print('# Validating model ...')
+    loss_history = []
+    model.eval()
+    langs = lang_ids.keys()
+    id_2_lang = {v:k for k,v in lang_ids.items()}
+    predictions = {l: [] for l in langs}
+    yte_stacked = {l: [] for l in langs}
+
+    for batch, target, lang_idx in test_dataloader:
+        out = model(batch.cuda())
+        logits = out[0]
+        loss = criterion(logits, target.cuda()).item()
+        prediction = predict(logits)
+        loss_history.append(loss)
+
+        # Assigning prediction to dict in predictionS and yte_stacked according to lang_idx
+        for i, pred in enumerate(prediction):
+            lang_pred = id_2_lang[lang_idx.numpy()[i]]
+            predictions[lang_pred].append(pred)
+            yte_stacked[lang_pred].append(target[i].detach().cpu().numpy())
+
+    ly = {l: np.vstack(yte_stacked[l]) for l in langs}
+    ly_ = {l: np.vstack(predictions[l]) for l in langs}
+    l_eval = evaluate(ly, ly_)
+    metrics = []
+    for lang in langs:
+        macrof1, microf1, macrok, microk = l_eval[lang]
+        metrics.append([macrof1, microf1, macrok, microk])
+        if measure_prefix == 'te':
+            print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
+    Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
+    print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
+
+    mean_loss = np.mean(loss_history)
+    logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
+    logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit)
+    logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit)
+    logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit)
+    logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit)
+
+    return Mf1
+
+def get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop, max_val, seed):
+    l_split_va = l_tokenized_tr
+    l_split_val_target = {l: [] for l in l_tokenized_tr.keys()}
+    l_split_tr = l_tokenized_tr
+    l_split_tr_target = {l: [] for l in l_tokenized_tr.keys()}
+
+    for lang in l_tokenized_tr.keys():
+        val_size = int(min(len(l_tokenized_tr[lang]['input_ids']) * val_prop, max_val))
+
+        l_split_tr[lang]['input_ids'], l_split_va[lang]['input_ids'], l_split_tr_target[lang], l_split_val_target[lang] = \
+            train_test_split(l_tokenized_tr[lang]['input_ids'], l_devel_target[lang], test_size=val_size, random_state=seed, shuffle=True)
+
+    return l_split_tr, l_split_tr_target, l_split_va, l_split_val_target
+
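Note: l_split_tr, l_split_va and l_tokenized_tr appear to reference the same dict object, so the two ['input_ids'] assignments in the unpacking write to the same key and the second one wins. If that aliasing is unintended, a sketch of a non-aliasing split (hypothetical helper, same idea of signature):

    import copy
    from sklearn.model_selection import train_test_split

    def split_tr_val(l_tokenized_tr, l_devel_target, val_prop, max_val, seed):
        l_tr, l_va = copy.deepcopy(l_tokenized_tr), copy.deepcopy(l_tokenized_tr)
        l_tr_target, l_va_target = {}, {}
        for lang, tokenized in l_tokenized_tr.items():
            val_size = int(min(len(tokenized['input_ids']) * val_prop, max_val))
            l_tr[lang]['input_ids'], l_va[lang]['input_ids'], l_tr_target[lang], l_va_target[lang] = \
                train_test_split(tokenized['input_ids'], l_devel_target[lang],
                                 test_size=val_size, random_state=seed, shuffle=True)
        return l_tr, l_tr_target, l_va, l_va_target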
+def main():
+    print('Running main ...')
+
+    DATAPATH = opt.dataset
+    method_name = set_method_name()
+    logfile = init_logfile(method_name, opt)
+
+    l_devel_raw, l_devel_target, l_test_raw, l_test_target = load_datasets(DATAPATH)
+    l_tokenized_tr = do_tokenization(l_devel_raw, max_len=512)
+
+    l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop=0.2, max_val=2000, seed=opt.seed)
+
+    l_tokenized_te = do_tokenization(l_test_raw, max_len=512)
+
+    tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target)
+    va_dataset = TrainingDataset(l_split_va, l_split_val_target)
+    te_dataset = TrainingDataset(l_tokenized_te, l_test_target)
+
+    tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True)
+    va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=False)
+    te_dataloader = DataLoader(te_dataset, batch_size=2, shuffle=False)
+
+    # Initializing model
+    model = get_model(73)
+    model = model.cuda()
+    criterion = torch.nn.BCEWithLogitsLoss().cuda()
+    optim = init_optimizer(model, lr=opt.lr)
+    # lr_scheduler = StepLR(optim, step_size=25, gamma=0.5)
+    early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience,
+                               checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_dataset_name(opt.dataset)}')
+    # lr_scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optim, num_warmup_steps= , num_training_steps=)
+    # print(model)
+
+    # Freezing encoder
+    # model = freeze_encoder(model)
+    check_param_grad_status(model)
+
+    # Training loop
+    tinit = time()
+    lang_ids = va_dataset.lang_ids
+    for epoch in range(1, opt.nepochs+1):
+        print('# Start Training ...')
+        train(model, tr_dataloader, epoch, criterion, optim, 'TestingBert', tinit, logfile)
+        # lr_scheduler.step(epoch=None) # reduces the learning rate
+
+        # validation
+        macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va')
+        early_stop(macrof1, epoch)
+        if opt.test_each>0:
+            if (opt.plotmode and (epoch==1 or epoch%opt.test_each==0)) or (not opt.plotmode and epoch%opt.test_each==0 and epoch<opt.nepochs):
+                test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te')
+
+        if early_stop.STOP:
+            print('[early-stop] STOP')
+            if not opt.plotmode:
+                break
+
+    if opt.plotmode==False:
+        print('-' * 80)
+        print('Training over. Performing final evaluation')
+
+        model = early_stop.restore_checkpoint()
+
+        if opt.val_epochs>0:
+            print(f'running last {opt.val_epochs} training epochs on the validation set')
+            for val_epoch in range(1, opt.val_epochs + 1):
+                train(model, va_dataloader, epoch+val_epoch, criterion, optim, 'TestingBert', tinit, logfile)
+
+        # final test
+        print('Training complete: testing')
+        test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te')
+
+    exit('Code Executed!')
+
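Note: get_model(73) hard-codes the RCV1/2 label count. A small sketch of deriving it from the loaded targets instead, assuming the per-language target matrices returned by load_datasets are (n_docs, n_classes) arrays:

    # hypothetical refactor of the two lines above
    n_out = next(iter(l_devel_target.values())).shape[1]
    model = get_model(n_out)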
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Neural text classification with Word-Class Embeddings - mBert model')
+
+    parser.add_argument('--dataset', type=str, default='/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle',
+                        metavar='datasetpath', help=f'path to the pickled dataset')
+    parser.add_argument('--nepochs', type=int, default=200, metavar='int',
+                        help='number of epochs (default: 200)')
+    parser.add_argument('--lr', type=float, default=2e-5, metavar='float',
+                        help='learning rate (default: 2e-5)')
+    parser.add_argument('--weight_decay', type=float, default=0, metavar='float',
+                        help='weight decay (default: 0)')
+    parser.add_argument('--patience', type=int, default=10, metavar='int',
+                        help='patience for early-stop (default: 10)')
+    parser.add_argument('--log-interval', type=int, default=20, metavar='int',
+                        help='how many batches to wait before printing training status')
+    parser.add_argument('--log-file', type=str, default='../log/log_mBert.csv', metavar='str',
+                        help='path to the log csv file')
+    parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)')
+    parser.add_argument('--force', action='store_true', default=False,
+                        help='do not check if this experiment has already been run')
+    parser.add_argument('--checkpoint-dir', type=str, default='../checkpoint', metavar='str',
+                        help='path to the directory containing checkpoints')
+    parser.add_argument('--plotmode', action='store_true', default=False,
+                        help='in plot mode executes a long run in order '
+                             'to generate enough data to produce trend plots (test-each should be >0. This mode is '
+                             'used to produce plots, and does not perform an evaluation on the test set.')
+    parser.add_argument('--test-each', type=int, default=0, metavar='int',
+                        help='how many epochs to wait before invoking test (default: 0, only at the end)')
+    parser.add_argument('--val-epochs', type=int, default=1, metavar='int',
+                        help='number of training epochs to perform on the validation set once training is over (default 1)')
+    opt = parser.parse_args()
+
+    # Testing different parameters ...
+    opt.weight_decay = 0.01
+    opt.patience = 5
+
+    main()
+    # TODO: refactor .cuda() -> .to(device) in order to check if the process is faster on CPU given the bigger batch size
@@ -1,7 +1,11 @@
 import pandas as pd
 import numpy as np

-df = pd.read_csv("/home/andreapdr/funneling_pdr/src/results/results.csv", delimiter='\t')
-pivot = pd.pivot_table(df, values=['time', 'macrof1', 'microf1', 'macrok', 'microk'], index=['method', 'embed'], aggfunc=[np.mean, np.std])
-print(pivot)
+# df = pd.read_csv("/home/andreapdr/funneling_pdr/src/results/final_results.csv", delimiter='\t')
+df = pd.read_csv("10run_rcv_final_results.csv", delimiter='\t')
+pivot = pd.pivot_table(df, values=['macrof1', 'microf1', 'macrok', 'microk'], index=['method', 'id', 'optimp', 'zscore', 'l2', 'wescaler', 'pca', 'sif'], aggfunc=[np.mean, np.std])
+with pd.option_context('display.max_rows', None):
+    print(pivot.round(3))
 print('Finished ...')
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
+logfile=../log/log_Mbert_rcv.csv
+
+runs='0 1 2 3 4 5 6 7 8 9'
+for run in $runs
+do
+    dataset=$dataset_path$run.pickle
+    python new_mbert.py --dataset $dataset --log-file $logfile --test-each 20
+done
@@ -17,7 +17,7 @@ def get_weighted_average(We, x, w):

 def compute_pc(X,npc=1):
     """
-    Compute the principal components. DO NOT MAKE THE DATA ZERO MEAN!
+    Compute the principal components.
     :param X: X[i,:] is a data point
     :param npc: number of principal components to remove
     :return: component_[i,:] is the i-th pc
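Note: compute_pc is the usual SIF-style helper: it estimates the top principal component(s), which are then projected out of the embedding matrix ("remove common component", cf. the -r/--remove-pc option above). A sketch of that pair as it is commonly implemented; it may differ in detail from this repo's version:

    import numpy as np
    from sklearn.decomposition import TruncatedSVD

    def compute_pc_sketch(X, npc=1):
        svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
        svd.fit(X)              # note: no centering, X is used as-is
        return svd.components_  # shape (npc, dim)

    def remove_pc_sketch(X, npc=1):
        pc = compute_pc_sketch(X, npc)
        return X - X.dot(pc.T).dot(pc)  # project out the common component(s)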
@@ -1,4 +1,5 @@
 import warnings
+import time
 from sklearn.svm import SVC
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.model_selection import train_test_split

@@ -143,6 +144,15 @@ class Index:

         embedding_parts.append(F)

+        make_dumps = False
+        if make_dumps:
+            print(f'Dumping Embedding Matrices ...')
+            import pickle
+            with open(f'../dumps/dump_{self.lang}_rcv.pkl', 'wb') as outfile:
+                pickle.dump((self.lang, embedding_parts, self.word2index), outfile)
+            with open(f'../dumps/corpus_{self.lang}_rcv.pkl', 'wb') as outfile2:
+                pickle.dump((self.lang, self.devel_raw, self.devel_target), outfile2)
+
         self.embedding_matrix = torch.cat(embedding_parts, dim=1)

         print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]')

@@ -155,6 +165,7 @@ class MultilingualIndex:
     def __init__(self): #, add_language_trace=False):
         self.l_index = {}
         self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
+        # self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True, max_features=25000)
         # self.add_language_trace=add_language_trace

     def index(self, l_devel_raw, l_devel_target, l_test_raw, l_pretrained_vocabulary):

@@ -189,30 +200,42 @@ class MultilingualIndex:
         # pretrained_embeddings = torch.cat([pretrained_embeddings, lang_trace], dim=1)

-    def posterior_probabilities(self, max_training_docs_by_lang=5000):
+    def posterior_probabilities(self, max_training_docs_by_lang=5000, store_posteriors=False, stored_post=False):
         # choose a maximum of "max_training_docs_by_lang" for training the calibrated SVMs
+        timeit = time.time()
         lXtr = {l:Xtr for l,Xtr in self.get_lXtr().items()}
         lYtr = {l:Ytr for l,Ytr in self.l_train_target().items()}
-        for l in self.langs:
-            n_elements = lXtr[l].shape[0]
-            if n_elements > max_training_docs_by_lang:
-                choice = np.random.permutation(n_elements)[:max_training_docs_by_lang]
-                lXtr[l] = lXtr[l][choice]
-                lYtr[l] = lYtr[l][choice]
+        if not stored_post:
+            for l in self.langs:
+                n_elements = lXtr[l].shape[0]
+                if n_elements > max_training_docs_by_lang:
+                    choice = np.random.permutation(n_elements)[:max_training_docs_by_lang]
+                    lXtr[l] = lXtr[l][choice]
+                    lYtr[l] = lYtr[l][choice]

         # train the posterior probabilities embedder
         print('[posteriors] training a calibrated SVM')
         learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto')
         prob_embedder = PosteriorProbabilitiesEmbedder(learner, l2=False)
         prob_embedder.fit(lXtr, lYtr)

         # transforms the training, validation, and test sets into posterior probabilities
         print('[posteriors] generating posterior probabilities')
         lPtr = prob_embedder.transform(self.get_lXtr())
         lPva = prob_embedder.transform(self.get_lXva())
         lPte = prob_embedder.transform(self.get_lXte())
+        # NB: Check splits indices !
-        print('[posteriors] done')
+        if store_posteriors:
+            import pickle
+            with open('../dumps/posteriors_fulljrc.pkl', 'wb') as outfile:
+                pickle.dump([lPtr, lPva, lPte], outfile)
+                print(f'Successfully dumped posteriors!')
+        else:
+            import pickle
+            with open('../dumps/posteriors_fulljrc.pkl', 'rb') as infile:
+                lPtr, lPva, lPte = pickle.load(infile)
+                print(f'Successfully loaded stored posteriors!')
+        print(f'[posteriors] done in {time.time() - timeit}')
         return lPtr, lPva, lPte

     def get_lXtr(self):
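Note: the posterior cache above is read from and written to a fixed path ('../dumps/posteriors_fulljrc.pkl'), so posteriors dumped for one dataset or run could silently be reused for another. A sketch of a dataset-keyed cache path (hypothetical helper):

    import os

    def posterior_cache_path(dataset_path, cache_dir='../dumps'):
        name = os.path.splitext(os.path.basename(dataset_path))[0]
        return os.path.join(cache_dir, f'posteriors_{name}.pkl')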
@@ -6,7 +6,7 @@ from util.file import create_if_not_exist

 class EarlyStopping:

-    def __init__(self, model, patience=20, verbose=True, checkpoint='./checkpoint.pt'):
+    def __init__(self, model, optimizer, patience=20, verbose=True, checkpoint='./checkpoint.pt'):
         # set patience to 0 or -1 to avoid stopping, but still keeping track of the best value and model parameters
         self.patience_limit = patience
         self.patience = patience

@@ -16,9 +16,10 @@ class EarlyStopping:
         self.stop_time = None
         self.checkpoint = checkpoint
         self.model = model
+        self.optimizer = optimizer
         self.STOP = False

-    def __call__(self, watch_score, epoch):
+    def __call__(self, watch_score, epoch): #model

         if self.STOP: return #done

@@ -29,6 +30,9 @@ class EarlyStopping:
             if self.checkpoint:
                 self.print(f'[early-stop] improved, saving model in {self.checkpoint}')
                 torch.save(self.model, self.checkpoint)
+                # with open(self.checkpoint)
+                # torch.save({'state_dict': self.model.state_dict(),
+                #             'optimizer_state_dict': self.optimizer.state_dict()}, self.checkpoint)
             else:
                 self.print(f'[early-stop] improved')
             self.patience = self.patience_limit

@@ -46,6 +50,7 @@ class EarlyStopping:
             self.patience=self.patience_limit

     def restore_checkpoint(self):
+        print(f'restoring best model from epoch {self.best_epoch}...')
         return torch.load(self.checkpoint)

     def print(self, msg):
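Note: the commented lines added in __call__ point towards saving state_dicts (model plus optimizer) instead of pickling the whole module with torch.save(self.model, ...). A sketch of that standard PyTorch pattern (restoring then requires the model and optimizer objects to load into):

    import torch

    def save_checkpoint(model, optimizer, path):
        torch.save({'state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict()}, path)

    def restore_checkpoint(model, optimizer, path):
        ckpt = torch.load(path)
        model.load_state_dict(ckpt['state_dict'])
        optimizer.load_state_dict(ckpt['optimizer_state_dict'])
        return model, optimizer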
@@ -5,8 +5,23 @@ import numpy as np

 class PolylingualClassificationResults:
     def __init__(self, file, autoflush=True, verbose=False):
         self.file = file
-        self.columns = ['method', 'learner', 'embed', 'embed_type', 'pca_s', 'pca_u', 'optimp', 'dataset', 'time',
-                        'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
+        self.columns = ['method',
+                        'learner',
+                        'optimp',
+                        'sif',
+                        'zscore',
+                        'l2',
+                        'wescaler',
+                        'pca',
+                        'id',
+                        'dataset',
+                        'time',
+                        'lang',
+                        'macrof1',
+                        'microf1',
+                        'macrok',
+                        'microk',
+                        'notes']
         self.autoflush = autoflush
         self.verbose = verbose
         if os.path.exists(file):

@@ -21,8 +36,8 @@ class PolylingualClassificationResults:
     def already_calculated(self, id):
         return (self.df['id'] == id).any()

-    def add_row(self, method, learner, embed, embed_type, pca_s, pca_u, optimp, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
-        s = pd.Series([method, learner, embed, embed_type, pca_s, pca_u, optimp, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
+    def add_row(self, method, learner, optimp, sif, zscore, l2, wescaler, pca, id, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
+        s = pd.Series([method, learner, optimp, sif, zscore, l2, wescaler, pca, id, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
         self.df = self.df.append(s, ignore_index=True)
         if self.autoflush: self.flush()
         self.tell(s.to_string())
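Note: DataFrame.append, used in add_row above, was deprecated and later removed in pandas 2.x; an equivalent row insertion with pd.concat would look like this sketch:

    import pandas as pd

    def append_row(df, row: pd.Series) -> pd.DataFrame:
        return pd.concat([df, row.to_frame().T], ignore_index=True)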