rsc branch;

load pre-computed VGs' output if already stored on disk
andrea 2020-11-19 14:30:10 +01:00
parent 8af763b130
commit 515acae15b
4 changed files with 92 additions and 58 deletions
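The caching added to the view generators' fit()/transform() methods follows one pattern throughout the diff: skip fitting when a dump directory already exists, and in transform() either load the pickled result or compute it, dump it, and return it. A minimal sketch of that pattern, using the names that appear in the diff (illustration only, not the exact project code):

import os
import pickle

def cached_transform(compute, storing_path, split, dump_name):
    # split is 'tr' on the training pass and 'te' on the test pass;
    # dump_name is 'X.pickle' for the posterior-probabilities VG, '<method_id>.pickle' otherwise
    actual_path = os.path.join(storing_path, split)
    dump_file = os.path.join(actual_path, dump_name)
    if os.path.exists(dump_file):
        # pre-computed results found on disk: load and return them
        print('NB: loading pre-computed results!')
        with open(dump_file, 'rb') as infile:
            return pickle.load(infile)
    lZ = compute()  # e.g. predict_proba(lX) followed by L2 normalization
    os.makedirs(actual_path, exist_ok=True)
    with open(dump_file, 'wb') as outfile:
        pickle.dump(lZ, outfile)
    return lZ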

File 1 of 4 (view generator classes)

@@ -13,9 +13,10 @@ from scipy.sparse import csr_matrix
 from models.mBert import *
 from models.lstm_class import *
 from util.csv_log import CSVLog
-from util.file import get_file_name
+from util.file import get_file_name, create_if_not_exist, exists
 from util.early_stop import EarlyStopping
 from util.common import *
+import pickle
 import time
@@ -54,7 +55,6 @@ class FeatureWeight:
             elif self.agg == 'mean':
                 F = tsr_matrix.mean(axis=0)
             self.lF[l] = F
         self.fitted = True
         return self
@@ -71,7 +71,7 @@ class FeatureWeight:
 class PosteriorProbabilitiesEmbedder:

-    def __init__(self, first_tier_learner, first_tier_parameters=None, l2=True, n_jobs=-1):
+    def __init__(self, first_tier_learner, first_tier_parameters=None, l2=True, n_jobs=-1, is_training=True, storing_path='../dumps/'):
         self.fist_tier_learner = first_tier_learner
         self.fist_tier_parameters = first_tier_parameters
         self.l2 = l2
@@ -80,8 +80,13 @@ class PosteriorProbabilitiesEmbedder:
             self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs
         )
         self.requires_tfidf = True
+        self.storing_path = storing_path
+        self.is_training = is_training

     def fit(self, lX, lY, lV=None, called_by_viewgen=False):
+        if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'):
+            print(f'NB: Avoid fitting {self.storing_path.split("/")[2]} since we have already pre-computed results')
+            return self
         if not called_by_viewgen:
             # Avoid printing if method is called by another View Gen (e.g., GRU ViewGen)
             print('### Posterior Probabilities View Generator (X)')
@@ -90,8 +95,22 @@ class PosteriorProbabilitiesEmbedder:
         return self

     def transform(self, lX):
+        # if dir exist, load and return already computed results
+        _endpoint = 'tr' if self.is_training else 'te'
+        _actual_path = self.storing_path + '/' + _endpoint
+        if exists(_actual_path):
+            print('NB: loading pre-computed results!')
+            with open(_actual_path + '/X.pickle', 'rb') as infile:
+                self.is_training = False
+                return pickle.load(infile)
         lZ = self.predict_proba(lX)
         lZ = _normalize(lZ, self.l2)
+        # create dir and dump computed results
+        create_if_not_exist(_actual_path)
+        with open(_actual_path + '/X.pickle', 'wb') as outfile:
+            pickle.dump(lZ, outfile)
+        self.is_training = False
         return lZ

     def fit_transform(self, lX, ly=None, lV=None):
@@ -105,10 +124,8 @@ class PosteriorProbabilitiesEmbedder:
     def predict_proba(self, lX, ly=None):
         print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} documents')
-        return self.doc_projector.predict_proba(lX)
+        lZ = self.doc_projector.predict_proba(lX)
+        return lZ

-    def _get_output_dim(self):
-        return len(self.doc_projector.model['da'].model.classes_)

 class MuseEmbedder:
@@ -222,8 +239,8 @@ class MBertEmbedder:
         tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target)
         va_dataset = TrainingDataset(l_split_va, l_split_val_target)
-        tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True)
-        va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=True)
+        tr_dataloader = DataLoader(tr_dataset, batch_size=64, shuffle=True)
+        va_dataloader = DataLoader(va_dataset, batch_size=64, shuffle=True)

         nC = tr_dataset.get_nclasses()
         model = get_model(nC)
@@ -272,7 +289,7 @@ class MBertEmbedder:
         l_tokenized_X = do_tokenization(lX, max_len=512, verbose=True)
         feat_dataset = ExtractorDataset(l_tokenized_X)
         feat_lang_ids = feat_dataset.lang_ids
-        dataloader = DataLoader(feat_dataset, batch_size=64)
+        dataloader = DataLoader(feat_dataset, batch_size=64)  # TODO reduced batch size in JRC experiments
         all_batch_embeddings, id2lang = feature_extractor(dataloader, feat_lang_ids, self.model)
         return all_batch_embeddings
@@ -326,15 +343,8 @@ class RecurrentEmbedder:
         self.early_stop = EarlyStopping(self.model, optimizer=self.optim, patience=self.patience,
                                         checkpoint=f'{self.checkpoint_dir}/gru_viewgen_-{get_file_name(self.options.dataset)}')

-    def fit(self, lX, ly, lV=None, batch_size=128, nepochs=200, val_epochs=1):
-        # Init SVM in order to recast (vstacked) document embeddings to vectors of Posterior Probabilities
-        self.posteriorEmbedder = MetaClassifier(
-            SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=options.n_jobs)
+    def fit(self, lX, ly, lV=None, batch_size=64, nepochs=200, val_epochs=1):
         print('### Gated Recurrent Unit View Generator (G)')
-        # self.multilingual_index.get_indexed(lX, self.lpretrained_vocabulary)
-        # could be better to init model here at first .fit() call!
         if self.model is None:
             print('TODO: Init model!')
         if not self.is_trained:
@@ -358,7 +368,7 @@ class RecurrentEmbedder:
                           tinit=tinit, logfile=logfile, criterion=self.criterion, optim=self.optim,
                           epoch=epoch, method_name=method_name, opt=self.options, ltrain_posteriors=None,
                           ltrain_bert=None)
-                self.lr_scheduler.step()  # reduces the learning rate # TODO arg epoch?
+                self.lr_scheduler.step()

                 # validation step
                 macrof1 = test_gru(self.model, batcher_eval, l_val_index, None, None, l_val_target, tinit, epoch,
@@ -384,21 +394,15 @@ class RecurrentEmbedder:
                      ltrain_bert=None)
         self.is_trained = True
-        # Generate document embeddings in order to fit an SVM to recast them as vector for Posterior Probabilities
-        # lX = self._get_doc_embeddings(lX)
-        lX = self._get_doc_embeddings(self.multilingual_index.l_devel_index())
-        # Fit a ''multi-lingual'' SVM on the generated doc embeddings
-        self.posteriorEmbedder.fit(lX, ly)
         return self

     def transform(self, lX, batch_size=64):
         lX = self.multilingual_index.get_indexed(lX, self.lpretrained_vocabulary)
         lX = self._get_doc_embeddings(lX)
-        return self.posteriorEmbedder.predict_proba(lX)
+        return lX

     def fit_transform(self, lX, ly, lV=None):
-        # TODO
-        return 0
+        return self.fit(lX, ly).transform(lX)

     def _get_doc_embeddings(self, lX, batch_size=64):
         assert self.is_trained, 'Model is not trained, cannot call transform before fitting the model!'
@@ -418,7 +422,7 @@ class RecurrentEmbedder:
     # loads the MUSE embeddings if requested, or returns empty dictionaries otherwise
     def _load_pretrained_embeddings(self, we_path, langs):
-        lpretrained = lpretrained_vocabulary = self._none_dict(langs)  # TODO ?
+        lpretrained = lpretrained_vocabulary = self._none_dict(langs)
         lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1)
         lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs}
         return lpretrained, lpretrained_vocabulary
@@ -495,26 +499,15 @@ class DocEmbedderList:
             return self.embedders[0].transform(lX)
         langs = sorted(lX.keys())
         lZparts = {l: None for l in langs}
-        # min_dim = min([transformer._get_output_dim() for transformer in self.embedders])
-        min_dim = 73  # TODO <---- this should be the number of target classes

         for transformer in self.embedders:
             _lX = lX
             if transformer.requires_tfidf:
                 _lX = tfidf
             lZ = transformer.transform(_lX)
-            nC = min([lZ[lang].shape[1] for lang in langs])
             for l in langs:
                 Z = lZ[l]
-                if Z.shape[1] > min_dim:
-                    print(
-                        f'Space Z matrix has more dimensions ({Z.shape[1]}) than the smallest representation {min_dim}.'
-                        f'Applying PCA(n_components={min_dim})')
-                    pca = PCA(n_components=min_dim)
-                    Z = pca.fit(Z).transform(Z)
                 if lZparts[l] is None:
                     lZparts[l] = Z
                 else:
@@ -535,7 +528,7 @@ class DocEmbedderList:
 class FeatureSet2Posteriors:

-    def __init__(self, transformer, requires_tfidf=False, l2=True, n_jobs=-1):
+    def __init__(self, transformer, method_id, requires_tfidf=False, l2=True, n_jobs=-1, storing_path='../dumps/'):
         self.transformer = transformer
         self.l2 = l2
         self.n_jobs = n_jobs
@@ -543,7 +536,15 @@ class FeatureSet2Posteriors:
             SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
         self.requires_tfidf = requires_tfidf
+        self.storing_path = storing_path
+        self.is_training = True
+        self.method_id = method_id

     def fit(self, lX, ly, lV=None):
+        if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'):
+            print(f'NB: Avoid fitting {self.storing_path.split("/")[2]} since we have already pre-computed results')
+            return self
         if lV is None and hasattr(self.transformer, 'lV'):
             lV = self.transformer.lV
         lZ = self.transformer.fit_transform(lX, ly, lV)
@@ -551,8 +552,22 @@ class FeatureSet2Posteriors:
         return self

     def transform(self, lX):
+        # if dir exist, load and return already computed results
+        _endpoint = 'tr' if self.is_training else 'te'
+        _actual_path = self.storing_path + '/' + _endpoint
+        if exists(_actual_path):
+            print('NB: loading pre-computed results!')
+            with open(_actual_path + '/' + self.method_id + '.pickle', 'rb') as infile:
+                self.is_training = False
+                return pickle.load(infile)
         lP = self.predict_proba(lX)
         lP = _normalize(lP, self.l2)
+        # create dir and dump computed results
+        create_if_not_exist(_actual_path)
+        with open(_actual_path + '/' + self.method_id + '.pickle', 'wb') as outfile:
+            pickle.dump(lP, outfile)
+        self.is_training = False
         return lP

     def fit_transform(self, lX, ly, lV):
@@ -691,7 +706,7 @@ def word_class_embedding_matrix(X, Y, max_label_space=300):
 def XdotM(X, M, sif):
     E = X.dot(M)
     if sif:
-        print("removing pc...")
+        # print("removing pc...")
         E = remove_pc(E, npc=1)
     return E
@@ -714,7 +729,7 @@ class BatchGRU:
     def batchify(self, l_index, l_post, l_bert, llabels, extractor=False):
         langs = self.languages
-        l_num_samples = {l:len(l_index[l]) for l in langs}
+        l_num_samples = {l: len(l_index[l]) for l in langs}
         max_samples = max(l_num_samples.values())
         n_batches = max_samples // self.batchsize + 1 * (max_samples % self.batchsize > 0)
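Note on the is_training flag introduced above: it starts at True, so the first transform() call after fitting is treated as the training pass and uses the 'tr' dump, after which the flag flips to False and later calls use the 'te' dump. A usage sketch of the intended call order (names follow the diff; the storing_path value is a made-up example):

embedder = PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True),
                                          storing_path='../dumps/X/my_dataset')
embedder.fit(lXtr, lytr)
lZtr = embedder.transform(lXtr)  # reads or writes ../dumps/X/my_dataset/tr/X.pickle, then is_training -> False
lZte = embedder.transform(lXte)  # reads or writes ../dumps/X/my_dataset/te/X.pickle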

File 2 of 4 (main script)

@@ -36,7 +36,7 @@ if __name__ == '__main__':
     # load dataset
     data = MultilingualDataset.load(dataset)
-    # data.set_view(languages=['nl', 'it'])  # TODO: DEBUG SETTING
+    data.set_view(languages=['nl', 'it'])  # TODO: DEBUG SETTING
     data.show_dimensions()
     lXtr, lytr = data.training()
     lXte, lyte = data.test()
@@ -56,18 +56,26 @@ if __name__ == '__main__':
         View Generator (-X): cast document representations encoded via TFIDF into posterior probabilities by means
         of a set of SVM.
         """
+        # Check if we already have VG outputs from previous runs
+        VG_name = 'X'
+        storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
+        exist = exists(storing_path)
         doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True,
                                                                                           kernel='linear',
-                                                                                          C=op.set_c), l2=l2))
+                                                                                          C=op.set_c),
+                                                           l2=l2, storing_path=storing_path))

     if op.supervised:
         """
         View Generator (-W): generates document representation via Word-Class-Embeddings.
         Document embeddings are obtained via weighted sum of document's constituent embeddings.
         """
+        VG_name = 'W'
+        storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
+        exist = exists(storing_path)
         wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, sif=op.sif)
         if op.allprob:
-            wce = FeatureSet2Posteriors(wce, requires_tfidf=True, l2=l2)
+            wce = FeatureSet2Posteriors(wce, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path)
         doc_embedder.append(wce)

     if op.pretrained:
@@ -75,30 +83,41 @@ if __name__ == '__main__':
         View Generator (-M): generates document representation via MUSE embeddings (Fasttext multilingual word
         embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings.
         """
+        VG_name = 'M'
+        storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
+        exist = exists(storing_path)
         muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif)
         if op.allprob:
-            muse = FeatureSet2Posteriors(muse, requires_tfidf=True, l2=l2)
+            muse = FeatureSet2Posteriors(muse, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path)
         doc_embedder.append(muse)

     if op.gruViewGenerator:
         """
         View Generator (-G): generates document embedding by means of a Gated Recurrent Units. The model can be
-        initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,). Such
-        document embeddings are then casted into vectors of posterior probabilities via a set of SVM.
-        NB: --allprob won't have any effect on this View Gen since output is already encoded as post prob
+        initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,).
+        Output dimension is (n_docs, 512). If --allprob output will be casted to posterior prob space via SVM.
         """
+        VG_name = 'G'
+        VG_name += '_muse' if op.gruMUSE else ''
+        VG_name += '_wce' if op.gruWCE else ''
+        storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
         rnn_embedder = RecurrentEmbedder(pretrained=op.gruMUSE, supervised=op.gruWCE, multilingual_dataset=data,
                                          options=op, model_path=op.gru_path)
+        if op.allprob:
+            rnn_embedder = FeatureSet2Posteriors(rnn_embedder, method_id=VG_name, requires_tfidf=False,
+                                                 storing_path=storing_path)
         doc_embedder.append(rnn_embedder)

     if op.mbert:
         """
         View generator (-B): generates document embedding via mBERT model.
         """
-        mbert = MBertEmbedder(path_to_model=op.bert_path,
-                              nC=data.num_categories())
+        VG_name = 'B'
+        storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
+        mbert = MBertEmbedder(path_to_model=op.bert_path, nC=data.num_categories())
         if op.allprob:
-            mbert = FeatureSet2Posteriors(mbert, l2=l2)
+            mbert = FeatureSet2Posteriors(mbert, method_id=VG_name, l2=l2, storing_path=storing_path)
         doc_embedder.append(mbert)

     # metaclassifier
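With the storing_path values built above, the dumps land in a per view generator, per dataset tree along these lines (inferred from the f-strings and the 'tr'/'te' split names used in transform(); <dataset> is the dataset file name without its extension):

../dumps/
    X/<dataset>/{tr,te}/X.pickle            # posterior probabilities VG
    W/<dataset>/{tr,te}/W.pickle            # word-class embeddings VG (via FeatureSet2Posteriors)
    M/<dataset>/{tr,te}/M.pickle            # MUSE VG
    G_muse/<dataset>/{tr,te}/G_muse.pickle  # GRU VG; suffix depends on --gruMUSE / --gruWCE
    B/<dataset>/{tr,te}/B.pickle            # mBERT VG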

File 3 of 4 (mBERT module)

@@ -5,6 +5,7 @@ from transformers import BertForSequenceClassification, BertTokenizer, AdamW, Be
 from sklearn.model_selection import train_test_split
 from util.evaluation import *
 from time import time
+from util.common import show_gpu

 def predict(logits, classification_type='multilabel'):
@@ -21,7 +22,6 @@ def predict(logits, classification_type='multilabel'):
 class TrainingDataset(Dataset):
     """
     data: dict of lang specific tokenized data
-    labels: dict of lang specific targets
     """

     def __init__(self, data, labels):
@@ -156,7 +156,7 @@ def do_tokenization(l_dataset, max_len=512, verbose=True):
 def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile, log_interval=10):
     # _dataset_path = opt.dataset.split('/')[-1].split('_')
     # dataset_id = _dataset_path[0] + _dataset_path[-1]
-    dataset_id = 'TODO fix this!'
+    dataset_id = 'TODO fix this!'  # TODO
     loss_history = []
     model.train()
@@ -231,12 +231,13 @@ def feature_extractor(data, lang_ids, model):
     Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for
     the output of each layer) of shape (batch_size, sequence_length, hidden_size)
     """
+    show_gpu('Before Training')
     all_batch_embeddings = {}
     id2lang = {v: k for k, v in lang_ids.items()}
     with torch.no_grad():
         for batch, lang_idx in data:
-        # for batch, target, lang_idx in data:
             out = model(batch.cuda())
+            # show_gpu('After Batch Prediction')
             last_hidden_state = out[1][-1]
             batch_embeddings = last_hidden_state[:, 0, :]
             for i, l_idx in enumerate(lang_idx.numpy()):
@@ -245,5 +246,5 @@ def feature_extractor(data, lang_ids, model):
             else:
                 all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]],
                                                                   batch_embeddings[i].detach().cpu().numpy()))
+    show_gpu('After Full Prediction')
     return all_batch_embeddings, id2lang
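show_gpu is imported from util.common, whose implementation is not part of this diff; a minimal stand-in that just reports CUDA memory usage (an assumption, not the project's actual helper) could look like:

import torch

def show_gpu(msg=''):
    # hypothetical helper: print current GPU memory usage, prefixed with a caller-supplied label
    if not torch.cuda.is_available():
        print(f'{msg}: CUDA not available')
        return
    allocated = torch.cuda.memory_allocated() / 1024 ** 3
    reserved = torch.cuda.memory_reserved() / 1024 ** 3
    print(f'{msg}: {allocated:.2f} GiB allocated, {reserved:.2f} GiB reserved')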

File 4 of 4 (module imports)

@@ -4,7 +4,6 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.svm import SVC
 from sklearn.model_selection import train_test_split
 from embeddings.supervised import get_supervised_embeddings
-# from learning.transformers import PosteriorProbabilitiesEmbedder, TfidfVectorizerMultilingual
 import numpy as np
 from tqdm import tqdm
 import torch