rsc branch;

load pre-computed VGs' output if it has already been stored on disk
This commit is contained in:
andrea 2020-11-19 14:30:10 +01:00
parent 8af763b130
commit 515acae15b
4 changed files with 92 additions and 58 deletions
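The diff below inlines a load-or-compute cache in each view generator's transform(). A minimal standalone sketch of that pattern, under assumed names (cached_transform and the compute callback are illustrative, not part of the repository; exists and create_if_not_exist mirror the util.file helpers imported below, and the sketch checks the pickle file whereas the diff checks the split directory):

import os
import pickle

def exists(path):
    return os.path.exists(path)

def create_if_not_exist(path):
    os.makedirs(path, exist_ok=True)

def cached_transform(storing_path, split, method_id, compute):
    # load the pickled result for this split if a previous run dumped it,
    # otherwise compute it, dump it, and return it
    dump_dir = os.path.join(storing_path, split)              # e.g. ../dumps/X/<dataset>/tr
    dump_file = os.path.join(dump_dir, f'{method_id}.pickle')
    if exists(dump_file):
        print('NB: loading pre-computed results!')
        with open(dump_file, 'rb') as infile:
            return pickle.load(infile)
    result = compute()                                        # e.g. lambda: _normalize(self.predict_proba(lX), self.l2)
    create_if_not_exist(dump_dir)
    with open(dump_file, 'wb') as outfile:
        pickle.dump(result, outfile)
    return result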

View File

@ -13,9 +13,10 @@ from scipy.sparse import csr_matrix
from models.mBert import *
from models.lstm_class import *
from util.csv_log import CSVLog
from util.file import get_file_name
from util.file import get_file_name, create_if_not_exist, exists
from util.early_stop import EarlyStopping
from util.common import *
import pickle
import time
@ -54,7 +55,6 @@ class FeatureWeight:
elif self.agg == 'mean':
F = tsr_matrix.mean(axis=0)
self.lF[l] = F
self.fitted = True
return self
@ -71,7 +71,7 @@ class FeatureWeight:
class PosteriorProbabilitiesEmbedder:
def __init__(self, first_tier_learner, first_tier_parameters=None, l2=True, n_jobs=-1):
def __init__(self, first_tier_learner, first_tier_parameters=None, l2=True, n_jobs=-1, is_training=True, storing_path='../dumps/'):
self.fist_tier_learner = first_tier_learner
self.fist_tier_parameters = first_tier_parameters
self.l2 = l2
@ -80,8 +80,13 @@ class PosteriorProbabilitiesEmbedder:
self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs
)
self.requires_tfidf = True
self.storing_path = storing_path
self.is_training = is_training
def fit(self, lX, lY, lV=None, called_by_viewgen=False):
if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'):
print(f'NB: skipping fit of {self.storing_path.split("/")[2]}: pre-computed results already exist')
return self
if not called_by_viewgen:
# Avoid printing if method is called by another View Gen (e.g., GRU ViewGen)
print('### Posterior Probabilities View Generator (X)')
@ -90,8 +95,22 @@ class PosteriorProbabilitiesEmbedder:
return self
def transform(self, lX):
# if the dump dir exists, load and return the already-computed results
_endpoint = 'tr' if self.is_training else 'te'
_actual_path = self.storing_path + '/' + _endpoint
if exists(_actual_path):
print('NB: loading pre-computed results!')
with open(_actual_path + '/X.pickle', 'rb') as infile:
self.is_training = False
return pickle.load(infile)
lZ = self.predict_proba(lX)
lZ = _normalize(lZ, self.l2)
# create dir and dump computed results
create_if_not_exist(_actual_path)
with open(_actual_path + '/X.pickle', 'wb') as outfile:
pickle.dump(lZ, outfile)
self.is_training = False
return lZ
def fit_transform(self, lX, ly=None, lV=None):
@ -105,10 +124,8 @@ class PosteriorProbabilitiesEmbedder:
def predict_proba(self, lX, ly=None):
print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} documents')
return self.doc_projector.predict_proba(lX)
def _get_output_dim(self):
return len(self.doc_projector.model['da'].model.classes_)
lZ = self.doc_projector.predict_proba(lX)
return lZ
class MuseEmbedder:
@ -222,8 +239,8 @@ class MBertEmbedder:
tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target)
va_dataset = TrainingDataset(l_split_va, l_split_val_target)
tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True)
va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=True)
tr_dataloader = DataLoader(tr_dataset, batch_size=64, shuffle=True)
va_dataloader = DataLoader(va_dataset, batch_size=64, shuffle=True)
nC = tr_dataset.get_nclasses()
model = get_model(nC)
@ -272,7 +289,7 @@ class MBertEmbedder:
l_tokenized_X = do_tokenization(lX, max_len=512, verbose=True)
feat_dataset = ExtractorDataset(l_tokenized_X)
feat_lang_ids = feat_dataset.lang_ids
dataloader = DataLoader(feat_dataset, batch_size=64)
dataloader = DataLoader(feat_dataset, batch_size=64) # TODO reduced batch size in JRC experiments
all_batch_embeddings, id2lang = feature_extractor(dataloader, feat_lang_ids, self.model)
return all_batch_embeddings
@ -326,15 +343,8 @@ class RecurrentEmbedder:
self.early_stop = EarlyStopping(self.model, optimizer=self.optim, patience=self.patience,
checkpoint=f'{self.checkpoint_dir}/gru_viewgen_-{get_file_name(self.options.dataset)}')
# Init SVM in order to recast (vstacked) document embeddings to vectors of Posterior Probabilities
self.posteriorEmbedder = MetaClassifier(
SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=options.n_jobs)
def fit(self, lX, ly, lV=None, batch_size=64, nepochs=200, val_epochs=1):
def fit(self, lX, ly, lV=None, batch_size=128, nepochs=200, val_epochs=1):
print('### Gated Recurrent Unit View Generator (G)')
# self.multilingual_index.get_indexed(lX, self.lpretrained_vocabulary)
# it might be better to init the model here, at the first .fit() call
if self.model is None:
print('TODO: Init model!')
if not self.is_trained:
@ -358,7 +368,7 @@ class RecurrentEmbedder:
tinit=tinit, logfile=logfile, criterion=self.criterion, optim=self.optim,
epoch=epoch, method_name=method_name, opt=self.options, ltrain_posteriors=None,
ltrain_bert=None)
self.lr_scheduler.step() # reduces the learning rate # TODO arg epoch?
self.lr_scheduler.step()
# validation step
macrof1 = test_gru(self.model, batcher_eval, l_val_index, None, None, l_val_target, tinit, epoch,
@ -384,21 +394,15 @@ class RecurrentEmbedder:
ltrain_bert=None)
self.is_trained = True
# Generate document embeddings in order to fit an SVM that recasts them as vectors of Posterior Probabilities
# lX = self._get_doc_embeddings(lX)
lX = self._get_doc_embeddings(self.multilingual_index.l_devel_index())
# Fit a 'multi-lingual' SVM on the generated doc embeddings
self.posteriorEmbedder.fit(lX, ly)
return self
def transform(self, lX, batch_size=64):
lX = self.multilingual_index.get_indexed(lX, self.lpretrained_vocabulary)
lX = self._get_doc_embeddings(lX)
return self.posteriorEmbedder.predict_proba(lX)
return lX
def fit_transform(self, lX, ly, lV=None):
# TODO
return 0
return self.fit(lX, ly).transform(lX)
def _get_doc_embeddings(self, lX, batch_size=64):
assert self.is_trained, 'Model is not trained, cannot call transform before fitting the model!'
@ -418,7 +422,7 @@ class RecurrentEmbedder:
# loads the MUSE embeddings if requested, or returns empty dictionaries otherwise
def _load_pretrained_embeddings(self, we_path, langs):
lpretrained = lpretrained_vocabulary = self._none_dict(langs) # TODO ?
lpretrained = lpretrained_vocabulary = self._none_dict(langs)
lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1)
lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs}
return lpretrained, lpretrained_vocabulary
@ -495,26 +499,15 @@ class DocEmbedderList:
return self.embedders[0].transform(lX)
langs = sorted(lX.keys())
lZparts = {l: None for l in langs}
# min_dim = min([transformer._get_output_dim() for transformer in self.embedders])
min_dim = 73 # TODO <---- this should be the number of target classes
for transformer in self.embedders:
_lX = lX
if transformer.requires_tfidf:
_lX = tfidf
lZ = transformer.transform(_lX)
nC = min([lZ[lang].shape[1] for lang in langs])
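# project each language's matrix down to min_dim components so that the different views can be combined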
for l in langs:
Z = lZ[l]
if Z.shape[1] > min_dim:
print(
f'Space Z matrix has more dimensions ({Z.shape[1]}) than the smallest representation {min_dim}. '
f'Applying PCA(n_components={min_dim})')
pca = PCA(n_components=min_dim)
Z = pca.fit(Z).transform(Z)
if lZparts[l] is None:
lZparts[l] = Z
else:
@ -535,7 +528,7 @@ class DocEmbedderList:
class FeatureSet2Posteriors:
def __init__(self, transformer, requires_tfidf=False, l2=True, n_jobs=-1):
def __init__(self, transformer, method_id, requires_tfidf=False, l2=True, n_jobs=-1, storing_path='../dumps/'):
self.transformer = transformer
self.l2 = l2
self.n_jobs = n_jobs
@ -543,7 +536,15 @@ class FeatureSet2Posteriors:
SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
self.requires_tfidf = requires_tfidf
self.storing_path = storing_path
self.is_training = True
self.method_id = method_id
def fit(self, lX, ly, lV=None):
if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'):
print(f'NB: skipping fit of {self.storing_path.split("/")[2]}: pre-computed results already exist')
return self
if lV is None and hasattr(self.transformer, 'lV'):
lV = self.transformer.lV
lZ = self.transformer.fit_transform(lX, ly, lV)
@ -551,8 +552,22 @@ class FeatureSet2Posteriors:
return self
def transform(self, lX):
# if the dump dir exists, load and return the already-computed results
_endpoint = 'tr' if self.is_training else 'te'
_actual_path = self.storing_path + '/' + _endpoint
if exists(_actual_path):
print('NB: loading pre-computed results!')
with open(_actual_path + '/' + self.method_id + '.pickle', 'rb') as infile:
self.is_training = False
return pickle.load(infile)
lP = self.predict_proba(lX)
lP = _normalize(lP, self.l2)
# create dir and dump computed results
create_if_not_exist(_actual_path)
with open(_actual_path + '/' + self.method_id + '.pickle', 'wb') as outfile:
pickle.dump(lP, outfile)
self.is_training = False
return lP
def fit_transform(self, lX, ly, lV):
@ -691,7 +706,7 @@ def word_class_embedding_matrix(X, Y, max_label_space=300):
def XdotM(X, M, sif):
E = X.dot(M)
if sif:
print("removing pc...")
# print("removing pc...")
E = remove_pc(E, npc=1)
return E
@ -714,7 +729,7 @@ class BatchGRU:
def batchify(self, l_index, l_post, l_bert, llabels, extractor=False):
langs = self.languages
l_num_samples = {l:len(l_index[l]) for l in langs}
l_num_samples = {l: len(l_index[l]) for l in langs}
max_samples = max(l_num_samples.values())
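# ceiling division: one extra batch when max_samples is not a multiple of the batch size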
n_batches = max_samples // self.batchsize + 1 * (max_samples % self.batchsize > 0)
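A hedged usage sketch of how the tr/te caching added to PosteriorProbabilitiesEmbedder and FeatureSet2Posteriors above is expected to behave across calls (emb stands for any of the caching view generators; the variable names are illustrative):

emb.fit(lXtr, lytr)         # returns early when ../dumps/<VG>/<dataset>/tr or /te already exists
lZtr = emb.transform(lXtr)  # is_training=True  -> loads or dumps the 'tr' pickle, then sets is_training=False
lZte = emb.transform(lXte)  # is_training=False -> loads or dumps the 'te' pickle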

View File

@ -28,7 +28,7 @@ if __name__ == '__main__':
op.gruViewGenerator, op.gruMUSE, op.gruWCE, op.agg, op.allprob)
print(f'Method: gFun{method_name}\nDataset: {dataset_name}')
print('-'*50)
# set z-score range - with slice(0, 0) the mean will be 0 and the std 1, so normalization will have no effect
standardize_range = slice(0, 0)
if op.zscore:
@ -36,7 +36,7 @@ if __name__ == '__main__':
# load dataset
data = MultilingualDataset.load(dataset)
# data.set_view(languages=['nl', 'it']) # TODO: DEBUG SETTING
data.set_view(languages=['nl', 'it']) # TODO: DEBUG SETTING
data.show_dimensions()
lXtr, lytr = data.training()
lXte, lyte = data.test()
@ -56,18 +56,26 @@ if __name__ == '__main__':
View Generator (-X): casts document representations encoded via TFIDF into posterior probabilities by means
of a set of SVMs.
"""
# Check if we already have VG outputs from previous runs
VG_name = 'X'
storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
exist = exists(storing_path)
doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True,
kernel='linear',
C=op.set_c), l2=l2))
C=op.set_c),
l2=l2, storing_path=storing_path))
if op.supervised:
"""
View Generator (-W): generates document representations via Word-Class Embeddings.
Document embeddings are obtained via a weighted sum of the documents' constituent word embeddings.
"""
VG_name = 'W'
storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
exist = exists(storing_path)
wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, sif=op.sif)
if op.allprob:
wce = FeatureSet2Posteriors(wce, requires_tfidf=True, l2=l2)
wce = FeatureSet2Posteriors(wce, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path)
doc_embedder.append(wce)
if op.pretrained:
@ -75,30 +83,41 @@ if __name__ == '__main__':
View Generator (-M): generates document representations via MUSE embeddings (fastText multilingual word
embeddings). Document embeddings are obtained via a weighted sum of the documents' constituent word embeddings.
"""
VG_name = 'M'
storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
exist = exists(storing_path)
muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif)
if op.allprob:
muse = FeatureSet2Posteriors(muse, requires_tfidf=True, l2=l2)
muse = FeatureSet2Posteriors(muse, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path)
doc_embedder.append(muse)
if op.gruViewGenerator:
"""
View Generator (-G): generates document embeddings by means of Gated Recurrent Units. The model can be
initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, etc.). Such
document embeddings are then cast into vectors of posterior probabilities via a set of SVMs.
NB: --allprob won't have any effect on this View Gen since the output is already encoded as posterior probabilities
initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, etc.).
Output dimension is (n_docs, 512). If --allprob is set, the output will be cast to the posterior-probability space via an SVM.
"""
VG_name = 'G'
VG_name += '_muse' if op.gruMUSE else ''
VG_name += '_wce' if op.gruWCE else ''
storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
rnn_embedder = RecurrentEmbedder(pretrained=op.gruMUSE, supervised=op.gruWCE, multilingual_dataset=data,
options=op, model_path=op.gru_path)
if op.allprob:
rnn_embedder = FeatureSet2Posteriors(rnn_embedder, method_id=VG_name, requires_tfidf=False,
storing_path=storing_path)
doc_embedder.append(rnn_embedder)
if op.mbert:
"""
View Generator (-B): generates document embeddings via the mBERT model.
"""
mbert = MBertEmbedder(path_to_model=op.bert_path,
nC=data.num_categories())
VG_name = 'B'
storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
mbert = MBertEmbedder(path_to_model=op.bert_path, nC=data.num_categories())
if op.allprob:
mbert = FeatureSet2Posteriors(mbert, l2=l2)
mbert = FeatureSet2Posteriors(mbert, method_id=VG_name, l2=l2, storing_path=storing_path)
doc_embedder.append(mbert)
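For orientation, a sketch of the on-disk layout these storing_path strings imply (<dataset> is the dataset file name up to its first '.'; the tr/te subfolders are created lazily by each view generator's transform(), and each line applies only when the corresponding option is enabled):

../dumps/
    X/<dataset>/{tr,te}/X.pickle                              # posterior probabilities VG (-X)
    W/<dataset>/{tr,te}/W.pickle                              # WCE VG (-W), via FeatureSet2Posteriors
    M/<dataset>/{tr,te}/M.pickle                              # MUSE VG (-M), via FeatureSet2Posteriors
    G[_muse][_wce]/<dataset>/{tr,te}/G[_muse][_wce].pickle    # GRU VG (-G), only with --allprob
    B/<dataset>/{tr,te}/B.pickle                              # mBERT VG (-B), via FeatureSet2Posteriors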
# metaclassifier

View File

@ -5,6 +5,7 @@ from transformers import BertForSequenceClassification, BertTokenizer, AdamW, Be
from sklearn.model_selection import train_test_split
from util.evaluation import *
from time import time
from util.common import show_gpu
def predict(logits, classification_type='multilabel'):
@ -21,7 +22,6 @@ def predict(logits, classification_type='multilabel'):
class TrainingDataset(Dataset):
"""
data: dict of lang specific tokenized data
labels: dict of lang specific targets
"""
def __init__(self, data, labels):
@ -156,7 +156,7 @@ def do_tokenization(l_dataset, max_len=512, verbose=True):
def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile, log_interval=10):
# _dataset_path = opt.dataset.split('/')[-1].split('_')
# dataset_id = _dataset_path[0] + _dataset_path[-1]
dataset_id = 'TODO fix this!'
dataset_id = 'TODO fix this!' # TODO
loss_history = []
model.train()
@ -231,12 +231,13 @@ def feature_extractor(data, lang_ids, model):
Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for
the output of each layer) of shape (batch_size, sequence_length, hidden_size)
"""
show_gpu('Before Training')
all_batch_embeddings = {}
id2lang = {v: k for k, v in lang_ids.items()}
with torch.no_grad():
for batch, lang_idx in data:
# for batch, target, lang_idx in data:
out = model(batch.cuda())
# show_gpu('After Batch Prediction')
last_hidden_state = out[1][-1]
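# take the first token's ([CLS]) embedding as the document-level representation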
batch_embeddings = last_hidden_state[:, 0, :]
for i, l_idx in enumerate(lang_idx.numpy()):
@ -245,5 +246,5 @@ def feature_extractor(data, lang_ids, model):
else:
all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]],
batch_embeddings[i].detach().cpu().numpy()))
show_gpu('After Full Prediction')
return all_batch_embeddings, id2lang

View File

@ -4,7 +4,6 @@ from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from embeddings.supervised import get_supervised_embeddings
# from learning.transformers import PosteriorProbabilitiesEmbedder, TfidfVectorizerMultilingual
import numpy as np
from tqdm import tqdm
import torch