rsc branch; load pre-computed VG outputs if already stored on disk

parent 8af763b130 · commit 515acae15b
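
The hunks below add a simple disk cache to each view generator (VG): before recomputing, transform() looks for a pickled dump under ../dumps/<VG_name>/<dataset>/{tr,te} and loads it if present; otherwise it computes the output and dumps it there. A minimal sketch of that load-or-compute pattern, using only the standard library; the helper name load_or_compute and its arguments are illustrative, not part of the diff:

import os
import pickle


def load_or_compute(storing_path, split, file_id, compute_fn):
    """Return the cached result for `split` ('tr' or 'te') if a dump exists, else compute and dump it."""
    split_dir = os.path.join(storing_path, split)             # e.g. ../dumps/X/<dataset>/tr
    dump_file = os.path.join(split_dir, f'{file_id}.pickle')  # e.g. .../tr/X.pickle
    if os.path.exists(dump_file):
        print('NB: loading pre-computed results!')
        with open(dump_file, 'rb') as infile:
            return pickle.load(infile)
    result = compute_fn()
    os.makedirs(split_dir, exist_ok=True)
    with open(dump_file, 'wb') as outfile:
        pickle.dump(result, outfile)
    return result
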
@@ -13,9 +13,10 @@ from scipy.sparse import csr_matrix
from models.mBert import *
from models.lstm_class import *
from util.csv_log import CSVLog
from util.file import get_file_name
from util.file import get_file_name, create_if_not_exist, exists
from util.early_stop import EarlyStopping
from util.common import *
import pickle
import time
@@ -54,7 +55,6 @@ class FeatureWeight:
            elif self.agg == 'mean':
                F = tsr_matrix.mean(axis=0)
            self.lF[l] = F

        self.fitted = True
        return self
@@ -71,7 +71,7 @@ class FeatureWeight:

class PosteriorProbabilitiesEmbedder:

    def __init__(self, first_tier_learner, first_tier_parameters=None, l2=True, n_jobs=-1):
    def __init__(self, first_tier_learner, first_tier_parameters=None, l2=True, n_jobs=-1, is_training=True, storing_path='../dumps/'):
        self.fist_tier_learner = first_tier_learner
        self.fist_tier_parameters = first_tier_parameters
        self.l2 = l2
@@ -80,8 +80,13 @@ class PosteriorProbabilitiesEmbedder:
            self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs
        )
        self.requires_tfidf = True
        self.storing_path = storing_path
        self.is_training = is_training

    def fit(self, lX, lY, lV=None, called_by_viewgen=False):
        if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'):
            print(f'NB: Avoid fitting {self.storing_path.split("/")[2]} since we have already pre-computed results')
            return self
        if not called_by_viewgen:
            # Avoid printing if method is called by another View Gen (e.g., GRU ViewGen)
            print('### Posterior Probabilities View Generator (X)')
@@ -90,8 +95,22 @@ class PosteriorProbabilitiesEmbedder:
        return self

    def transform(self, lX):
        # if the dir exists, load and return the already computed results
        _endpoint = 'tr' if self.is_training else 'te'
        _actual_path = self.storing_path + '/' + _endpoint
        if exists(_actual_path):
            print('NB: loading pre-computed results!')
            with open(_actual_path + '/X.pickle', 'rb') as infile:
                self.is_training = False
                return pickle.load(infile)

        lZ = self.predict_proba(lX)
        lZ = _normalize(lZ, self.l2)
        # create dir and dump computed results
        create_if_not_exist(_actual_path)
        with open(_actual_path + '/X.pickle', 'wb') as outfile:
            pickle.dump(lZ, outfile)
        self.is_training = False
        return lZ

    def fit_transform(self, lX, ly=None, lV=None):
@@ -105,10 +124,8 @@ class PosteriorProbabilitiesEmbedder:

    def predict_proba(self, lX, ly=None):
        print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} documents')
        return self.doc_projector.predict_proba(lX)

    def _get_output_dim(self):
        return len(self.doc_projector.model['da'].model.classes_)
        lZ = self.doc_projector.predict_proba(lX)
        return lZ


class MuseEmbedder:
@@ -222,8 +239,8 @@ class MBertEmbedder:

        tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target)
        va_dataset = TrainingDataset(l_split_va, l_split_val_target)
        tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True)
        va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=True)
        tr_dataloader = DataLoader(tr_dataset, batch_size=64, shuffle=True)
        va_dataloader = DataLoader(va_dataset, batch_size=64, shuffle=True)

        nC = tr_dataset.get_nclasses()
        model = get_model(nC)
@@ -272,7 +289,7 @@ class MBertEmbedder:
        l_tokenized_X = do_tokenization(lX, max_len=512, verbose=True)
        feat_dataset = ExtractorDataset(l_tokenized_X)
        feat_lang_ids = feat_dataset.lang_ids
        dataloader = DataLoader(feat_dataset, batch_size=64)
        dataloader = DataLoader(feat_dataset, batch_size=64)  # TODO reduced batch size in JRC experiments
        all_batch_embeddings, id2lang = feature_extractor(dataloader, feat_lang_ids, self.model)
        return all_batch_embeddings
@@ -326,15 +343,8 @@ class RecurrentEmbedder:
        self.early_stop = EarlyStopping(self.model, optimizer=self.optim, patience=self.patience,
                                        checkpoint=f'{self.checkpoint_dir}/gru_viewgen_-{get_file_name(self.options.dataset)}')

        # Init SVM in order to recast (vstacked) document embeddings to vectors of Posterior Probabilities
        self.posteriorEmbedder = MetaClassifier(
            SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=options.n_jobs)

    def fit(self, lX, ly, lV=None, batch_size=64, nepochs=200, val_epochs=1):
    def fit(self, lX, ly, lV=None, batch_size=128, nepochs=200, val_epochs=1):
        print('### Gated Recurrent Unit View Generator (G)')
        # self.multilingual_index.get_indexed(lX, self.lpretrained_vocabulary)
        # could be better to init model here at first .fit() call!
        if self.model is None:
            print('TODO: Init model!')
        if not self.is_trained:
@@ -358,7 +368,7 @@ class RecurrentEmbedder:
                          tinit=tinit, logfile=logfile, criterion=self.criterion, optim=self.optim,
                          epoch=epoch, method_name=method_name, opt=self.options, ltrain_posteriors=None,
                          ltrain_bert=None)
            self.lr_scheduler.step()  # reduces the learning rate  # TODO arg epoch?
            self.lr_scheduler.step()

            # validation step
            macrof1 = test_gru(self.model, batcher_eval, l_val_index, None, None, l_val_target, tinit, epoch,
@@ -384,21 +394,15 @@ class RecurrentEmbedder:
                          ltrain_bert=None)
        self.is_trained = True

        # Generate document embeddings in order to fit an SVM that recasts them as vectors of posterior probabilities
        # lX = self._get_doc_embeddings(lX)
        lX = self._get_doc_embeddings(self.multilingual_index.l_devel_index())
        # Fit a "multi-lingual" SVM on the generated doc embeddings
        self.posteriorEmbedder.fit(lX, ly)
        return self

    def transform(self, lX, batch_size=64):
        lX = self.multilingual_index.get_indexed(lX, self.lpretrained_vocabulary)
        lX = self._get_doc_embeddings(lX)
        return self.posteriorEmbedder.predict_proba(lX)
        return lX

    def fit_transform(self, lX, ly, lV=None):
        # TODO
        return 0
        return self.fit(lX, ly).transform(lX)

    def _get_doc_embeddings(self, lX, batch_size=64):
        assert self.is_trained, 'Model is not trained, cannot call transform before fitting the model!'
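
Review note: transform() above shows two adjacent return statements, one per side of the change. Going by the updated -G docstring later in the diff (the GRU view generator now emits raw (n_docs, 512) document embeddings, and FeatureSet2Posteriors does the casting when --allprob is set), the new method would reduce to the sketch below. This is an inference from the diff, not the confirmed final code:

    def transform(self, lX, batch_size=64):
        # assumed new behaviour: return the raw GRU document embeddings and leave
        # the casting to posterior probabilities to FeatureSet2Posteriors (--allprob)
        lX = self.multilingual_index.get_indexed(lX, self.lpretrained_vocabulary)
        lX = self._get_doc_embeddings(lX)
        return lX
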
@@ -418,7 +422,7 @@ class RecurrentEmbedder:

    # loads the MUSE embeddings if requested, or returns empty dictionaries otherwise
    def _load_pretrained_embeddings(self, we_path, langs):
        lpretrained = lpretrained_vocabulary = self._none_dict(langs)  # TODO ?
        lpretrained = lpretrained_vocabulary = self._none_dict(langs)
        lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1)
        lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs}
        return lpretrained, lpretrained_vocabulary
@@ -495,26 +499,15 @@ class DocEmbedderList:
            return self.embedders[0].transform(lX)

        langs = sorted(lX.keys())

        lZparts = {l: None for l in langs}

        # min_dim = min([transformer._get_output_dim() for transformer in self.embedders])
        min_dim = 73  # TODO <---- this should be the number of target classes

        for transformer in self.embedders:
            _lX = lX
            if transformer.requires_tfidf:
                _lX = tfidf
            lZ = transformer.transform(_lX)
            nC = min([lZ[lang].shape[1] for lang in langs])
            for l in langs:
                Z = lZ[l]
                if Z.shape[1] > min_dim:
                    print(
                        f'Space Z matrix has more dimensions ({Z.shape[1]}) than the smallest representation {min_dim}. '
                        f'Applying PCA(n_components={min_dim})')
                    pca = PCA(n_components=min_dim)
                    Z = pca.fit(Z).transform(Z)
                if lZparts[l] is None:
                    lZparts[l] = Z
                else:
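
The hard-coded min_dim = 73 above carries its own TODO: it should be the number of target classes. A hedged sketch of one way to derive it, assuming DocEmbedderList is given (or keeps) the training label matrices ly, which share one label space across languages; the helper name is illustrative, not part of the diff:

    def _infer_min_dim(self, ly):
        # any language works here: the label space is shared across languages
        any_lang = next(iter(ly))
        return ly[any_lang].shape[1]  # number of target classes
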
@@ -535,7 +528,7 @@ class DocEmbedderList:


class FeatureSet2Posteriors:
    def __init__(self, transformer, requires_tfidf=False, l2=True, n_jobs=-1):
    def __init__(self, transformer, method_id, requires_tfidf=False, l2=True, n_jobs=-1, storing_path='../dumps/'):
        self.transformer = transformer
        self.l2 = l2
        self.n_jobs = n_jobs
@@ -543,7 +536,15 @@ class FeatureSet2Posteriors:
            SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
        self.requires_tfidf = requires_tfidf

        self.storing_path = storing_path
        self.is_training = True
        self.method_id = method_id

    def fit(self, lX, ly, lV=None):
        if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'):
            print(f'NB: Avoid fitting {self.storing_path.split("/")[2]} since we have already pre-computed results')
            return self

        if lV is None and hasattr(self.transformer, 'lV'):
            lV = self.transformer.lV
        lZ = self.transformer.fit_transform(lX, ly, lV)
@@ -551,8 +552,22 @@ class FeatureSet2Posteriors:
        return self

    def transform(self, lX):
        # if the dir exists, load and return the already computed results
        _endpoint = 'tr' if self.is_training else 'te'
        _actual_path = self.storing_path + '/' + _endpoint
        if exists(_actual_path):
            print('NB: loading pre-computed results!')
            with open(_actual_path + '/' + self.method_id + '.pickle', 'rb') as infile:
                self.is_training = False
                return pickle.load(infile)

        lP = self.predict_proba(lX)
        lP = _normalize(lP, self.l2)
        # create dir and dump computed results
        create_if_not_exist(_actual_path)
        with open(_actual_path + '/' + self.method_id + '.pickle', 'wb') as outfile:
            pickle.dump(lP, outfile)
        self.is_training = False
        return lP

    def fit_transform(self, lX, ly, lV):
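
Both transform() methods (here and in PosteriorProbabilitiesEmbedder above) choose between the 'tr' and 'te' dump with an is_training flag that flips to False after the first call, so the call order matters. A small usage sketch under that assumption; the wrapped transformer and the dataset folder name are illustrative, not taken from the diff:

wce = FeatureSet2Posteriors(wce_transformer, method_id='W', requires_tfidf=True,
                            l2=True, storing_path='../dumps/W/some_dataset')
wce.fit(lXtr, lytr)
Ztr = wce.transform(lXtr)  # is_training is True  -> reads/writes .../tr/W.pickle
Zte = wce.transform(lXte)  # flag now False       -> reads/writes .../te/W.pickle
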
@@ -691,7 +706,7 @@ def word_class_embedding_matrix(X, Y, max_label_space=300):
def XdotM(X, M, sif):
    E = X.dot(M)
    if sif:
        print("removing pc...")
        # print("removing pc...")
        E = remove_pc(E, npc=1)
    return E
@@ -714,7 +729,7 @@ class BatchGRU:

    def batchify(self, l_index, l_post, l_bert, llabels, extractor=False):
        langs = self.languages
        l_num_samples = {l:len(l_index[l]) for l in langs}
        l_num_samples = {l: len(l_index[l]) for l in langs}

        max_samples = max(l_num_samples.values())
        n_batches = max_samples // self.batchsize + 1 * (max_samples % self.batchsize > 0)
@@ -28,7 +28,7 @@ if __name__ == '__main__':
                              op.gruViewGenerator, op.gruMUSE, op.gruWCE, op.agg, op.allprob)
    print(f'Method: gFun{method_name}\nDataset: {dataset_name}')
    print('-'*50)

    # set z-score range - with slice(0, 0) the mean is set to 0 and the std to 1, so normalization has no effect
    standardize_range = slice(0, 0)
    if op.zscore:
@@ -36,7 +36,7 @@ if __name__ == '__main__':

    # load dataset
    data = MultilingualDataset.load(dataset)
    # data.set_view(languages=['nl', 'it'])  # TODO: DEBUG SETTING
    data.set_view(languages=['nl', 'it'])  # TODO: DEBUG SETTING
    data.show_dimensions()
    lXtr, lytr = data.training()
    lXte, lyte = data.test()
@@ -56,18 +56,26 @@ if __name__ == '__main__':
        View Generator (-X): casts document representations encoded via TFIDF into posterior probabilities by means
        of a set of SVMs.
        """
        # Check if we already have VG outputs from previous runs
        VG_name = 'X'
        storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
        exist = exists(storing_path)
        doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True,
                                                                                          kernel='linear',
                                                                                          C=op.set_c), l2=l2))
                                                                                          C=op.set_c),
                                                           l2=l2, storing_path=storing_path))

    if op.supervised:
        """
        View Generator (-W): generates document representations via Word-Class-Embeddings.
        Document embeddings are obtained via a weighted sum of each document's constituent embeddings.
        """
        VG_name = 'W'
        storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
        exist = exists(storing_path)
        wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, sif=op.sif)
        if op.allprob:
            wce = FeatureSet2Posteriors(wce, requires_tfidf=True, l2=l2)
            wce = FeatureSet2Posteriors(wce, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path)
        doc_embedder.append(wce)

    if op.pretrained:
@@ -75,30 +83,41 @@ if __name__ == '__main__':
        View Generator (-M): generates document representations via MUSE embeddings (Fasttext multilingual word
        embeddings). Document embeddings are obtained via a weighted sum of each document's constituent embeddings.
        """
        VG_name = 'M'
        storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
        exist = exists(storing_path)
        muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif)
        if op.allprob:
            muse = FeatureSet2Posteriors(muse, requires_tfidf=True, l2=l2)
            muse = FeatureSet2Posteriors(muse, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path)
        doc_embedder.append(muse)

    if op.gruViewGenerator:
        """
        View Generator (-G): generates document embeddings by means of Gated Recurrent Units. The model can be
        initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, etc.). Such
        document embeddings are then cast into vectors of posterior probabilities via a set of SVMs.
        NB: --allprob has no effect on this View Gen since the output is already encoded as posterior probabilities
        initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, etc.).
        Output dimension is (n_docs, 512). If --allprob is set, the output is cast to the posterior probability space via SVM.
        """
        VG_name = 'G'
        VG_name += '_muse' if op.gruMUSE else ''
        VG_name += '_wce' if op.gruWCE else ''
        storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
        rnn_embedder = RecurrentEmbedder(pretrained=op.gruMUSE, supervised=op.gruWCE, multilingual_dataset=data,
                                         options=op, model_path=op.gru_path)
        if op.allprob:
            rnn_embedder = FeatureSet2Posteriors(rnn_embedder, method_id=VG_name, requires_tfidf=False,
                                                 storing_path=storing_path)
        doc_embedder.append(rnn_embedder)

    if op.mbert:
        """
        View generator (-B): generates document embeddings via the mBERT model.
        """
        mbert = MBertEmbedder(path_to_model=op.bert_path,
                              nC=data.num_categories())
        VG_name = 'B'
        storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'

        mbert = MBertEmbedder(path_to_model=op.bert_path, nC=data.num_categories())
        if op.allprob:
            mbert = FeatureSet2Posteriors(mbert, l2=l2)
            mbert = FeatureSet2Posteriors(mbert, method_id=VG_name, l2=l2, storing_path=storing_path)
        doc_embedder.append(mbert)

    # metaclassifier
@@ -5,6 +5,7 @@ from transformers import BertForSequenceClassification, BertTokenizer, AdamW, Be
from sklearn.model_selection import train_test_split
from util.evaluation import *
from time import time
from util.common import show_gpu


def predict(logits, classification_type='multilabel'):
@@ -21,7 +22,6 @@ def predict(logits, classification_type='multilabel'):
class TrainingDataset(Dataset):
    """
    data: dict of lang specific tokenized data
    labels: dict of lang specific targets
    """

    def __init__(self, data, labels):
@@ -156,7 +156,7 @@ def do_tokenization(l_dataset, max_len=512, verbose=True):
def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile, log_interval=10):
    # _dataset_path = opt.dataset.split('/')[-1].split('_')
    # dataset_id = _dataset_path[0] + _dataset_path[-1]
    dataset_id = 'TODO fix this!'
    dataset_id = 'TODO fix this!'  # TODO

    loss_history = []
    model.train()
@@ -231,12 +231,13 @@ def feature_extractor(data, lang_ids, model):
    Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for
    the output of each layer) of shape (batch_size, sequence_length, hidden_size)
    """
    show_gpu('Before Training')
    all_batch_embeddings = {}
    id2lang = {v: k for k, v in lang_ids.items()}
    with torch.no_grad():
        for batch, lang_idx in data:
            # for batch, target, lang_idx in data:
            out = model(batch.cuda())
            # show_gpu('After Batch Prediction')
            last_hidden_state = out[1][-1]
            batch_embeddings = last_hidden_state[:, 0, :]
            for i, l_idx in enumerate(lang_idx.numpy()):
@@ -245,5 +246,5 @@ def feature_extractor(data, lang_ids, model):
            else:
                all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]],
                                                                  batch_embeddings[i].detach().cpu().numpy()))

    show_gpu('After Full Prediction')
    return all_batch_embeddings, id2lang
@@ -4,7 +4,6 @@ from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from embeddings.supervised import get_supervised_embeddings
# from learning.transformers import PosteriorProbabilitiesEmbedder, TfidfVectorizerMultilingual
import numpy as np
from tqdm import tqdm
import torch