rsc branch; load pre-computed VGs' output if already stored in memory

This commit is contained in: parent 8af763b130 · commit 515acae15b
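In short: each view generator (VG) now checks a per-VG dump directory before fitting or transforming, loads the pickled output if it is already there, and otherwise computes it and dumps it for later runs. Below is a minimal sketch of that load-or-compute-and-dump pattern (illustrative only: cached_transform is not a function in the repository, and os.path.exists/os.makedirs stand in for the project's exists() and create_if_not_exist() helpers from util.file).

# Minimal sketch (not repository code) of the caching behaviour added in this commit.
# Assumptions: 'compute' is any callable producing the view-generator output (e.g. the
# L2-normalized posterior probabilities); os.path.exists/os.makedirs replace the
# util.file helpers exists() and create_if_not_exist() used in the actual code.
import os
import pickle


def cached_transform(compute, storing_path, method_id, is_training=True):
    endpoint = 'tr' if is_training else 'te'            # training vs. test split sub-folder
    actual_path = os.path.join(storing_path, endpoint)
    dump_file = os.path.join(actual_path, f'{method_id}.pickle')

    if os.path.exists(actual_path):                      # pre-computed output found: load and return it
        print('NB: loading pre-computed results!')
        with open(dump_file, 'rb') as infile:
            return pickle.load(infile)

    lZ = compute()                                       # otherwise compute the view...
    os.makedirs(actual_path, exist_ok=True)              # ...and dump it for subsequent runs
    with open(dump_file, 'wb') as outfile:
        pickle.dump(lZ, outfile)
    return lZ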
@@ -13,9 +13,10 @@ from scipy.sparse import csr_matrix
 from models.mBert import *
 from models.lstm_class import *
 from util.csv_log import CSVLog
-from util.file import get_file_name
+from util.file import get_file_name, create_if_not_exist, exists
 from util.early_stop import EarlyStopping
 from util.common import *
+import pickle
 import time

@@ -54,7 +55,6 @@ class FeatureWeight:
             elif self.agg == 'mean':
                 F = tsr_matrix.mean(axis=0)
             self.lF[l] = F

         self.fitted = True
         return self

@@ -71,7 +71,7 @@ class FeatureWeight:

 class PosteriorProbabilitiesEmbedder:

-    def __init__(self, first_tier_learner, first_tier_parameters=None, l2=True, n_jobs=-1):
+    def __init__(self, first_tier_learner, first_tier_parameters=None, l2=True, n_jobs=-1, is_training=True, storing_path='../dumps/'):
         self.fist_tier_learner = first_tier_learner
         self.fist_tier_parameters = first_tier_parameters
         self.l2 = l2

@@ -80,8 +80,13 @@ class PosteriorProbabilitiesEmbedder:
             self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs
         )
         self.requires_tfidf = True
+        self.storing_path = storing_path
+        self.is_training = is_training

     def fit(self, lX, lY, lV=None, called_by_viewgen=False):
+        if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'):
+            print(f'NB: Avoid fitting {self.storing_path.split("/")[2]} since we have already pre-computed results')
+            return self
         if not called_by_viewgen:
             # Avoid printing if method is called by another View Gen (e.g., GRU ViewGen)
             print('### Posterior Probabilities View Generator (X)')

@@ -90,8 +95,22 @@ class PosteriorProbabilitiesEmbedder:
         return self

     def transform(self, lX):
+        # if dir exist, load and return already computed results
+        _endpoint = 'tr' if self.is_training else 'te'
+        _actual_path = self.storing_path + '/' + _endpoint
+        if exists(_actual_path):
+            print('NB: loading pre-computed results!')
+            with open(_actual_path + '/X.pickle', 'rb') as infile:
+                self.is_training = False
+                return pickle.load(infile)
+
         lZ = self.predict_proba(lX)
         lZ = _normalize(lZ, self.l2)
+        # create dir and dump computed results
+        create_if_not_exist(_actual_path)
+        with open(_actual_path + '/X.pickle', 'wb') as outfile:
+            pickle.dump(lZ, outfile)
+        self.is_training = False
         return lZ

     def fit_transform(self, lX, ly=None, lV=None):

@@ -105,10 +124,8 @@ class PosteriorProbabilitiesEmbedder:

     def predict_proba(self, lX, ly=None):
         print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} documents')
-        return self.doc_projector.predict_proba(lX)
+        lZ = self.doc_projector.predict_proba(lX)
+        return lZ
-
-    def _get_output_dim(self):
-        return len(self.doc_projector.model['da'].model.classes_)


 class MuseEmbedder:

@@ -222,8 +239,8 @@ class MBertEmbedder:

         tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target)
         va_dataset = TrainingDataset(l_split_va, l_split_val_target)
-        tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True)
-        va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=True)
+        tr_dataloader = DataLoader(tr_dataset, batch_size=64, shuffle=True)
+        va_dataloader = DataLoader(va_dataset, batch_size=64, shuffle=True)

         nC = tr_dataset.get_nclasses()
         model = get_model(nC)

@@ -272,7 +289,7 @@ class MBertEmbedder:
         l_tokenized_X = do_tokenization(lX, max_len=512, verbose=True)
         feat_dataset = ExtractorDataset(l_tokenized_X)
         feat_lang_ids = feat_dataset.lang_ids
-        dataloader = DataLoader(feat_dataset, batch_size=64)
+        dataloader = DataLoader(feat_dataset, batch_size=64)  # TODO reduced batch size in JRC experiments
         all_batch_embeddings, id2lang = feature_extractor(dataloader, feat_lang_ids, self.model)
         return all_batch_embeddings

@@ -326,15 +343,8 @@ class RecurrentEmbedder:
         self.early_stop = EarlyStopping(self.model, optimizer=self.optim, patience=self.patience,
                                         checkpoint=f'{self.checkpoint_dir}/gru_viewgen_-{get_file_name(self.options.dataset)}')

-        # Init SVM in order to recast (vstacked) document embeddings to vectors of Posterior Probabilities
-        self.posteriorEmbedder = MetaClassifier(
-            SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=options.n_jobs)
-
-    def fit(self, lX, ly, lV=None, batch_size=64, nepochs=200, val_epochs=1):
+    def fit(self, lX, ly, lV=None, batch_size=128, nepochs=200, val_epochs=1):
         print('### Gated Recurrent Unit View Generator (G)')
-        # self.multilingual_index.get_indexed(lX, self.lpretrained_vocabulary)
-        # could be better to init model here at first .fit() call!
         if self.model is None:
             print('TODO: Init model!')
         if not self.is_trained:

@@ -358,7 +368,7 @@ class RecurrentEmbedder:
                       tinit=tinit, logfile=logfile, criterion=self.criterion, optim=self.optim,
                       epoch=epoch, method_name=method_name, opt=self.options, ltrain_posteriors=None,
                       ltrain_bert=None)
-            self.lr_scheduler.step()  # reduces the learning rate # TODO arg epoch?
+            self.lr_scheduler.step()

             # validation step
             macrof1 = test_gru(self.model, batcher_eval, l_val_index, None, None, l_val_target, tinit, epoch,

@@ -384,21 +394,15 @@ class RecurrentEmbedder:
                   ltrain_bert=None)
         self.is_trained = True

-        # Generate document embeddings in order to fit an SVM to recast them as vector for Posterior Probabilities
-        # lX = self._get_doc_embeddings(lX)
-        lX = self._get_doc_embeddings(self.multilingual_index.l_devel_index())
-        # Fit a ''multi-lingual'' SVM on the generated doc embeddings
-        self.posteriorEmbedder.fit(lX, ly)
         return self

     def transform(self, lX, batch_size=64):
         lX = self.multilingual_index.get_indexed(lX, self.lpretrained_vocabulary)
         lX = self._get_doc_embeddings(lX)
-        return self.posteriorEmbedder.predict_proba(lX)
+        return lX

     def fit_transform(self, lX, ly, lV=None):
-        # TODO
-        return 0
+        return self.fit(lX, ly).transform(lX)

     def _get_doc_embeddings(self, lX, batch_size=64):
         assert self.is_trained, 'Model is not trained, cannot call transform before fitting the model!'

@@ -418,7 +422,7 @@ class RecurrentEmbedder:

     # loads the MUSE embeddings if requested, or returns empty dictionaries otherwise
     def _load_pretrained_embeddings(self, we_path, langs):
-        lpretrained = lpretrained_vocabulary = self._none_dict(langs)  # TODO ?
+        lpretrained = lpretrained_vocabulary = self._none_dict(langs)
         lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1)
         lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs}
         return lpretrained, lpretrained_vocabulary

@@ -495,26 +499,15 @@ class DocEmbedderList:
             return self.embedders[0].transform(lX)

         langs = sorted(lX.keys())

         lZparts = {l: None for l in langs}

-        # min_dim = min([transformer._get_output_dim() for transformer in self.embedders])
-        min_dim = 73  # TODO <---- this should be the number of target classes

         for transformer in self.embedders:
             _lX = lX
             if transformer.requires_tfidf:
                 _lX = tfidf
             lZ = transformer.transform(_lX)
-            nC = min([lZ[lang].shape[1] for lang in langs])
             for l in langs:
                 Z = lZ[l]
-                if Z.shape[1] > min_dim:
-                    print(
-                        f'Space Z matrix has more dimensions ({Z.shape[1]}) than the smallest representation {min_dim}.'
-                        f'Applying PCA(n_components={min_dim})')
-                    pca = PCA(n_components=min_dim)
-                    Z = pca.fit(Z).transform(Z)
                 if lZparts[l] is None:
                     lZparts[l] = Z
                 else:

@@ -535,7 +528,7 @@ class DocEmbedderList:


 class FeatureSet2Posteriors:
-    def __init__(self, transformer, requires_tfidf=False, l2=True, n_jobs=-1):
+    def __init__(self, transformer, method_id, requires_tfidf=False, l2=True, n_jobs=-1, storing_path='../dumps/'):
         self.transformer = transformer
         self.l2 = l2
         self.n_jobs = n_jobs

@@ -543,7 +536,15 @@ class FeatureSet2Posteriors:
             SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
         self.requires_tfidf = requires_tfidf

+        self.storing_path = storing_path
+        self.is_training = True
+        self.method_id = method_id

     def fit(self, lX, ly, lV=None):
+        if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'):
+            print(f'NB: Avoid fitting {self.storing_path.split("/")[2]} since we have already pre-computed results')
+            return self
+
         if lV is None and hasattr(self.transformer, 'lV'):
             lV = self.transformer.lV
         lZ = self.transformer.fit_transform(lX, ly, lV)

@@ -551,8 +552,22 @@ class FeatureSet2Posteriors:
         return self

     def transform(self, lX):
+        # if dir exist, load and return already computed results
+        _endpoint = 'tr' if self.is_training else 'te'
+        _actual_path = self.storing_path + '/' + _endpoint
+        if exists(_actual_path):
+            print('NB: loading pre-computed results!')
+            with open(_actual_path + '/' + self.method_id + '.pickle', 'rb') as infile:
+                self.is_training = False
+                return pickle.load(infile)
+
         lP = self.predict_proba(lX)
         lP = _normalize(lP, self.l2)
+        # create dir and dump computed results
+        create_if_not_exist(_actual_path)
+        with open(_actual_path + '/' + self.method_id + '.pickle', 'wb') as outfile:
+            pickle.dump(lP, outfile)
+        self.is_training = False
         return lP

     def fit_transform(self, lX, ly, lV):

@@ -691,7 +706,7 @@ def word_class_embedding_matrix(X, Y, max_label_space=300):
 def XdotM(X, M, sif):
     E = X.dot(M)
     if sif:
-        print("removing pc...")
+        # print("removing pc...")
         E = remove_pc(E, npc=1)
     return E

@@ -36,7 +36,7 @@ if __name__ == '__main__':

     # load dataset
     data = MultilingualDataset.load(dataset)
-    # data.set_view(languages=['nl', 'it'])  # TODO: DEBUG SETTING
+    data.set_view(languages=['nl', 'it'])  # TODO: DEBUG SETTING
     data.show_dimensions()
     lXtr, lytr = data.training()
     lXte, lyte = data.test()

@@ -56,18 +56,26 @@ if __name__ == '__main__':
        View Generator (-X): cast document representations encoded via TFIDF into posterior probabilities by means
        of a set of SVM.
        """
+        # Check if we already have VG outputs from previous runs
+        VG_name = 'X'
+        storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
+        exist = exists(storing_path)
         doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True,
                                                                                           kernel='linear',
-                                                                                          C=op.set_c), l2=l2))
+                                                                                          C=op.set_c),
+                                                           l2=l2, storing_path=storing_path))

     if op.supervised:
         """
        View Generator (-W): generates document representation via Word-Class-Embeddings.
        Document embeddings are obtained via weighted sum of document's constituent embeddings.
        """
+        VG_name = 'W'
+        storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
+        exist = exists(storing_path)
         wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, sif=op.sif)
         if op.allprob:
-            wce = FeatureSet2Posteriors(wce, requires_tfidf=True, l2=l2)
+            wce = FeatureSet2Posteriors(wce, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path)
         doc_embedder.append(wce)

     if op.pretrained:

@@ -75,30 +83,41 @@ if __name__ == '__main__':
        View Generator (-M): generates document representation via MUSE embeddings (Fasttext multilingual word
        embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings.
        """
+        VG_name = 'M'
+        storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
+        exist = exists(storing_path)
         muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif)
         if op.allprob:
-            muse = FeatureSet2Posteriors(muse, requires_tfidf=True, l2=l2)
+            muse = FeatureSet2Posteriors(muse, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path)
         doc_embedder.append(muse)

     if op.gruViewGenerator:
         """
        View Generator (-G): generates document embedding by means of a Gated Recurrent Units. The model can be
-        initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,). Such
-        document embeddings are then casted into vectors of posterior probabilities via a set of SVM.
-        NB: --allprob won't have any effect on this View Gen since output is already encoded as post prob
+        initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,).
+        Output dimension is (n_docs, 512). If --allprob output will be casted to posterior prob space via SVM.
        """
+        VG_name = 'G'
+        VG_name += '_muse' if op.gruMUSE else ''
+        VG_name += '_wce' if op.gruWCE else ''
+        storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
         rnn_embedder = RecurrentEmbedder(pretrained=op.gruMUSE, supervised=op.gruWCE, multilingual_dataset=data,
                                          options=op, model_path=op.gru_path)
+        if op.allprob:
+            rnn_embedder = FeatureSet2Posteriors(rnn_embedder, method_id=VG_name, requires_tfidf=False,
+                                                 storing_path=storing_path)
         doc_embedder.append(rnn_embedder)

     if op.mbert:
         """
        View generator (-B): generates document embedding via mBERT model.
        """
-        mbert = MBertEmbedder(path_to_model=op.bert_path,
-                              nC=data.num_categories())
+        VG_name = 'B'
+        storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
+        mbert = MBertEmbedder(path_to_model=op.bert_path, nC=data.num_categories())
         if op.allprob:
-            mbert = FeatureSet2Posteriors(mbert, l2=l2)
+            mbert = FeatureSet2Posteriors(mbert, method_id=VG_name, l2=l2, storing_path=storing_path)
         doc_embedder.append(mbert)

     # metaclassifier

@@ -5,6 +5,7 @@ from transformers import BertForSequenceClassification, BertTokenizer, AdamW, Be
 from sklearn.model_selection import train_test_split
 from util.evaluation import *
 from time import time
+from util.common import show_gpu


 def predict(logits, classification_type='multilabel'):

@@ -21,7 +22,6 @@ def predict(logits, classification_type='multilabel'):
 class TrainingDataset(Dataset):
     """
     data: dict of lang specific tokenized data
-    labels: dict of lang specific targets
     """

     def __init__(self, data, labels):

@@ -156,7 +156,7 @@ def do_tokenization(l_dataset, max_len=512, verbose=True):
 def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile, log_interval=10):
     # _dataset_path = opt.dataset.split('/')[-1].split('_')
     # dataset_id = _dataset_path[0] + _dataset_path[-1]
-    dataset_id = 'TODO fix this!'
+    dataset_id = 'TODO fix this!'  # TODO

     loss_history = []
     model.train()

@@ -231,12 +231,13 @@ def feature_extractor(data, lang_ids, model):
     Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for
     the output of each layer) of shape (batch_size, sequence_length, hidden_size)
     """
+    show_gpu('Before Training')
     all_batch_embeddings = {}
     id2lang = {v: k for k, v in lang_ids.items()}
     with torch.no_grad():
         for batch, lang_idx in data:
-            # for batch, target, lang_idx in data:
             out = model(batch.cuda())
+            # show_gpu('After Batch Prediction')
             last_hidden_state = out[1][-1]
             batch_embeddings = last_hidden_state[:, 0, :]
             for i, l_idx in enumerate(lang_idx.numpy()):

@@ -245,5 +246,5 @@ def feature_extractor(data, lang_ids, model):
             else:
                 all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]],
                                                                   batch_embeddings[i].detach().cpu().numpy()))
+    show_gpu('After Full Prediction')
     return all_batch_embeddings, id2lang

@@ -4,7 +4,6 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.svm import SVC
 from sklearn.model_selection import train_test_split
 from embeddings.supervised import get_supervised_embeddings
-# from learning.transformers import PosteriorProbabilitiesEmbedder, TfidfVectorizerMultilingual
 import numpy as np
 from tqdm import tqdm
 import torch
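
For orientation, the dump layout produced by these changes should look roughly as follows (illustrative, not verified against a run; <dataset> is dataset_name stripped of its extension, and the G folder name depends on --gruMUSE/--gruWCE):

# ../dumps/X/<dataset>/tr/X.pickle                     posterior-probabilities VG, training split
# ../dumps/X/<dataset>/te/X.pickle                     posterior-probabilities VG, test split
# ../dumps/W/<dataset>/tr/W.pickle                     WCE VG wrapped in FeatureSet2Posteriors
# ../dumps/M/<dataset>/tr/M.pickle                     MUSE VG wrapped in FeatureSet2Posteriors
# ../dumps/G_muse_wce/<dataset>/tr/G_muse_wce.pickle   GRU VG (name reflects its init embeddings)
# ../dumps/B/<dataset>/tr/B.pickle                     mBERT VG wrapped in FeatureSet2Posteriors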