refactoring

parent 515acae15b
commit a5322ba227
@@ -148,7 +148,7 @@ class MonolingualClassifier:
         if isinstance(self.model, GridSearchCV):
             self.best_params_ = self.model.best_params_
             print('best parameters: ', self.best_params_)
-        self.time=time.time()-tinit
+        self.time = time.time()-tinit
         return self

     def decision_function(self, X):
@@ -84,9 +84,9 @@ class PosteriorProbabilitiesEmbedder:
         self.is_training = is_training

     def fit(self, lX, lY, lV=None, called_by_viewgen=False):
-        if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'):
-            print(f'NB: Avoid fitting {self.storing_path.split("/")[2]} since we have already pre-computed results')
-            return self
+        # if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'):
+        #     print(f'NB: Avoid fitting {self.storing_path.split("/")[2]} since we have already pre-computed results')
+        #     return self
         if not called_by_viewgen:
             # Avoid printing if method is called by another View Gen (e.g., GRU ViewGen)
             print('### Posterior Probabilities View Generator (X)')
@@ -96,20 +96,20 @@ class PosteriorProbabilitiesEmbedder:

     def transform(self, lX):
         # if dir exist, load and return already computed results
-        _endpoint = 'tr' if self.is_training else 'te'
-        _actual_path = self.storing_path + '/' + _endpoint
-        if exists(_actual_path):
-            print('NB: loading pre-computed results!')
-            with open(_actual_path + '/X.pickle', 'rb') as infile:
-                self.is_training = False
-                return pickle.load(infile)
+        # _endpoint = 'tr' if self.is_training else 'te'
+        # _actual_path = self.storing_path + '/' + _endpoint
+        # if exists(_actual_path):
+        #     print('NB: loading pre-computed results!')
+        #     with open(_actual_path + '/X.pickle', 'rb') as infile:
+        #         self.is_training = False
+        #         return pickle.load(infile)

         lZ = self.predict_proba(lX)
         lZ = _normalize(lZ, self.l2)
         # create dir and dump computed results
-        create_if_not_exist(_actual_path)
-        with open(_actual_path + '/X.pickle', 'wb') as outfile:
-            pickle.dump(lZ, outfile)
+        # create_if_not_exist(_actual_path)
+        # with open(_actual_path + '/X.pickle', 'wb') as outfile:
+        #     pickle.dump(lZ, outfile)
         self.is_training = False
         return lZ

@@ -154,8 +154,7 @@ class MuseEmbedder:
         MUSE = self.MUSE
         lX = self.featureweight.transform(lX)
         XdotMUSE = Parallel(n_jobs=self.n_jobs)(
-            delayed(XdotM)(lX[lang], MUSE[lang], self.sif) for lang in self.langs
-        )
+            delayed(XdotM)(lX[lang], MUSE[lang], self.sif) for lang in self.langs)
         lMuse = {l: XdotMUSE[i] for i, l in enumerate(self.langs)}
         lMuse = _normalize(lMuse, self.l2)
         return lMuse
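Note on the hunk above: the edit only folds the closing parenthesis of the Parallel(...) call into the generator expression; the joblib fan-out over languages is unchanged. For reference, a minimal self-contained sketch of the same Parallel/delayed idiom (toy function and values, not taken from the repository):

    from joblib import Parallel, delayed

    def _square(x):
        return x * x

    # results come back in input order, one entry per item (per language in the code above)
    results = Parallel(n_jobs=2)(delayed(_square)(x) for x in range(4))
    print(results)  # [0, 1, 4, 9]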
@@ -211,18 +210,22 @@ class WordClassEmbedder:
 class MBertEmbedder:

     def __init__(self, doc_embed_path=None, patience=10, checkpoint_dir='../hug_checkpoint/', path_to_model=None,
-                 nC=None):
+                 nC=None, avoid_loading=False):
         self.doc_embed_path = doc_embed_path
         self.patience = patience
         self.checkpoint_dir = checkpoint_dir
         self.fitted = False
         self.requires_tfidf = False
-        if path_to_model is None and nC is not None:
+        self.avoid_loading = avoid_loading
+        if path_to_model is None:
             self.model = None
         else:
             config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True,
                                                 num_labels=nC)
-            self.model = BertForSequenceClassification.from_pretrained(path_to_model, config=config).cuda()
+            if self.avoid_loading:
+                self.model = None
+            else:
+                self.model = BertForSequenceClassification.from_pretrained(path_to_model, config=config).cuda() # TODO: setting model to None in order to avoid loading it onto gpu if we have already pre-computed results!
             self.fitted = True

     def fit(self, lX, ly, lV=None, seed=0, nepochs=200, lr=1e-5, val_epochs=1):
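Note on the hunk above: with the new avoid_loading flag set, the fine-tuned checkpoint is never moved onto the GPU (self.model stays None), which per the TODO is intended for runs whose document embeddings were already pre-computed. A minimal usage sketch, assuming a loaded MultilingualDataset named data as in the main script and a placeholder checkpoint path:

    # avoid_loading=True skips BertForSequenceClassification.from_pretrained(...).cuda()
    mbert = MBertEmbedder(path_to_model='../hug_checkpoint/mbert_finetuned',  # placeholder path
                          nC=data.num_categories(),
                          avoid_loading=True)
    # the embedder is still flagged as fitted, so the pipeline will not try to re-train it
    assert mbert.fitted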
@@ -235,7 +238,7 @@ class MBertEmbedder:
         l_tokenized_tr = do_tokenization(lX, max_len=512)
         l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = get_tr_val_split(l_tokenized_tr, ly,
                                                                                          val_prop=0.2, max_val=2000,
-                                                                                         seed=seed) # TODO: seed
+                                                                                         seed=seed)

         tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target)
         va_dataset = TrainingDataset(l_split_va, l_split_val_target)
@@ -289,7 +292,7 @@ class MBertEmbedder:
         l_tokenized_X = do_tokenization(lX, max_len=512, verbose=True)
         feat_dataset = ExtractorDataset(l_tokenized_X)
         feat_lang_ids = feat_dataset.lang_ids
-        dataloader = DataLoader(feat_dataset, batch_size=64) # TODO reduced batch size in JRC experiments
+        dataloader = DataLoader(feat_dataset, batch_size=64)
         all_batch_embeddings, id2lang = feature_extractor(dataloader, feat_lang_ids, self.model)
         return all_batch_embeddings

@@ -301,7 +304,7 @@ class RecurrentEmbedder:

     def __init__(self, pretrained, supervised, multilingual_dataset, options, concat=False, lr=1e-3,
                  we_path='../embeddings', hidden_size=512, sup_drop=0.5, posteriors=False, patience=10,
-                 test_each=0, checkpoint_dir='../checkpoint', model_path=None):
+                 test_each=0, checkpoint_dir='../checkpoint', model_path=None, n_jobs=-1):
         self.pretrained = pretrained
         self.supervised = supervised
         self.concat = concat
@@ -319,6 +322,7 @@ class RecurrentEmbedder:
         self.options = options
         self.seed = options.seed
         self.model_path = model_path
+        self.n_jobs = n_jobs
         self.is_trained = False

         ## INIT MODEL for training
@@ -398,32 +402,33 @@ class RecurrentEmbedder:

     def transform(self, lX, batch_size=64):
         lX = self.multilingual_index.get_indexed(lX, self.lpretrained_vocabulary)
-        lX = self._get_doc_embeddings(lX)
+        lX = self._get_doc_embeddings(lX, batch_size)
         return lX

     def fit_transform(self, lX, ly, lV=None):
         return self.fit(lX, ly).transform(lX)

-    def _get_doc_embeddings(self, lX, batch_size=64):
+    def _get_doc_embeddings(self, lX, batch_size):
         assert self.is_trained, 'Model is not trained, cannot call transform before fitting the model!'
         print('Generating document embeddings via GRU')
         _lX = {}

         l_devel_target = self.multilingual_index.l_devel_target()

+        # show_gpu('RNN init at extraction')
         for idx, (batch, post, target, lang) in enumerate(batchify(lX, None, l_devel_target,
                                                                    batch_size, self.multilingual_index.l_pad())):
             if lang not in _lX.keys():
                 _lX[lang] = self.model.get_embeddings(batch, lang)
             else:
                 _lX[lang] = np.concatenate((_lX[lang], self.model.get_embeddings(batch, lang)), axis=0)
+        # show_gpu('RNN after batch pred at extraction')
         return _lX

     # loads the MUSE embeddings if requested, or returns empty dictionaries otherwise
     def _load_pretrained_embeddings(self, we_path, langs):
         lpretrained = lpretrained_vocabulary = self._none_dict(langs)
-        lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1)
+        lpretrained = load_muse_embeddings(we_path, langs, n_jobs=self.n_jobs)
         lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs}
         return lpretrained, lpretrained_vocabulary

@@ -553,20 +558,20 @@ class FeatureSet2Posteriors:

     def transform(self, lX):
         # if dir exist, load and return already computed results
-        _endpoint = 'tr' if self.is_training else 'te'
-        _actual_path = self.storing_path + '/' + _endpoint
-        if exists(_actual_path):
-            print('NB: loading pre-computed results!')
-            with open(_actual_path + '/' + self.method_id + '.pickle', 'rb') as infile:
-                self.is_training = False
-                return pickle.load(infile)
+        # _endpoint = 'tr' if self.is_training else 'te'
+        # _actual_path = self.storing_path + '/' + _endpoint
+        # if exists(_actual_path):
+        #     print('NB: loading pre-computed results!')
+        #     with open(_actual_path + '/' + self.method_id + '.pickle', 'rb') as infile:
+        #         self.is_training = False
+        #         return pickle.load(infile)

         lP = self.predict_proba(lX)
         lP = _normalize(lP, self.l2)
         # create dir and dump computed results
-        create_if_not_exist(_actual_path)
-        with open(_actual_path + '/' + self.method_id + '.pickle', 'wb') as outfile:
-            pickle.dump(lP, outfile)
+        # create_if_not_exist(_actual_path)
+        # with open(_actual_path + '/' + self.method_id + '.pickle', 'wb') as outfile:
+        #     pickle.dump(lP, outfile)
         self.is_training = False
         return lP

@@ -637,8 +642,14 @@ class Funnelling:
         self.meta = meta
         self.n_jobs = meta.n_jobs

-    def fit(self, lX, ly):
-        tfidf_lX = self.vectorizer.fit_transform(lX, ly)
+    def fit(self, lX, ly, target_lang=None):
+        if target_lang is not None:
+            LX = lX.copy()
+            LX.update(target_lang)
+            self.vectorizer.fit(LX)
+            tfidf_lX = self.vectorizer.transform(lX)
+        else:
+            tfidf_lX = self.vectorizer.fit_transform(lX, ly)
         lV = self.vectorizer.vocabulary()
         print('## Fitting first-tier learners!')
         lZ = self.first_tier.fit_transform(lX, ly, lV, tfidf=tfidf_lX)
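Note on the hunk above: when target_lang is passed, the multilingual TF-IDF vectorizer is fitted on the union of the source-language training documents and the target-language documents, while the first tier and the meta-classifier are still trained on the source data only; this supports the zero-shot cross-lingual (ZSCL) runs added elsewhere in this commit. A hedged usage sketch with illustrative language keys (lXtr, lytr, lXte as returned by data.training() and data.test() in the main script):

    # let the vectorizer see the Italian vocabulary, but train only on the source languages
    target_docs = {'it': lXte['it']}                     # raw documents of the held-out language
    classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)
    classifier.fit(lXtr, lytr, target_lang=target_docs)  # vectorizer.fit(source + target); tiers fit on source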
@@ -774,6 +785,7 @@ def train_gru(model, batcher, ltrain_index, lytr, tinit, logfile, criterion, opt
     _dataset_path = opt.dataset.split('/')[-1].split('_')
     dataset_id = _dataset_path[0] + _dataset_path[-1]

+    # show_gpu('RNN init pre-training')
     loss_history = []
     model.train()
     for idx, (batch, post, bert_emb, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, ltrain_bert, lytr)):
@@ -783,6 +795,7 @@ def train_gru(model, batcher, ltrain_index, lytr, tinit, logfile, criterion, opt
         clip_gradient(model)
         optim.step()
         loss_history.append(loss.item())
+        # show_gpu('RNN after batch prediction')

         if idx % log_interval == 0:
             interval_loss = np.mean(loss_history[-log_interval:])
@@ -810,7 +823,7 @@ def test_gru(model, batcher, ltest_index, ltest_posteriors, lte_bert, lyte, tini
             yte_stacked[lang].append(target.detach().cpu().numpy())
             loss_history.append(loss)

     ly = {l:np.vstack(yte_stacked[l]) for l in langs}
     ly_ = {l:np.vstack(predictions[l]) for l in langs}
     l_eval = evaluate(ly, ly_)
     metrics = []
@@ -13,30 +13,31 @@ if __name__ == '__main__':
     assert exists(dataset), 'Unable to find file '+str(dataset)
     assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
     assert op.posteriors or op.supervised or op.pretrained or op.mbert or op.gruViewGenerator, \
         'empty set of document embeddings is not allowed'
     if op.gruViewGenerator:
         assert op.gruWCE or op.gruMUSE, 'Initializing Gated Recurrent embedding layer without ' \
                                         'explicit initialization of GRU View Generator'

     l2 = op.l2
     dataset_file = os.path.basename(dataset)
     results = PolylingualClassificationResults('../log/' + op.output)
     allprob = 'Prob' if op.allprob else ''

-    # renaming arguments to be printed on log
     method_name, dataset_name = get_method_name(dataset, op.posteriors, op.supervised, op.pretrained, op.mbert,
                                                 op.gruViewGenerator, op.gruMUSE, op.gruWCE, op.agg, op.allprob)

     print(f'Method: gFun{method_name}\nDataset: {dataset_name}')
     print('-'*50)

-    # set zscore range - is slice(0, 0) mean will be equal to 0 and std to 1, thus normalization will have no effect
+    n_jobs = -1 # TODO SETTING n_JOBS

     standardize_range = slice(0, 0)
     if op.zscore:
         standardize_range = None

     # load dataset
     data = MultilingualDataset.load(dataset)
-    data.set_view(languages=['nl', 'it']) # TODO: DEBUG SETTING
+    # data.set_view(languages=['it']) # TODO: DEBUG SETTING
     data.show_dimensions()
     lXtr, lytr = data.training()
     lXte, lyte = data.test()
@@ -63,7 +64,7 @@ if __name__ == '__main__':
         doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True,
                                                                                           kernel='linear',
                                                                                           C=op.set_c),
-                                                           l2=l2, storing_path=storing_path))
+                                                           l2=l2, storing_path=storing_path, n_jobs=n_jobs))

     if op.supervised:
         """
@@ -73,9 +74,11 @@ if __name__ == '__main__':
         VG_name = 'W'
         storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
         exist = exists(storing_path)
-        wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, sif=op.sif)
+        wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting,
+                                sif=op.sif, n_jobs=n_jobs)
         if op.allprob:
-            wce = FeatureSet2Posteriors(wce, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path)
+            wce = FeatureSet2Posteriors(wce, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path,
+                                        n_jobs=n_jobs)
         doc_embedder.append(wce)

     if op.pretrained:
@@ -86,9 +89,10 @@ if __name__ == '__main__':
         VG_name = 'M'
         storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
         exist = exists(storing_path)
-        muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif)
+        muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif, n_jobs=n_jobs)
         if op.allprob:
-            muse = FeatureSet2Posteriors(muse, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path)
+            muse = FeatureSet2Posteriors(muse, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path,
+                                         n_jobs=n_jobs)
         doc_embedder.append(muse)

     if op.gruViewGenerator:
@@ -100,12 +104,12 @@ if __name__ == '__main__':
         VG_name = 'G'
         VG_name += '_muse' if op.gruMUSE else ''
         VG_name += '_wce' if op.gruWCE else ''
-        storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
+        storing_path = 'Nope' # f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
         rnn_embedder = RecurrentEmbedder(pretrained=op.gruMUSE, supervised=op.gruWCE, multilingual_dataset=data,
-                                         options=op, model_path=op.gru_path)
+                                         options=op, model_path=None, n_jobs=n_jobs)
         if op.allprob:
             rnn_embedder = FeatureSet2Posteriors(rnn_embedder, method_id=VG_name, requires_tfidf=False,
-                                                 storing_path=storing_path)
+                                                 storing_path=storing_path, n_jobs=n_jobs)
         doc_embedder.append(rnn_embedder)

     if op.mbert:
@@ -114,8 +118,9 @@ if __name__ == '__main__':
         """
         VG_name = 'B'
         storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
+        avoid_loading = False if op.avoid_loading else True # TODO research setting (set to false mBert will be loaded into gpu to get doc emebds (aka, only the first time for each run))

-        mbert = MBertEmbedder(path_to_model=op.bert_path, nC=data.num_categories())
+        mbert = MBertEmbedder(path_to_model=op.bert_path, nC=data.num_categories(), avoid_loading=avoid_loading)
         if op.allprob:
             mbert = FeatureSet2Posteriors(mbert, method_id=VG_name, l2=l2, storing_path=storing_path)
         doc_embedder.append(mbert)
|
||||||
# metaclassifier
|
# metaclassifier
|
||||||
meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
|
meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
|
||||||
meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf', C=op.set_c),
|
meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf', C=op.set_c),
|
||||||
meta_parameters=get_params(op.optimc), standardize_range=standardize_range)
|
meta_parameters=get_params(op.optimc), standardize_range=standardize_range, n_jobs=n_jobs)
|
||||||
|
|
||||||
# ensembling the modules
|
# ensembling the modules
|
||||||
classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)
|
classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)
|
||||||
|
|
|
@@ -20,9 +20,6 @@ def predict(logits, classification_type='multilabel'):


 class TrainingDataset(Dataset):
-    """
-    data: dict of lang specific tokenized data
-    """

     def __init__(self, data, labels):
         self.langs = data.keys()
@@ -231,7 +228,7 @@ def feature_extractor(data, lang_ids, model):
     Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for
     the output of each layer) of shape (batch_size, sequence_length, hidden_size)
     """
-    show_gpu('Before Training')
+    # show_gpu('Before Training')
     all_batch_embeddings = {}
     id2lang = {v: k for k, v in lang_ids.items()}
     with torch.no_grad():
@@ -246,5 +243,5 @@ def feature_extractor(data, lang_ids, model):
             else:
                 all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]],
                                                                   batch_embeddings[i].detach().cpu().numpy()))
-    show_gpu('After Full Prediction')
+    # show_gpu('After Full Prediction')
     return all_batch_embeddings, id2lang
@@ -74,7 +74,7 @@ class Index:
         self.test_raw = test_raw

     def index(self, pretrained_vocabulary, analyzer, vocabulary):
-        self.word2index = dict(vocabulary)
+        self.word2index = dict(vocabulary) # word2idx
         known_words = set(self.word2index.keys())
         if pretrained_vocabulary is not None:
             known_words.update(pretrained_vocabulary)
@@ -207,44 +207,6 @@ class MultilingualIndex:
             index.compose_embedding_matrix(lpretrained[l], supervised, lXtr[l], lYtr[l])
             self.sup_range = index.wce_range

-    # TODO circular import with transformers --> when generating posterior prob, we import PosteriorProbabilitiesEmbedder which is defined in transformers
-    # def posterior_probabilities(self, max_training_docs_by_lang=5000, store_posteriors=False, stored_post=False):
-    #     # choose a maximum of "max_training_docs_by_lang" for training the calibrated SVMs
-    #     timeit = time.time()
-    #     lXtr = {l:Xtr for l,Xtr in self.get_lXtr().items()}
-    #     lYtr = {l:Ytr for l,Ytr in self.l_train_target().items()}
-    #     if not stored_post:
-    #         for l in self.langs:
-    #             n_elements = lXtr[l].shape[0]
-    #             if n_elements > max_training_docs_by_lang:
-    #                 choice = np.random.permutation(n_elements)[:max_training_docs_by_lang]
-    #                 lXtr[l] = lXtr[l][choice]
-    #                 lYtr[l] = lYtr[l][choice]
-    #
-    #         # train the posterior probabilities embedder
-    #         print('[posteriors] training a calibrated SVM')
-    #         learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto')
-    #         prob_embedder = PosteriorProbabilitiesEmbedder(learner, l2=False)
-    #         prob_embedder.fit(lXtr, lYtr)
-    #
-    #         # transforms the training, validation, and test sets into posterior probabilities
-    #         print('[posteriors] generating posterior probabilities')
-    #         lPtr = prob_embedder.transform(self.get_lXtr())
-    #         lPva = prob_embedder.transform(self.get_lXva())
-    #         lPte = prob_embedder.transform(self.get_lXte())
-    #         # NB: Check splits indices !
-    #         if store_posteriors:
-    #             import pickle
-    #             with open('../dumps/posteriors_fulljrc.pkl', 'wb') as outfile:
-    #                 pickle.dump([lPtr, lPva, lPte], outfile)
-    #                 print(f'Successfully dumped posteriors!')
-    #     else:
-    #         import pickle
-    #         with open('../dumps/posteriors_fulljrc.pkl', 'rb') as infile:
-    #             lPtr, lPva, lPte = pickle.load(infile)
-    #             print(f'Successfully loaded stored posteriors!')
-    #     print(f'[posteriors] done in {time.time() - timeit}')
-    #     return lPtr, lPva, lPte

     def bert_embeddings(self, bert_path, max_len=512, batch_size=64, stored_embeddings=False):
         show_gpu('GPU memory before initializing mBert model:')
@@ -518,10 +480,12 @@ class TfidfVectorizerMultilingual:
     def fit(self, lX, ly=None):
         self.langs = sorted(lX.keys())
         self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs}
+        # self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in lX.keys()}
        return self

     def transform(self, lX):
         return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs}
+        # return {l: self.vectorizer[l].transform(lX[l]) for l in lX.keys()}

     def fit_transform(self, lX, ly=None):
         return self.fit(lX, ly).transform(lX)
|
||||||
dataset_id = _dataset_path[0] + _dataset_path[-1]
|
dataset_id = _dataset_path[0] + _dataset_path[-1]
|
||||||
return _id, dataset_id
|
return _id, dataset_id
|
||||||
|
|
||||||
|
|
||||||
|
def get_zscl_setting(langs):
|
||||||
|
settings = []
|
||||||
|
for elem in langs:
|
||||||
|
for tar in langs:
|
||||||
|
if elem != tar:
|
||||||
|
settings.append((elem, tar))
|
||||||
|
return settings
|
|
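The new get_zscl_setting helper enumerates every ordered (source, target) pair of distinct languages, one pair per zero-shot cross-lingual run. For example:

    print(get_zscl_setting(['en', 'it', 'de']))
    # [('en', 'it'), ('en', 'de'), ('it', 'en'), ('it', 'de'), ('de', 'en'), ('de', 'it')]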
@@ -1,5 +1,4 @@
 import numpy as np
-import numpy as np
 from scipy.sparse import lil_matrix, issparse
 from sklearn.metrics import f1_score, accuracy_score

@@ -60,6 +60,9 @@ parser.add_option("-a", "--agg", dest="agg", action='store_true',
                   help="Set aggregation function of the common Z-space to average (Default: concatenation)",
                   default=True)

+parser.add_option("-l", dest="avoid_loading", action="store_true",
+                  help="TODO", default=False)
+
 # ------------------------------------------------------------------------------------

 parser.add_option('--hidden', type=int, default=512, metavar='int',
@@ -47,3 +47,46 @@ class PolylingualClassificationResults:

     def tell(self, msg):
         if self.verbose: print(msg)
+
+
+class ZSCLResults:
+    def __init__(self, file, autoflush=True, verbose=False):
+        self.file = file
+        self.columns = ['method',
+                        'optimp',
+                        'source',
+                        'target',
+                        'id',
+                        'dataset',
+                        'time',
+                        'lang',
+                        'macrof1',
+                        'microf1',
+                        'macrok',
+                        'microk',
+                        'notes']
+        self.autoflush = autoflush
+        self.verbose = verbose
+        if os.path.exists(file):
+            self.tell('Loading existing file from {}'.format(file))
+            self.df = pd.read_csv(file, sep='\t')
+        else:
+            self.tell('File {} does not exist. Creating new frame.'.format(file))
+            dir = os.path.dirname(self.file)
+            if dir and not os.path.exists(dir): os.makedirs(dir)
+            self.df = pd.DataFrame(columns=self.columns)
+
+    def already_calculated(self, id):
+        return (self.df['id'] == id).any()
+
+    def add_row(self, method, optimp, id, source, target, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
+        s = pd.Series([method, optimp, id, source, target, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
+        self.df = self.df.append(s, ignore_index=True)
+        if self.autoflush: self.flush()
+        self.tell(s.to_string())
+
+    def flush(self):
+        self.df.to_csv(self.file, index=False, sep='\t')
+
+    def tell(self, msg):
+        if self.verbose: print(msg)
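The new ZSCLResults logger follows the same pattern as PolylingualClassificationResults (a pandas frame flushed to a tab-separated file) and adds explicit source and target columns, matching the pairs produced by get_zscl_setting. A minimal usage sketch; the file name and metric values are placeholders, and the os/np/pd imports of the surrounding module are assumed:

    results = ZSCLResults('../log/zscl_results.csv', autoflush=True, verbose=True)
    run_id = 'gFun-X_en-it'
    if not results.already_calculated(run_id):
        results.add_row(method='gFun-X', optimp=False, id=run_id, source='en', target='it',
                        dataset='rcv1-2', time=123.4, lang='it', macrof1=0.55, microf1=0.60)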
@@ -1,5 +1,6 @@
 import numpy as np

+
 class StandardizeTransformer:

     def __init__(self, axis=0, range=None):