refactoring

parent 515acae15b
commit a5322ba227

@@ -148,7 +148,7 @@ class MonolingualClassifier:
        if isinstance(self.model, GridSearchCV):
            self.best_params_ = self.model.best_params_
            print('best parameters: ', self.best_params_)
        self.time=time.time()-tinit
        self.time = time.time()-tinit
        return self

    def decision_function(self, X):

@@ -84,9 +84,9 @@ class PosteriorProbabilitiesEmbedder:
        self.is_training = is_training

    def fit(self, lX, lY, lV=None, called_by_viewgen=False):
        if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'):
            print(f'NB: Avoid fitting {self.storing_path.split("/")[2]} since we have already pre-computed results')
            return self
        # if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'):
        #     print(f'NB: Avoid fitting {self.storing_path.split("/")[2]} since we have already pre-computed results')
        #     return self
        if not called_by_viewgen:
            # Avoid printing if method is called by another View Gen (e.g., GRU ViewGen)
            print('### Posterior Probabilities View Generator (X)')

@@ -96,20 +96,20 @@ class PosteriorProbabilitiesEmbedder:

    def transform(self, lX):
        # if dir exist, load and return already computed results
        _endpoint = 'tr' if self.is_training else 'te'
        _actual_path = self.storing_path + '/' + _endpoint
        if exists(_actual_path):
            print('NB: loading pre-computed results!')
            with open(_actual_path + '/X.pickle', 'rb') as infile:
                self.is_training = False
                return pickle.load(infile)
        # _endpoint = 'tr' if self.is_training else 'te'
        # _actual_path = self.storing_path + '/' + _endpoint
        # if exists(_actual_path):
        #     print('NB: loading pre-computed results!')
        #     with open(_actual_path + '/X.pickle', 'rb') as infile:
        #         self.is_training = False
        #         return pickle.load(infile)

        lZ = self.predict_proba(lX)
        lZ = _normalize(lZ, self.l2)
        # create dir and dump computed results
        create_if_not_exist(_actual_path)
        with open(_actual_path + '/X.pickle', 'wb') as outfile:
            pickle.dump(lZ, outfile)
        # create_if_not_exist(_actual_path)
        # with open(_actual_path + '/X.pickle', 'wb') as outfile:
        #     pickle.dump(lZ, outfile)
        self.is_training = False
        return lZ

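The caching used in transform above is a load-or-compute-and-dump pattern. A minimal sketch of that pattern, with a hypothetical helper name and the standard library in place of the repo's exists/create_if_not_exist helpers:

import os
import pickle

def load_or_compute(storing_path, endpoint, compute_fn):
    """Return cached results from <storing_path>/<endpoint>/X.pickle if present;
    otherwise compute them, dump them to disk, and return them."""
    actual_path = os.path.join(storing_path, endpoint)
    cache_file = os.path.join(actual_path, 'X.pickle')
    if os.path.exists(cache_file):
        with open(cache_file, 'rb') as infile:
            return pickle.load(infile)
    result = compute_fn()
    os.makedirs(actual_path, exist_ok=True)
    with open(cache_file, 'wb') as outfile:
        pickle.dump(result, outfile)
    return result
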
@@ -154,8 +154,7 @@ class MuseEmbedder:
        MUSE = self.MUSE
        lX = self.featureweight.transform(lX)
        XdotMUSE = Parallel(n_jobs=self.n_jobs)(
            delayed(XdotM)(lX[lang], MUSE[lang], self.sif) for lang in self.langs
        )
            delayed(XdotM)(lX[lang], MUSE[lang], self.sif) for lang in self.langs)
        lMuse = {l: XdotMUSE[i] for i, l in enumerate(self.langs)}
        lMuse = _normalize(lMuse, self.l2)
        return lMuse

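The Parallel call collapsed above follows joblib's standard pattern: wrap the worker function with delayed and let n_jobs pick the number of workers. A runnable sketch with a toy projection function standing in for XdotM:

from joblib import Parallel, delayed
import numpy as np

def project(X, W):
    # toy stand-in for XdotM: project a document matrix onto an embedding space
    return X @ W

langs = ['en', 'it', 'nl']
lX = {l: np.random.rand(4, 5) for l in langs}  # 4 docs x 5 features per language
lW = {l: np.random.rand(5, 3) for l in langs}  # 5 features x 3 embedding dims
XdotW = Parallel(n_jobs=-1)(delayed(project)(lX[l], lW[l]) for l in langs)
lZ = {l: XdotW[i] for i, l in enumerate(langs)}  # results come back in language order
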
@@ -211,18 +210,22 @@ class WordClassEmbedder:
class MBertEmbedder:

    def __init__(self, doc_embed_path=None, patience=10, checkpoint_dir='../hug_checkpoint/', path_to_model=None,
                 nC=None):
                 nC=None, avoid_loading=False):
        self.doc_embed_path = doc_embed_path
        self.patience = patience
        self.checkpoint_dir = checkpoint_dir
        self.fitted = False
        self.requires_tfidf = False
        if path_to_model is None and nC is not None:
        self.avoid_loading = avoid_loading
        if path_to_model is None:
            self.model = None
        else:
            config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True,
                                                num_labels=nC)
            self.model = BertForSequenceClassification.from_pretrained(path_to_model, config=config).cuda()
            if self.avoid_loading:
                self.model = None
            else:
                self.model = BertForSequenceClassification.from_pretrained(path_to_model, config=config).cuda()  # TODO: setting model to None in order to avoid loading it onto gpu if we have already pre-computed results!
            self.fitted = True

    def fit(self, lX, ly, lV=None, seed=0, nepochs=200, lr=1e-5, val_epochs=1):

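The new avoid_loading flag keeps the fine-tuned mBERT off the GPU when its document embeddings were already dumped by a previous run. A hedged sketch of that decision, with a hypothetical helper name and the same transformers calls as above:

import torch
from transformers import BertConfig, BertForSequenceClassification

def maybe_load_mbert(path_to_model, n_classes, avoid_loading=False):
    """Return a fine-tuned mBERT classifier, or None when loading can be skipped."""
    if path_to_model is None or avoid_loading:
        # downstream code is expected to fall back to cached document embeddings
        return None
    config = BertConfig.from_pretrained('bert-base-multilingual-cased',
                                        output_hidden_states=True, num_labels=n_classes)
    model = BertForSequenceClassification.from_pretrained(path_to_model, config=config)
    return model.cuda() if torch.cuda.is_available() else model
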
@@ -235,7 +238,7 @@ class MBertEmbedder:
        l_tokenized_tr = do_tokenization(lX, max_len=512)
        l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = get_tr_val_split(l_tokenized_tr, ly,
                                                                                         val_prop=0.2, max_val=2000,
                                                                                         seed=seed)  # TODO: seed
                                                                                         seed=seed)

        tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target)
        va_dataset = TrainingDataset(l_split_va, l_split_val_target)

@@ -289,7 +292,7 @@ class MBertEmbedder:
        l_tokenized_X = do_tokenization(lX, max_len=512, verbose=True)
        feat_dataset = ExtractorDataset(l_tokenized_X)
        feat_lang_ids = feat_dataset.lang_ids
        dataloader = DataLoader(feat_dataset, batch_size=64)  # TODO reduced batch size in JRC experiments
        dataloader = DataLoader(feat_dataset, batch_size=64)
        all_batch_embeddings, id2lang = feature_extractor(dataloader, feat_lang_ids, self.model)
        return all_batch_embeddings

@@ -301,7 +304,7 @@ class RecurrentEmbedder:

    def __init__(self, pretrained, supervised, multilingual_dataset, options, concat=False, lr=1e-3,
                 we_path='../embeddings', hidden_size=512, sup_drop=0.5, posteriors=False, patience=10,
                 test_each=0, checkpoint_dir='../checkpoint', model_path=None):
                 test_each=0, checkpoint_dir='../checkpoint', model_path=None, n_jobs=-1):
        self.pretrained = pretrained
        self.supervised = supervised
        self.concat = concat

@@ -319,6 +322,7 @@ class RecurrentEmbedder:
        self.options = options
        self.seed = options.seed
        self.model_path = model_path
        self.n_jobs = n_jobs
        self.is_trained = False

        ## INIT MODEL for training

@@ -398,32 +402,33 @@ class RecurrentEmbedder:

    def transform(self, lX, batch_size=64):
        lX = self.multilingual_index.get_indexed(lX, self.lpretrained_vocabulary)
        lX = self._get_doc_embeddings(lX)
        lX = self._get_doc_embeddings(lX, batch_size)
        return lX

    def fit_transform(self, lX, ly, lV=None):
        return self.fit(lX, ly).transform(lX)

    def _get_doc_embeddings(self, lX, batch_size=64):
    def _get_doc_embeddings(self, lX, batch_size):
        assert self.is_trained, 'Model is not trained, cannot call transform before fitting the model!'
        print('Generating document embeddings via GRU')
        _lX = {}

        l_devel_target = self.multilingual_index.l_devel_target()

        # show_gpu('RNN init at extraction')
        for idx, (batch, post, target, lang) in enumerate(batchify(lX, None, l_devel_target,
                                                                   batch_size, self.multilingual_index.l_pad())):
            if lang not in _lX.keys():
                _lX[lang] = self.model.get_embeddings(batch, lang)
            else:
                _lX[lang] = np.concatenate((_lX[lang], self.model.get_embeddings(batch, lang)), axis=0)

        # show_gpu('RNN after batch pred at extraction')
        return _lX

    # loads the MUSE embeddings if requested, or returns empty dictionaries otherwise
    def _load_pretrained_embeddings(self, we_path, langs):
        lpretrained = lpretrained_vocabulary = self._none_dict(langs)
        lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1)
        lpretrained = load_muse_embeddings(we_path, langs, n_jobs=self.n_jobs)
        lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs}
        return lpretrained, lpretrained_vocabulary

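_get_doc_embeddings grows one matrix per language by concatenating batch outputs. The accumulation pattern in isolation, as a small sketch with fake batches:

import numpy as np

def accumulate_by_lang(batches):
    """batches: iterable of (embeddings, lang) pairs, each array of shape (batch, dim)."""
    out = {}
    for emb, lang in batches:
        if lang not in out:
            out[lang] = emb
        else:
            out[lang] = np.concatenate((out[lang], emb), axis=0)
    return out

fake = [(np.ones((2, 4)), 'en'), (np.zeros((3, 4)), 'it'), (np.ones((1, 4)), 'en')]
lX = accumulate_by_lang(fake)
print(lX['en'].shape, lX['it'].shape)  # (3, 4) (3, 4)
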
@@ -553,20 +558,20 @@ class FeatureSet2Posteriors:

    def transform(self, lX):
        # if dir exist, load and return already computed results
        _endpoint = 'tr' if self.is_training else 'te'
        _actual_path = self.storing_path + '/' + _endpoint
        if exists(_actual_path):
            print('NB: loading pre-computed results!')
            with open(_actual_path + '/' + self.method_id + '.pickle', 'rb') as infile:
                self.is_training = False
                return pickle.load(infile)
        # _endpoint = 'tr' if self.is_training else 'te'
        # _actual_path = self.storing_path + '/' + _endpoint
        # if exists(_actual_path):
        #     print('NB: loading pre-computed results!')
        #     with open(_actual_path + '/' + self.method_id + '.pickle', 'rb') as infile:
        #         self.is_training = False
        #         return pickle.load(infile)

        lP = self.predict_proba(lX)
        lP = _normalize(lP, self.l2)
        # create dir and dump computed results
        create_if_not_exist(_actual_path)
        with open(_actual_path + '/' + self.method_id + '.pickle', 'wb') as outfile:
            pickle.dump(lP, outfile)
        # create_if_not_exist(_actual_path)
        # with open(_actual_path + '/' + self.method_id + '.pickle', 'wb') as outfile:
        #     pickle.dump(lP, outfile)
        self.is_training = False
        return lP

@@ -637,8 +642,14 @@ class Funnelling:
        self.meta = meta
        self.n_jobs = meta.n_jobs

    def fit(self, lX, ly):
        tfidf_lX = self.vectorizer.fit_transform(lX, ly)
    def fit(self, lX, ly, target_lang=None):
        if target_lang is not None:
            LX = lX.copy()
            LX.update(target_lang)
            self.vectorizer.fit(LX)
            tfidf_lX = self.vectorizer.transform(lX)
        else:
            tfidf_lX = self.vectorizer.fit_transform(lX, ly)
        lV = self.vectorizer.vocabulary()
        print('## Fitting first-tier learners!')
        lZ = self.first_tier.fit_transform(lX, ly, lV, tfidf=tfidf_lX)

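The new target_lang branch fits the TF-IDF vectorizer on source plus target-language documents, while the transformed training matrices still cover the source languages only, which is what a zero-shot cross-lingual run needs. A hedged sketch of that merge step, using plain dicts of raw documents and a hypothetical function name:

def fit_vectorizer_with_target(vectorizer, lX, target_lang=None):
    """lX: dict lang -> raw training docs; target_lang: optional dict for the zero-shot target."""
    if target_lang is not None:
        merged = lX.copy()          # shallow copy, so the caller's dict is untouched
        merged.update(target_lang)  # add the target-language documents to the fit
        vectorizer.fit(merged)
        return vectorizer.transform(lX)  # training matrices: source languages only
    return vectorizer.fit_transform(lX)
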
@@ -774,6 +785,7 @@ def train_gru(model, batcher, ltrain_index, lytr, tinit, logfile, criterion, opt
    _dataset_path = opt.dataset.split('/')[-1].split('_')
    dataset_id = _dataset_path[0] + _dataset_path[-1]

    # show_gpu('RNN init pre-training')
    loss_history = []
    model.train()
    for idx, (batch, post, bert_emb, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, ltrain_bert, lytr)):

@@ -783,6 +795,7 @@ def train_gru(model, batcher, ltrain_index, lytr, tinit, logfile, criterion, opt
        clip_gradient(model)
        optim.step()
        loss_history.append(loss.item())
        # show_gpu('RNN after batch prediction')

        if idx % log_interval == 0:
            interval_loss = np.mean(loss_history[-log_interval:])

@@ -810,7 +823,7 @@ def test_gru(model, batcher, ltest_index, ltest_posteriors, lte_bert, lyte, tini
        yte_stacked[lang].append(target.detach().cpu().numpy())
        loss_history.append(loss)

    ly = {l:np.vstack(yte_stacked[l]) for l in langs}
    ly = {l:np.vstack(yte_stacked[l]) for l in langs}
    ly_ = {l:np.vstack(predictions[l]) for l in langs}
    l_eval = evaluate(ly, ly_)
    metrics = []

@@ -13,30 +13,31 @@ if __name__ == '__main__':
    assert exists(dataset), 'Unable to find file '+str(dataset)
    assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
    assert op.posteriors or op.supervised or op.pretrained or op.mbert or op.gruViewGenerator, \
        'empty set of document embeddings is not allowed'
        'empty set of document embeddings is not allowed'
    if op.gruViewGenerator:
        assert op.gruWCE or op.gruMUSE, 'Initializing Gated Recurrent embedding layer without ' \
            'explicit initialization of GRU View Generator'
            'explicit initialization of GRU View Generator'

    l2 = op.l2
    dataset_file = os.path.basename(dataset)
    results = PolylingualClassificationResults('../log/' + op.output)
    allprob = 'Prob' if op.allprob else ''

    # renaming arguments to be printed on log
    method_name, dataset_name = get_method_name(dataset, op.posteriors, op.supervised, op.pretrained, op.mbert,
                                                op.gruViewGenerator, op.gruMUSE, op.gruWCE, op.agg, op.allprob)

    print(f'Method: gFun{method_name}\nDataset: {dataset_name}')
    print('-'*50)

    # set zscore range - is slice(0, 0) mean will be equal to 0 and std to 1, thus normalization will have no effect
    n_jobs = -1  # TODO SETTING n_JOBS

    standardize_range = slice(0, 0)
    if op.zscore:
        standardize_range = None

    # load dataset
    data = MultilingualDataset.load(dataset)
    data.set_view(languages=['nl', 'it'])  # TODO: DEBUG SETTING
    # data.set_view(languages=['it'])  # TODO: DEBUG SETTING
    data.show_dimensions()
    lXtr, lytr = data.training()
    lXte, lyte = data.test()

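The slice(0, 0) comment above refers to the meta-classifier's standardization step: statistics computed on an empty row slice leave mean 0 and std 1, so z-scoring becomes a no-op, while None uses every row. A small sketch of that assumed behaviour (not the repo's StandardizeTransformer itself):

import numpy as np

def standardize(X, range_=None):
    """Z-score X column-wise; statistics are estimated only on the rows selected by range_."""
    rows = X if range_ is None else X[range_]
    mean = rows.mean(axis=0) if rows.size else 0.0  # empty slice -> mean 0
    std = rows.std(axis=0) if rows.size else 1.0    # empty slice -> std 1
    return (X - mean) / std

X = np.array([[1., 2.], [3., 4.]])
print(np.allclose(standardize(X, slice(0, 0)), X))  # True: normalization has no effect
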
@@ -63,7 +64,7 @@ if __name__ == '__main__':
        doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True,
                                                                                          kernel='linear',
                                                                                          C=op.set_c),
                                                           l2=l2, storing_path=storing_path))
                                                           l2=l2, storing_path=storing_path, n_jobs=n_jobs))

    if op.supervised:
        """

@@ -73,9 +74,11 @@ if __name__ == '__main__':
        VG_name = 'W'
        storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
        exist = exists(storing_path)
        wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, sif=op.sif)
        wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting,
                                sif=op.sif, n_jobs=n_jobs)
        if op.allprob:
            wce = FeatureSet2Posteriors(wce, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path)
            wce = FeatureSet2Posteriors(wce, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path,
                                        n_jobs=n_jobs)
        doc_embedder.append(wce)

    if op.pretrained:

@@ -86,9 +89,10 @@ if __name__ == '__main__':
        VG_name = 'M'
        storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
        exist = exists(storing_path)
        muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif)
        muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif, n_jobs=n_jobs)
        if op.allprob:
            muse = FeatureSet2Posteriors(muse, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path)
            muse = FeatureSet2Posteriors(muse, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path,
                                         n_jobs=n_jobs)
        doc_embedder.append(muse)

    if op.gruViewGenerator:

@@ -100,12 +104,12 @@ if __name__ == '__main__':
        VG_name = 'G'
        VG_name += '_muse' if op.gruMUSE else ''
        VG_name += '_wce' if op.gruWCE else ''
        storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
        storing_path = 'Nope'  # f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
        rnn_embedder = RecurrentEmbedder(pretrained=op.gruMUSE, supervised=op.gruWCE, multilingual_dataset=data,
                                         options=op, model_path=op.gru_path)
                                         options=op, model_path=None, n_jobs=n_jobs)
        if op.allprob:
            rnn_embedder = FeatureSet2Posteriors(rnn_embedder, method_id=VG_name, requires_tfidf=False,
                                                 storing_path=storing_path)
                                                 storing_path=storing_path, n_jobs=n_jobs)
        doc_embedder.append(rnn_embedder)

    if op.mbert:

@@ -114,8 +118,9 @@ if __name__ == '__main__':
        """
        VG_name = 'B'
        storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
        avoid_loading = False if op.avoid_loading else True  # TODO research setting (set to false mBert will be loaded into gpu to get doc emebds (aka, only the first time for each run))

        mbert = MBertEmbedder(path_to_model=op.bert_path, nC=data.num_categories())
        mbert = MBertEmbedder(path_to_model=op.bert_path, nC=data.num_categories(), avoid_loading=avoid_loading)
        if op.allprob:
            mbert = FeatureSet2Posteriors(mbert, method_id=VG_name, l2=l2, storing_path=storing_path)
        doc_embedder.append(mbert)

@@ -123,7 +128,7 @@ if __name__ == '__main__':
    # metaclassifier
    meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
    meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf', C=op.set_c),
                          meta_parameters=get_params(op.optimc), standardize_range=standardize_range)
                          meta_parameters=get_params(op.optimc), standardize_range=standardize_range, n_jobs=n_jobs)

    # ensembling the modules
    classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)

@@ -20,9 +20,6 @@ def predict(logits, classification_type='multilabel'):


class TrainingDataset(Dataset):
    """
    data: dict of lang specific tokenized data
    """

    def __init__(self, data, labels):
        self.langs = data.keys()

@@ -231,7 +228,7 @@ def feature_extractor(data, lang_ids, model):
    Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for
    the output of each layer) of shape (batch_size, sequence_length, hidden_size)
    """
    show_gpu('Before Training')
    # show_gpu('Before Training')
    all_batch_embeddings = {}
    id2lang = {v: k for k, v in lang_ids.items()}
    with torch.no_grad():

@@ -246,5 +243,5 @@ def feature_extractor(data, lang_ids, model):
            else:
                all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]],
                                                                  batch_embeddings[i].detach().cpu().numpy()))
    show_gpu('After Full Prediction')
    # show_gpu('After Full Prediction')
    return all_batch_embeddings, id2lang

@@ -74,7 +74,7 @@ class Index:
        self.test_raw = test_raw

    def index(self, pretrained_vocabulary, analyzer, vocabulary):
        self.word2index = dict(vocabulary)
        self.word2index = dict(vocabulary)  # word2idx
        known_words = set(self.word2index.keys())
        if pretrained_vocabulary is not None:
            known_words.update(pretrained_vocabulary)

@@ -207,44 +207,6 @@ class MultilingualIndex:
            index.compose_embedding_matrix(lpretrained[l], supervised, lXtr[l], lYtr[l])
            self.sup_range = index.wce_range

    # TODO circular import with transformers --> when generating posterior prob, we import PosteriorProbabilitiesEmbedder which is defined in transformers
    # def posterior_probabilities(self, max_training_docs_by_lang=5000, store_posteriors=False, stored_post=False):
    #     # choose a maximum of "max_training_docs_by_lang" for training the calibrated SVMs
    #     timeit = time.time()
    #     lXtr = {l:Xtr for l,Xtr in self.get_lXtr().items()}
    #     lYtr = {l:Ytr for l,Ytr in self.l_train_target().items()}
    #     if not stored_post:
    #         for l in self.langs:
    #             n_elements = lXtr[l].shape[0]
    #             if n_elements > max_training_docs_by_lang:
    #                 choice = np.random.permutation(n_elements)[:max_training_docs_by_lang]
    #                 lXtr[l] = lXtr[l][choice]
    #                 lYtr[l] = lYtr[l][choice]
    #
    #         # train the posterior probabilities embedder
    #         print('[posteriors] training a calibrated SVM')
    #         learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto')
    #         prob_embedder = PosteriorProbabilitiesEmbedder(learner, l2=False)
    #         prob_embedder.fit(lXtr, lYtr)
    #
    #         # transforms the training, validation, and test sets into posterior probabilities
    #         print('[posteriors] generating posterior probabilities')
    #         lPtr = prob_embedder.transform(self.get_lXtr())
    #         lPva = prob_embedder.transform(self.get_lXva())
    #         lPte = prob_embedder.transform(self.get_lXte())
    #         # NB: Check splits indices !
    #         if store_posteriors:
    #             import pickle
    #             with open('../dumps/posteriors_fulljrc.pkl', 'wb') as outfile:
    #                 pickle.dump([lPtr, lPva, lPte], outfile)
    #                 print(f'Successfully dumped posteriors!')
    #     else:
    #         import pickle
    #         with open('../dumps/posteriors_fulljrc.pkl', 'rb') as infile:
    #             lPtr, lPva, lPte = pickle.load(infile)
    #             print(f'Successfully loaded stored posteriors!')
    #     print(f'[posteriors] done in {time.time() - timeit}')
    #     return lPtr, lPva, lPte

    def bert_embeddings(self, bert_path, max_len=512, batch_size=64, stored_embeddings=False):
        show_gpu('GPU memory before initializing mBert model:')

@@ -518,10 +480,12 @@ class TfidfVectorizerMultilingual:
    def fit(self, lX, ly=None):
        self.langs = sorted(lX.keys())
        self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs}
        # self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in lX.keys()}
        return self

    def transform(self, lX):
        return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs}
        # return {l: self.vectorizer[l].transform(lX[l]) for l in lX.keys()}

    def fit_transform(self, lX, ly=None):
        return self.fit(lX, ly).transform(lX)

@@ -568,3 +532,11 @@ def get_method_name(dataset, posteriors, supervised, pretrained, mbert, gru,
    dataset_id = _dataset_path[0] + _dataset_path[-1]
    return _id, dataset_id


def get_zscl_setting(langs):
    settings = []
    for elem in langs:
        for tar in langs:
            if elem != tar:
                settings.append((elem, tar))
    return settings

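get_zscl_setting enumerates every ordered (source, target) pair for the zero-shot cross-lingual runs; it is equivalent to itertools.permutations(langs, 2). For example:

from itertools import permutations

langs = ['en', 'it', 'nl']
settings = [(src, tgt) for src in langs for tgt in langs if src != tgt]  # same output as get_zscl_setting
assert settings == list(permutations(langs, 2))
# [('en', 'it'), ('en', 'nl'), ('it', 'en'), ('it', 'nl'), ('nl', 'en'), ('nl', 'it')]
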
@@ -1,5 +1,4 @@
import numpy as np
import numpy as np
from scipy.sparse import lil_matrix, issparse
from sklearn.metrics import f1_score, accuracy_score

@@ -60,6 +60,9 @@ parser.add_option("-a", "--agg", dest="agg", action='store_true',
                  help="Set aggregation function of the common Z-space to average (Default: concatenation)",
                  default=True)

parser.add_option("-l", dest="avoid_loading", action="store_true",
                  help="TODO", default=False)

# ------------------------------------------------------------------------------------

parser.add_option('--hidden', type=int, default=512, metavar='int',

@@ -47,3 +47,46 @@ class PolylingualClassificationResults:

    def tell(self, msg):
        if self.verbose: print(msg)


class ZSCLResults:
    def __init__(self, file, autoflush=True, verbose=False):
        self.file = file
        self.columns = ['method',
                        'optimp',
                        'source',
                        'target',
                        'id',
                        'dataset',
                        'time',
                        'lang',
                        'macrof1',
                        'microf1',
                        'macrok',
                        'microk',
                        'notes']
        self.autoflush = autoflush
        self.verbose = verbose
        if os.path.exists(file):
            self.tell('Loading existing file from {}'.format(file))
            self.df = pd.read_csv(file, sep='\t')
        else:
            self.tell('File {} does not exist. Creating new frame.'.format(file))
            dir = os.path.dirname(self.file)
            if dir and not os.path.exists(dir): os.makedirs(dir)
            self.df = pd.DataFrame(columns=self.columns)

    def already_calculated(self, id):
        return (self.df['id'] == id).any()

    def add_row(self, method, optimp, id, source, target, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
        s = pd.Series([method, optimp, id, source, target, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
        self.df = self.df.append(s, ignore_index=True)
        if self.autoflush: self.flush()
        self.tell(s.to_string())

    def flush(self):
        self.df.to_csv(self.file, index=False, sep='\t')

    def tell(self, msg):
        if self.verbose: print(msg)

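A hedged usage sketch for the new ZSCLResults logger (the file path and run values are made up): each add_row call appends one result line and, with autoflush=True, rewrites the TSV on every call.

results = ZSCLResults('../log/zscl_results.tsv', autoflush=True, verbose=True)
run_id = 'gFunP-en2it'  # hypothetical identifier for one source->target run
if not results.already_calculated(run_id):
    results.add_row(method='gFunP', optimp=False, id=run_id, source='en', target='it',
                    dataset='jrc_nltest', time=123.4, lang='it',
                    macrof1=0.55, microf1=0.61, notes='zero-shot run')
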
@@ -1,5 +1,6 @@
import numpy as np


class StandardizeTransformer:

    def __init__(self, axis=0, range=None):
