refactoring

andrea 2021-01-19 10:31:57 +01:00
parent 515acae15b
commit a5322ba227
9 changed files with 132 additions and 99 deletions

View File

@@ -148,7 +148,7 @@ class MonolingualClassifier:
         if isinstance(self.model, GridSearchCV):
             self.best_params_ = self.model.best_params_
             print('best parameters: ', self.best_params_)
-        self.time=time.time()-tinit
+        self.time = time.time()-tinit
         return self

     def decision_function(self, X):

View File

@@ -84,9 +84,9 @@ class PosteriorProbabilitiesEmbedder:
         self.is_training = is_training

     def fit(self, lX, lY, lV=None, called_by_viewgen=False):
-        if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'):
-            print(f'NB: Avoid fitting {self.storing_path.split("/")[2]} since we have already pre-computed results')
-            return self
+        # if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'):
+        #     print(f'NB: Avoid fitting {self.storing_path.split("/")[2]} since we have already pre-computed results')
+        #     return self
         if not called_by_viewgen:
             # Avoid printing if method is called by another View Gen (e.g., GRU ViewGen)
             print('### Posterior Probabilities View Generator (X)')
@@ -96,20 +96,20 @@ class PosteriorProbabilitiesEmbedder:

     def transform(self, lX):
         # if dir exist, load and return already computed results
-        _endpoint = 'tr' if self.is_training else 'te'
-        _actual_path = self.storing_path + '/' + _endpoint
-        if exists(_actual_path):
-            print('NB: loading pre-computed results!')
-            with open(_actual_path + '/X.pickle', 'rb') as infile:
-                self.is_training = False
-                return pickle.load(infile)
+        # _endpoint = 'tr' if self.is_training else 'te'
+        # _actual_path = self.storing_path + '/' + _endpoint
+        # if exists(_actual_path):
+        #     print('NB: loading pre-computed results!')
+        #     with open(_actual_path + '/X.pickle', 'rb') as infile:
+        #         self.is_training = False
+        #         return pickle.load(infile)
         lZ = self.predict_proba(lX)
         lZ = _normalize(lZ, self.l2)
         # create dir and dump computed results
-        create_if_not_exist(_actual_path)
-        with open(_actual_path + '/X.pickle', 'wb') as outfile:
-            pickle.dump(lZ, outfile)
+        # create_if_not_exist(_actual_path)
+        # with open(_actual_path + '/X.pickle', 'wb') as outfile:
+        #     pickle.dump(lZ, outfile)
         self.is_training = False
         return lZ
@@ -154,8 +154,7 @@ class MuseEmbedder:
         MUSE = self.MUSE
         lX = self.featureweight.transform(lX)
         XdotMUSE = Parallel(n_jobs=self.n_jobs)(
-            delayed(XdotM)(lX[lang], MUSE[lang], self.sif) for lang in self.langs
-        )
+            delayed(XdotM)(lX[lang], MUSE[lang], self.sif) for lang in self.langs)
         lMuse = {l: XdotMUSE[i] for i, l in enumerate(self.langs)}
         lMuse = _normalize(lMuse, self.l2)
         return lMuse
@@ -211,18 +210,22 @@ class WordClassEmbedder:

 class MBertEmbedder:

     def __init__(self, doc_embed_path=None, patience=10, checkpoint_dir='../hug_checkpoint/', path_to_model=None,
-                 nC=None):
+                 nC=None, avoid_loading=False):
         self.doc_embed_path = doc_embed_path
         self.patience = patience
         self.checkpoint_dir = checkpoint_dir
         self.fitted = False
         self.requires_tfidf = False
-        if path_to_model is None and nC is not None:
+        self.avoid_loading = avoid_loading
+        if path_to_model is None:
             self.model = None
         else:
             config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True,
                                                 num_labels=nC)
-            self.model = BertForSequenceClassification.from_pretrained(path_to_model, config=config).cuda()
+            if self.avoid_loading:
+                self.model = None
+            else:
+                self.model = BertForSequenceClassification.from_pretrained(path_to_model, config=config).cuda()  # TODO: setting model to None in order to avoid loading it onto gpu if we have already pre-computed results!
             self.fitted = True

     def fit(self, lX, ly, lV=None, seed=0, nepochs=200, lr=1e-5, val_epochs=1):
@@ -235,7 +238,7 @@ class MBertEmbedder:
         l_tokenized_tr = do_tokenization(lX, max_len=512)
         l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = get_tr_val_split(l_tokenized_tr, ly,
                                                                                          val_prop=0.2, max_val=2000,
-                                                                                         seed=seed)  # TODO: seed
+                                                                                         seed=seed)

         tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target)
         va_dataset = TrainingDataset(l_split_va, l_split_val_target)
@@ -289,7 +292,7 @@ class MBertEmbedder:
         l_tokenized_X = do_tokenization(lX, max_len=512, verbose=True)
         feat_dataset = ExtractorDataset(l_tokenized_X)
         feat_lang_ids = feat_dataset.lang_ids
-        dataloader = DataLoader(feat_dataset, batch_size=64)  # TODO reduced batch size in JRC experiments
+        dataloader = DataLoader(feat_dataset, batch_size=64)
         all_batch_embeddings, id2lang = feature_extractor(dataloader, feat_lang_ids, self.model)
         return all_batch_embeddings
@@ -301,7 +304,7 @@ class RecurrentEmbedder:
     def __init__(self, pretrained, supervised, multilingual_dataset, options, concat=False, lr=1e-3,
                  we_path='../embeddings', hidden_size=512, sup_drop=0.5, posteriors=False, patience=10,
-                 test_each=0, checkpoint_dir='../checkpoint', model_path=None):
+                 test_each=0, checkpoint_dir='../checkpoint', model_path=None, n_jobs=-1):
         self.pretrained = pretrained
         self.supervised = supervised
         self.concat = concat
@@ -319,6 +322,7 @@ class RecurrentEmbedder:
         self.options = options
         self.seed = options.seed
         self.model_path = model_path
+        self.n_jobs = n_jobs
         self.is_trained = False

         ## INIT MODEL for training
@@ -398,32 +402,33 @@ class RecurrentEmbedder:
     def transform(self, lX, batch_size=64):
         lX = self.multilingual_index.get_indexed(lX, self.lpretrained_vocabulary)
-        lX = self._get_doc_embeddings(lX)
+        lX = self._get_doc_embeddings(lX, batch_size)
         return lX

     def fit_transform(self, lX, ly, lV=None):
         return self.fit(lX, ly).transform(lX)

-    def _get_doc_embeddings(self, lX, batch_size=64):
+    def _get_doc_embeddings(self, lX, batch_size):
         assert self.is_trained, 'Model is not trained, cannot call transform before fitting the model!'
         print('Generating document embeddings via GRU')
         _lX = {}
         l_devel_target = self.multilingual_index.l_devel_target()
+        # show_gpu('RNN init at extraction')
         for idx, (batch, post, target, lang) in enumerate(batchify(lX, None, l_devel_target,
                                                                    batch_size, self.multilingual_index.l_pad())):
             if lang not in _lX.keys():
                 _lX[lang] = self.model.get_embeddings(batch, lang)
             else:
                 _lX[lang] = np.concatenate((_lX[lang], self.model.get_embeddings(batch, lang)), axis=0)
+            # show_gpu('RNN after batch pred at extraction')
         return _lX

     # loads the MUSE embeddings if requested, or returns empty dictionaries otherwise
     def _load_pretrained_embeddings(self, we_path, langs):
         lpretrained = lpretrained_vocabulary = self._none_dict(langs)
-        lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1)
+        lpretrained = load_muse_embeddings(we_path, langs, n_jobs=self.n_jobs)
         lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs}
         return lpretrained, lpretrained_vocabulary
@@ -553,20 +558,20 @@ class FeatureSet2Posteriors:

     def transform(self, lX):
         # if dir exist, load and return already computed results
-        _endpoint = 'tr' if self.is_training else 'te'
-        _actual_path = self.storing_path + '/' + _endpoint
-        if exists(_actual_path):
-            print('NB: loading pre-computed results!')
-            with open(_actual_path + '/' + self.method_id + '.pickle', 'rb') as infile:
-                self.is_training = False
-                return pickle.load(infile)
+        # _endpoint = 'tr' if self.is_training else 'te'
+        # _actual_path = self.storing_path + '/' + _endpoint
+        # if exists(_actual_path):
+        #     print('NB: loading pre-computed results!')
+        #     with open(_actual_path + '/' + self.method_id + '.pickle', 'rb') as infile:
+        #         self.is_training = False
+        #         return pickle.load(infile)
         lP = self.predict_proba(lX)
         lP = _normalize(lP, self.l2)
         # create dir and dump computed results
-        create_if_not_exist(_actual_path)
-        with open(_actual_path + '/' + self.method_id + '.pickle', 'wb') as outfile:
-            pickle.dump(lP, outfile)
+        # create_if_not_exist(_actual_path)
+        # with open(_actual_path + '/' + self.method_id + '.pickle', 'wb') as outfile:
+        #     pickle.dump(lP, outfile)
         self.is_training = False
         return lP
@@ -637,7 +642,13 @@ class Funnelling:
         self.meta = meta
         self.n_jobs = meta.n_jobs

-    def fit(self, lX, ly):
-        tfidf_lX = self.vectorizer.fit_transform(lX, ly)
+    def fit(self, lX, ly, target_lang=None):
+        if target_lang is not None:
+            LX = lX.copy()
+            LX.update(target_lang)
+            self.vectorizer.fit(LX)
+            tfidf_lX = self.vectorizer.transform(lX)
+        else:
+            tfidf_lX = self.vectorizer.fit_transform(lX, ly)
         lV = self.vectorizer.vocabulary()
         print('## Fitting first-tier learners!')
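The new target_lang branch seems aimed at zero-shot cross-lingual runs: the TF-IDF vectorizer is fitted on the union of source and target documents, while only the source side is transformed for training. A hypothetical call sketch follows, assuming target_lang is a {lang: raw_docs} dict shaped like lX (inferred from LX.update(target_lang)); note it only runs if TfidfVectorizerMultilingual.transform iterates lX.keys() (the commented-out variant appearing later in this commit) rather than self.langs, since self.langs would then also contain the target language.

    # hypothetical zero-shot setup: train on English only, keep Italian aside as target
    source_X, source_y = {'en': lXtr['en']}, {'en': lytr['en']}
    target_docs = {'it': lXte['it']}

    classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)
    classifier.fit(source_X, source_y, target_lang=target_docs)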
@@ -774,6 +785,7 @@ def train_gru(model, batcher, ltrain_index, lytr, tinit, logfile, criterion, opt
     _dataset_path = opt.dataset.split('/')[-1].split('_')
     dataset_id = _dataset_path[0] + _dataset_path[-1]

+    # show_gpu('RNN init pre-training')
     loss_history = []
     model.train()
     for idx, (batch, post, bert_emb, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, ltrain_bert, lytr)):
@@ -783,6 +795,7 @@ def train_gru(model, batcher, ltrain_index, lytr, tinit, logfile, criterion, opt
         clip_gradient(model)
         optim.step()
         loss_history.append(loss.item())
+        # show_gpu('RNN after batch prediction')

         if idx % log_interval == 0:
             interval_loss = np.mean(loss_history[-log_interval:])

View File

@@ -23,20 +23,21 @@ if __name__ == '__main__':
     results = PolylingualClassificationResults('../log/' + op.output)
     allprob = 'Prob' if op.allprob else ''

+    # renaming arguments to be printed on log
     method_name, dataset_name = get_method_name(dataset, op.posteriors, op.supervised, op.pretrained, op.mbert,
                                                 op.gruViewGenerator, op.gruMUSE, op.gruWCE, op.agg, op.allprob)
     print(f'Method: gFun{method_name}\nDataset: {dataset_name}')
     print('-'*50)

-    # set zscore range - is slice(0, 0) mean will be equal to 0 and std to 1, thus normalization will have no effect
+    n_jobs = -1  # TODO SETTING n_JOBS
     standardize_range = slice(0, 0)
     if op.zscore:
         standardize_range = None

     # load dataset
     data = MultilingualDataset.load(dataset)
-    data.set_view(languages=['nl', 'it'])  # TODO: DEBUG SETTING
+    # data.set_view(languages=['it'])  # TODO: DEBUG SETTING
     data.show_dimensions()

     lXtr, lytr = data.training()
     lXte, lyte = data.test()
@@ -63,7 +64,7 @@ if __name__ == '__main__':
         doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True,
                                                                                           kernel='linear',
                                                                                           C=op.set_c),
-                                                           l2=l2, storing_path=storing_path))
+                                                           l2=l2, storing_path=storing_path, n_jobs=n_jobs))

     if op.supervised:
         """
@@ -73,9 +74,11 @@ if __name__ == '__main__':
         VG_name = 'W'
         storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
         exist = exists(storing_path)
-        wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, sif=op.sif)
+        wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting,
+                                sif=op.sif, n_jobs=n_jobs)
         if op.allprob:
-            wce = FeatureSet2Posteriors(wce, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path)
+            wce = FeatureSet2Posteriors(wce, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path,
+                                        n_jobs=n_jobs)
         doc_embedder.append(wce)

     if op.pretrained:
@@ -86,9 +89,10 @@ if __name__ == '__main__':
         VG_name = 'M'
         storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
         exist = exists(storing_path)
-        muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif)
+        muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif, n_jobs=n_jobs)
         if op.allprob:
-            muse = FeatureSet2Posteriors(muse, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path)
+            muse = FeatureSet2Posteriors(muse, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path,
+                                         n_jobs=n_jobs)
         doc_embedder.append(muse)

     if op.gruViewGenerator:
@@ -100,12 +104,12 @@ if __name__ == '__main__':
         VG_name = 'G'
         VG_name += '_muse' if op.gruMUSE else ''
         VG_name += '_wce' if op.gruWCE else ''
-        storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
+        storing_path = 'Nope'  # f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
         rnn_embedder = RecurrentEmbedder(pretrained=op.gruMUSE, supervised=op.gruWCE, multilingual_dataset=data,
-                                         options=op, model_path=op.gru_path)
+                                         options=op, model_path=None, n_jobs=n_jobs)
         if op.allprob:
             rnn_embedder = FeatureSet2Posteriors(rnn_embedder, method_id=VG_name, requires_tfidf=False,
-                                                 storing_path=storing_path)
+                                                 storing_path=storing_path, n_jobs=n_jobs)
         doc_embedder.append(rnn_embedder)

     if op.mbert:
@@ -114,8 +118,9 @@ if __name__ == '__main__':
         """
         VG_name = 'B'
         storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
-        mbert = MBertEmbedder(path_to_model=op.bert_path, nC=data.num_categories())
+        avoid_loading = False if op.avoid_loading else True  # TODO research setting (set to false mBert will be loaded into gpu to get doc emebds (aka, only the first time for each run))
+        mbert = MBertEmbedder(path_to_model=op.bert_path, nC=data.num_categories(), avoid_loading=avoid_loading)
         if op.allprob:
             mbert = FeatureSet2Posteriors(mbert, method_id=VG_name, l2=l2, storing_path=storing_path)
         doc_embedder.append(mbert)

@@ -123,7 +128,7 @@ if __name__ == '__main__':
     # metaclassifier
     meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
     meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf', C=op.set_c),
-                          meta_parameters=get_params(op.optimc), standardize_range=standardize_range)
+                          meta_parameters=get_params(op.optimc), standardize_range=standardize_range, n_jobs=n_jobs)

     # ensembling the modules
     classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)

View File

@@ -20,9 +20,6 @@ def predict(logits, classification_type='multilabel'):

 class TrainingDataset(Dataset):
-    """
-    data: dict of lang specific tokenized data
-    """

     def __init__(self, data, labels):
         self.langs = data.keys()
@@ -231,7 +228,7 @@ def feature_extractor(data, lang_ids, model):
     Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for
                    the output of each layer) of shape (batch_size, sequence_length, hidden_size)
     """
-    show_gpu('Before Training')
+    # show_gpu('Before Training')
     all_batch_embeddings = {}
     id2lang = {v: k for k, v in lang_ids.items()}
     with torch.no_grad():
@@ -246,5 +243,5 @@ def feature_extractor(data, lang_ids, model):
             else:
                 all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]],
                                                                   batch_embeddings[i].detach().cpu().numpy()))
-    show_gpu('After Full Prediction')
+    # show_gpu('After Full Prediction')
     return all_batch_embeddings, id2lang

View File

@@ -74,7 +74,7 @@ class Index:
         self.test_raw = test_raw

     def index(self, pretrained_vocabulary, analyzer, vocabulary):
-        self.word2index = dict(vocabulary)
+        self.word2index = dict(vocabulary)  # word2idx
         known_words = set(self.word2index.keys())
         if pretrained_vocabulary is not None:
             known_words.update(pretrained_vocabulary)
@@ -207,44 +207,6 @@ class MultilingualIndex:
             index.compose_embedding_matrix(lpretrained[l], supervised, lXtr[l], lYtr[l])
             self.sup_range = index.wce_range

-    # TODO circular import with transformers --> when generating posterior prob, we import PosteriorProbabilitiesEmbedder which is defined in transformers
-    # def posterior_probabilities(self, max_training_docs_by_lang=5000, store_posteriors=False, stored_post=False):
-    #     # choose a maximum of "max_training_docs_by_lang" for training the calibrated SVMs
-    #     timeit = time.time()
-    #     lXtr = {l:Xtr for l,Xtr in self.get_lXtr().items()}
-    #     lYtr = {l:Ytr for l,Ytr in self.l_train_target().items()}
-    #     if not stored_post:
-    #         for l in self.langs:
-    #             n_elements = lXtr[l].shape[0]
-    #             if n_elements > max_training_docs_by_lang:
-    #                 choice = np.random.permutation(n_elements)[:max_training_docs_by_lang]
-    #                 lXtr[l] = lXtr[l][choice]
-    #                 lYtr[l] = lYtr[l][choice]
-    #
-    #         # train the posterior probabilities embedder
-    #         print('[posteriors] training a calibrated SVM')
-    #         learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto')
-    #         prob_embedder = PosteriorProbabilitiesEmbedder(learner, l2=False)
-    #         prob_embedder.fit(lXtr, lYtr)
-    #
-    #         # transforms the training, validation, and test sets into posterior probabilities
-    #         print('[posteriors] generating posterior probabilities')
-    #         lPtr = prob_embedder.transform(self.get_lXtr())
-    #         lPva = prob_embedder.transform(self.get_lXva())
-    #         lPte = prob_embedder.transform(self.get_lXte())
-    #         # NB: Check splits indices !
-    #         if store_posteriors:
-    #             import pickle
-    #             with open('../dumps/posteriors_fulljrc.pkl', 'wb') as outfile:
-    #                 pickle.dump([lPtr, lPva, lPte], outfile)
-    #                 print(f'Successfully dumped posteriors!')
-    #     else:
-    #         import pickle
-    #         with open('../dumps/posteriors_fulljrc.pkl', 'rb') as infile:
-    #             lPtr, lPva, lPte = pickle.load(infile)
-    #             print(f'Successfully loaded stored posteriors!')
-    #     print(f'[posteriors] done in {time.time() - timeit}')
-    #     return lPtr, lPva, lPte
-
     def bert_embeddings(self, bert_path, max_len=512, batch_size=64, stored_embeddings=False):
         show_gpu('GPU memory before initializing mBert model:')
@@ -518,10 +480,12 @@ class TfidfVectorizerMultilingual:
     def fit(self, lX, ly=None):
         self.langs = sorted(lX.keys())
         self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs}
+        # self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in lX.keys()}
         return self

     def transform(self, lX):
         return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs}
+        # return {l: self.vectorizer[l].transform(lX[l]) for l in lX.keys()}

     def fit_transform(self, lX, ly=None):
         return self.fit(lX, ly).transform(lX)
@@ -568,3 +532,11 @@ def get_method_name(dataset, posteriors, supervised, pretrained, mbert, gru,
     dataset_id = _dataset_path[0] + _dataset_path[-1]
     return _id, dataset_id
+
+
+def get_zscl_setting(langs):
+    settings = []
+    for elem in langs:
+        for tar in langs:
+            if elem != tar:
+                settings.append((elem, tar))
+    return settings
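For reference, the new helper simply enumerates every ordered (source, target) pair over the given languages, which pairs naturally with the target_lang argument added to Funnelling.fit; a quick illustration (language codes chosen arbitrarily):

    # get_zscl_setting(['en', 'it', 'de']) returns:
    # [('en', 'it'), ('en', 'de'), ('it', 'en'), ('it', 'de'), ('de', 'en'), ('de', 'it')]
    settings = get_zscl_setting(['en', 'it', 'de'])
    for source, target in settings:
        print(f'zero-shot run: train on {source}, evaluate on {target}')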

View File

@@ -1,5 +1,4 @@
 import numpy as np
-import numpy as np
 from scipy.sparse import lil_matrix, issparse
 from sklearn.metrics import f1_score, accuracy_score

View File

@ -60,6 +60,9 @@ parser.add_option("-a", "--agg", dest="agg", action='store_true',
help="Set aggregation function of the common Z-space to average (Default: concatenation)", help="Set aggregation function of the common Z-space to average (Default: concatenation)",
default=True) default=True)
parser.add_option("-l", dest="avoid_loading", action="store_true",
help="TODO", default=False)
# ------------------------------------------------------------------------------------ # ------------------------------------------------------------------------------------
parser.add_option('--hidden', type=int, default=512, metavar='int', parser.add_option('--hidden', type=int, default=512, metavar='int',

View File

@@ -47,3 +47,46 @@ class PolylingualClassificationResults:
     def tell(self, msg):
         if self.verbose: print(msg)
+
+
+class ZSCLResults:
+
+    def __init__(self, file, autoflush=True, verbose=False):
+        self.file = file
+        self.columns = ['method',
+                        'optimp',
+                        'source',
+                        'target',
+                        'id',
+                        'dataset',
+                        'time',
+                        'lang',
+                        'macrof1',
+                        'microf1',
+                        'macrok',
+                        'microk',
+                        'notes']
+        self.autoflush = autoflush
+        self.verbose = verbose
+        if os.path.exists(file):
+            self.tell('Loading existing file from {}'.format(file))
+            self.df = pd.read_csv(file, sep='\t')
+        else:
+            self.tell('File {} does not exist. Creating new frame.'.format(file))
+            dir = os.path.dirname(self.file)
+            if dir and not os.path.exists(dir): os.makedirs(dir)
+            self.df = pd.DataFrame(columns=self.columns)
+
+    def already_calculated(self, id):
+        return (self.df['id'] == id).any()
+
+    def add_row(self, method, optimp, id, source, target, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
+        s = pd.Series([method, optimp, id, source, target, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
+        self.df = self.df.append(s, ignore_index=True)
+        if self.autoflush: self.flush()
+        self.tell(s.to_string())
+
+    def flush(self):
+        self.df.to_csv(self.file, index=False, sep='\t')
+
+    def tell(self, msg):
+        if self.verbose: print(msg)
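A minimal usage sketch for the new ZSCLResults logger (the file path, run id, and metric values below are illustrative). One caveat worth noting: add_row passes id before source and target, while self.columns lists source and target before id, so those three values land under shifted column headers unless the two orderings are kept consistent.

    # hypothetical example: log one zero-shot en->it run to a TSV file
    results = ZSCLResults('../log/zscl_results.csv', verbose=True)
    run_id = 'gFunX_en_it_seed0'
    if not results.already_calculated(run_id):
        results.add_row(method='gFun-X', optimp=False, id=run_id, source='en', target='it',
                        dataset='rcv1-2', time=123.4, lang='it',
                        macrof1=0.55, microf1=0.62, notes='zero-shot')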

View File

@@ -1,5 +1,6 @@
 import numpy as np

 class StandardizeTransformer:

     def __init__(self, axis=0, range=None):