First commit
parent 4a7a594a41
commit 90f24dab8e
@@ -121,7 +121,6 @@ class MultilingualDataset:
print("Lang {}, Xtr={}, ytr={}, Xte={}, yte={}".format(lang, shape(Xtr), self.cat_view(Ytr).shape, shape(Xte), self.cat_view(Yte).shape))
|
||||
|
||||
def show_category_prevalences(self):
|
||||
#pass
|
||||
nC = self.num_categories()
|
||||
accum_tr = np.zeros(nC, dtype=np.int)
|
||||
accum_te = np.zeros(nC, dtype=np.int)
@@ -47,7 +47,6 @@ class FastTextWikiNews(Vectors):
class FastTextMUSE(PretrainedEmbeddings):
|
||||
def __init__(self, path, lang, limit=None):
|
||||
super().__init__()
|
||||
print(f'Loading fastText pretrained vectors for language {lang} from {path}')
|
||||
assert os.path.exists(path), print(f'pre-trained vectors not found in {path}')
|
||||
self.embed = FastTextWikiNews(path, lang, max_vectors=limit)
@@ -0,0 +1,14 @@
#!/usr/bin/env bash
|
||||
|
||||
dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run#
|
||||
|
||||
runs='1 2 3 4 5 6 7 8 9'
|
||||
for run in $runs
|
||||
do
|
||||
dataset=$dataset_path$run.pickle
|
||||
modelpath=/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run$run
|
||||
python main_mbert_extractor.py --dataset $dataset --modelpath $modelpath
|
||||
done
|
||||
|
||||
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle
|
||||
python main_mbert_extractor.py --dataset $dataset --modelpath $modelpath
@@ -133,7 +133,8 @@ class MonolingualClassifier:
self.model = OneVsRestClassifier(self.learner, n_jobs=self.n_jobs)
|
||||
else:
|
||||
self.model = self.learner
|
||||
raise NotImplementedError('not working as a base-classifier for funneling if there are gaps in the labels across languages')
|
||||
raise NotImplementedError('not working as a base-classifier for funneling if there are gaps in '
|
||||
'the labels across languages')
|
||||
|
||||
# parameter optimization?
|
||||
if self.parameters:
@@ -141,7 +142,8 @@ class MonolingualClassifier:
self.model = GridSearchCV(self.model, param_grid=self.parameters, refit=True, cv=5, n_jobs=self.n_jobs,
|
||||
error_score=0, verbose=10)
|
||||
|
||||
print(f'fitting: {self.model} on matrices of shape X={X.shape} Y={y.shape}')
|
||||
# print(f'fitting: {self.model} on matrices of shape X={X.shape} Y={y.shape}')
|
||||
print(f'fitting: Mono-lingual Classifier on matrices of shape X={X.shape} Y={y.shape}')
|
||||
self.model.fit(X, y)
|
||||
if isinstance(self.model, GridSearchCV):
|
||||
self.best_params_ = self.model.best_params_
@@ -1,60 +1,34 @@
import numpy as np
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
#from data.text_preprocessor import NLTKStemTokenizer
|
||||
from data.tsr_function__ import get_tsr_matrix, get_supervised_matrix, pointwise_mutual_information, information_gain, \
|
||||
gain_ratio, gss
|
||||
from torch.optim.lr_scheduler import StepLR
|
||||
from torch.utils.data import DataLoader
|
||||
from data.tsr_function__ import get_tsr_matrix, get_supervised_matrix, pointwise_mutual_information, information_gain
|
||||
from embeddings.embeddings import FastTextMUSE
|
||||
from embeddings.supervised import supervised_embeddings_tfidf, zscores
|
||||
from learning.learners import NaivePolylingualClassifier, MonolingualClassifier, _joblib_transform_multiling
|
||||
import time
|
||||
from sklearn.decomposition import PCA
|
||||
from joblib import Parallel, delayed
|
||||
from scipy.sparse import issparse, vstack, hstack
|
||||
from scipy.sparse import hstack
|
||||
from util_transformers.StandardizeTransformer import StandardizeTransformer
|
||||
from util.SIF_embed import remove_pc
|
||||
from sklearn.preprocessing import normalize
|
||||
from sklearn.svm import SVC
|
||||
from scipy.sparse import csr_matrix
|
||||
from models.mBert import *
|
||||
from models.lstm_class import *
|
||||
from util.csv_log import CSVLog
|
||||
from util.file import get_file_name
|
||||
from util.early_stop import EarlyStopping
|
||||
from util.common import *
|
||||
import time
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Data Processing
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
class TfidfVectorizerMultilingual:
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
self.kwargs=kwargs
|
||||
|
||||
def fit(self, lX, ly=None):
|
||||
self.langs = sorted(lX.keys())
|
||||
self.vectorizer={l:TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs}
|
||||
# tokenizer=NLTKStemTokenizer(l, verbose=True),
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
return {l:self.vectorizer[l].transform(lX[l]) for l in self.langs}
|
||||
|
||||
def fit_transform(self, lX, ly=None):
|
||||
return self.fit(lX,ly).transform(lX)
|
||||
|
||||
def vocabulary(self, l=None):
|
||||
if l is None:
|
||||
return {l:self.vectorizer[l].vocabulary_ for l in self.langs}
|
||||
else:
|
||||
return self.vectorizer[l].vocabulary_
|
||||
|
||||
def get_analyzer(self, l=None):
|
||||
if l is None:
|
||||
return {l:self.vectorizer[l].build_analyzer() for l in self.langs}
|
||||
else:
|
||||
return self.vectorizer[l].build_analyzer()
|
||||
|
||||
|
||||
class FeatureWeight:
|
||||
|
||||
def __init__(self, weight='tfidf', agg='mean'):
|
||||
assert weight in ['tfidf', 'pmi', 'ig'] or callable(weight), 'weight should either be "tfidf" or a callable function'
|
||||
assert weight in ['tfidf', 'pmi', 'ig'] or callable(
|
||||
weight), 'weight should either be "tfidf" or a callable function'
|
||||
assert agg in ['mean', 'max'], 'aggregation function should either be "mean" or "max"'
|
||||
self.weight = weight
|
||||
self.agg = agg
@@ -91,8 +65,10 @@ class FeatureWeight:
return self.fit(lX, ly).transform(lX)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Document Embeddings
|
||||
# View Generators (aka first-tier learners)
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
class PosteriorProbabilitiesEmbedder:
|
||||
|
||||
def __init__(self, first_tier_learner, first_tier_parameters=None, l2=True, n_jobs=-1):
@@ -103,8 +79,12 @@ class PosteriorProbabilitiesEmbedder:
self.doc_projector = NaivePolylingualClassifier(
|
||||
self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs
|
||||
)
|
||||
self.requires_tfidf = True
|
||||
|
||||
def fit(self, lX, lY, lV=None):
|
||||
def fit(self, lX, lY, lV=None, called_by_viewgen=False):
|
||||
if not called_by_viewgen:
|
||||
# Avoid printing if method is called by another View Gen (e.g., GRU ViewGen)
|
||||
print('### Posterior Probabilities View Generator (X)')
|
||||
print('fitting the projectors... {}'.format(lX.keys()))
|
||||
self.doc_projector.fit(lX, lY)
|
||||
return self
@@ -124,7 +104,7 @@ class PosteriorProbabilitiesEmbedder:
return self.doc_projector.predict(lX)
|
||||
|
||||
def predict_proba(self, lX, ly=None):
|
||||
print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} the documents')
|
||||
print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} documents')
|
||||
return self.doc_projector.predict_proba(lX)
|
||||
|
||||
def _get_output_dim(self):
@@ -140,9 +120,12 @@ class MuseEmbedder:
self.n_jobs = n_jobs
|
||||
self.featureweight = featureweight
|
||||
self.sif = sif
|
||||
self.requires_tfidf = True
|
||||
|
||||
def fit(self, lX, ly, lV=None):
|
||||
assert lV is not None or self.lV is not None, 'lV not specified'
|
||||
print('### MUSE View Generator (M)')
|
||||
print(f'Loading fastText pretrained vectors for languages {list(lX.keys())}...')
|
||||
self.langs = sorted(lX.keys())
|
||||
self.MUSE = load_muse_embeddings(self.path, self.langs, self.n_jobs)
|
||||
lWordList = {l: self._get_wordlist_from_word2index(lV[l]) for l in self.langs}
@@ -178,8 +161,11 @@ class WordClassEmbedder:
self.max_label_space = max_label_space
|
||||
self.featureweight = featureweight
|
||||
self.sif = sif
|
||||
self.requires_tfidf = True
|
||||
|
||||
def fit(self, lX, ly, lV=None):
|
||||
print('### WCE View Generator (M)')
|
||||
print('Computing supervised embeddings...')
|
||||
self.langs = sorted(lX.keys())
|
||||
WCE = Parallel(n_jobs=self.n_jobs)(
|
||||
delayed(word_class_embedding_matrix)(lX[lang], ly[lang], self.max_label_space) for lang in self.langs
@@ -202,31 +188,284 @@ class WordClassEmbedder:
return self.fit(lX, ly).transform(lX)
|
||||
|
||||
def _get_output_dim(self):
|
||||
return 73
|
||||
return 73 # TODO !
|
||||
|
||||
|
||||
class MBertEmbedder:
|
||||
|
||||
def __init__(self, doc_embed_path=None, patience=10, checkpoint_dir='../hug_checkpoint/', path_to_model=None,
|
||||
nC=None):
|
||||
self.doc_embed_path = doc_embed_path
|
||||
self.patience = patience
|
||||
self.checkpoint_dir = checkpoint_dir
|
||||
self.fitted = False
|
||||
self.requires_tfidf = False
|
||||
if path_to_model is None and nC is not None:
|
||||
self.model = None
|
||||
else:
|
||||
config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True,
|
||||
num_labels=nC)
|
||||
self.model = BertForSequenceClassification.from_pretrained(path_to_model, config=config).cuda()
|
||||
self.fitted = True
|
||||
|
||||
def fit(self, lX, ly, lV=None, seed=0, nepochs=200, lr=1e-5, val_epochs=1):
|
||||
print('### mBERT View Generator (B)')
|
||||
if self.fitted is True:
|
||||
print('Bert model already fitted!')
|
||||
return self
|
||||
|
||||
print('Fine-tune mBert on the given dataset.')
|
||||
l_tokenized_tr = do_tokenization(lX, max_len=512)
|
||||
l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = get_tr_val_split(l_tokenized_tr, ly,
|
||||
val_prop=0.2, max_val=2000,
|
||||
seed=seed) # TODO: seed
|
||||
|
||||
tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target)
|
||||
va_dataset = TrainingDataset(l_split_va, l_split_val_target)
|
||||
tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True)
|
||||
va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=True)
|
||||
|
||||
nC = tr_dataset.get_nclasses()
|
||||
model = get_model(nC)
|
||||
model = model.cuda()
|
||||
criterion = torch.nn.BCEWithLogitsLoss().cuda()
|
||||
optim = init_optimizer(model, lr=lr, weight_decay=0.01)
|
||||
lr_scheduler = StepLR(optim, step_size=25, gamma=0.1)
|
||||
early_stop = EarlyStopping(model, optimizer=optim, patience=self.patience,
|
||||
checkpoint=self.checkpoint_dir,
|
||||
is_bert=True)
|
||||
|
||||
# Training loop
|
||||
logfile = '../log/log_mBert_extractor.csv'
|
||||
method_name = 'mBert_feature_extractor'
|
||||
|
||||
tinit = time()
|
||||
lang_ids = va_dataset.lang_ids
|
||||
for epoch in range(1, nepochs + 1):
|
||||
print('# Start Training ...')
|
||||
train(model, tr_dataloader, epoch, criterion, optim, method_name, tinit, logfile)
|
||||
lr_scheduler.step() # reduces the learning rate # TODO arg epoch?
|
||||
|
||||
# Validation
|
||||
macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va')
|
||||
early_stop(macrof1, epoch)
|
||||
|
||||
if early_stop.STOP:
|
||||
print('[early-stop] STOP')
|
||||
break
|
||||
|
||||
model = early_stop.restore_checkpoint()
|
||||
self.model = model.cuda()
|
||||
|
||||
if val_epochs > 0:
|
||||
print(f'running last {val_epochs} training epochs on the validation set')
|
||||
for val_epoch in range(1, val_epochs + 1):
|
||||
train(self.model, va_dataloader, epoch + val_epoch, criterion, optim, method_name, tinit, logfile)
|
||||
|
||||
self.fitted = True
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
assert self.fitted is True, 'Calling transform without any initialized model! - call init first or on init' \
|
||||
'pass the "path_to_model" arg.'
|
||||
print('Obtaining document embeddings from pretrained mBert ')
|
||||
l_tokenized_X = do_tokenization(lX, max_len=512, verbose=True)
|
||||
feat_dataset = ExtractorDataset(l_tokenized_X)
|
||||
feat_lang_ids = feat_dataset.lang_ids
|
||||
dataloader = DataLoader(feat_dataset, batch_size=64)
|
||||
all_batch_embeddings, id2lang = feature_extractor(dataloader, feat_lang_ids, self.model)
|
||||
return all_batch_embeddings
|
||||
|
||||
def fit_transform(self, lX, ly, lV=None):
|
||||
return self.fit(lX, ly).transform(lX)
|
||||
|
||||
|
||||
class RecurrentEmbedder:
|
||||
|
||||
def __init__(self, pretrained, supervised, multilingual_dataset, options, concat=False, lr=1e-3,
|
||||
we_path='../embeddings', hidden_size=512, sup_drop=0.5, posteriors=False, patience=10,
|
||||
test_each=0, checkpoint_dir='../checkpoint', model_path=None):
|
||||
self.pretrained = pretrained
|
||||
self.supervised = supervised
|
||||
self.concat = concat
|
||||
self.requires_tfidf = False
|
||||
self.multilingual_dataset = multilingual_dataset
|
||||
self.model = None
|
||||
self.we_path = we_path
|
||||
self.langs = multilingual_dataset.langs()
|
||||
self.hidden_size = hidden_size
|
||||
self.sup_drop = sup_drop
|
||||
self.posteriors = posteriors
|
||||
self.patience = patience
|
||||
self.checkpoint_dir = checkpoint_dir
|
||||
self.test_each = test_each
|
||||
self.options = options
|
||||
self.seed = options.seed
|
||||
self.is_trained = False
|
||||
|
||||
## INIT MODEL for training
|
||||
self.lXtr, self.lytr = self.multilingual_dataset.training(target_as_csr=True)
|
||||
self.lXte, self.lyte = self.multilingual_dataset.test(target_as_csr=True)
|
||||
self.nC = self.lyte[self.langs[0]].shape[1]
|
||||
lpretrained, lpretrained_vocabulary = self._load_pretrained_embeddings(self.we_path, self.langs)
|
||||
self.multilingual_index = MultilingualIndex()
|
||||
self.multilingual_index.index(self.lXtr, self.lytr, self.lXte, lpretrained_vocabulary)
|
||||
self.multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=self.seed)
|
||||
self.multilingual_index.embedding_matrices(lpretrained, self.supervised)
|
||||
|
||||
if model_path is not None:
|
||||
self.is_trained = True
|
||||
self.model = torch.load(model_path)
|
||||
else:
|
||||
self.model = self._init_Net()
|
||||
|
||||
self.optim = init_optimizer(self.model, lr=lr)
|
||||
self.criterion = torch.nn.BCEWithLogitsLoss().cuda()
|
||||
self.lr_scheduler = StepLR(self.optim, step_size=25, gamma=0.5)
|
||||
self.early_stop = EarlyStopping(self.model, optimizer=self.optim, patience=self.patience,
|
||||
checkpoint=f'{self.checkpoint_dir}/gru_viewgen_-{get_file_name(self.options.dataset)}')
|
||||
# Init SVM in order to recast (vstacked) document embeddings to vectors of Posterior Probabilities
|
||||
self.posteriorEmbedder = MetaClassifier(
|
||||
SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=options.n_jobs)
|
||||
|
||||
def fit(self, lX, ly, lV=None, batch_size=64, nepochs=200, val_epochs=1):
|
||||
print('### Gated Recurrent Unit View Generator (G)')
|
||||
if not self.is_trained:
|
||||
# Batchify input
|
||||
self.multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=self.seed)
|
||||
l_train_index, l_train_target = self.multilingual_index.l_train()
|
||||
l_val_index, l_val_target = self.multilingual_index.l_val()
|
||||
l_test_index = self.multilingual_index.l_test_index()
|
||||
batcher_train = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs,
|
||||
lpad=self.multilingual_index.l_pad())
|
||||
batcher_eval = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs,
|
||||
lpad=self.multilingual_index.l_pad())
|
||||
|
||||
# Train loop
|
||||
print('Start training')
|
||||
method_name = 'gru_view_generator'
|
||||
logfile = init_logfile_nn(method_name, self.options)
|
||||
tinit = time.time()
|
||||
for epoch in range(1, nepochs + 1):
|
||||
train_gru(model=self.model, batcher=batcher_train, ltrain_index=l_train_index, lytr=l_train_target,
|
||||
tinit=tinit, logfile=logfile, criterion=self.criterion, optim=self.optim,
|
||||
epoch=epoch, method_name=method_name, opt=self.options, ltrain_posteriors=None,
|
||||
ltrain_bert=None)
|
||||
self.lr_scheduler.step() # reduces the learning rate # TODO arg epoch?
|
||||
|
||||
# validation step
|
||||
macrof1 = test_gru(self.model, batcher_eval, l_val_index, None, None, l_val_target, tinit, epoch,
|
||||
logfile, self.criterion, 'va')
|
||||
|
||||
self.early_stop(macrof1, epoch)
|
||||
if self.test_each > 0:
|
||||
test_gru(self.model, batcher_eval, l_test_index, None, None, self.lyte, tinit, epoch,
|
||||
logfile, self.criterion, 'te')
|
||||
|
||||
if self.early_stop.STOP:
|
||||
print('[early-stop] STOP')
|
||||
print('Restoring best model...')
|
||||
break
|
||||
|
||||
self.model = self.early_stop.restore_checkpoint()
|
||||
print(f'running last {val_epochs} training epochs on the validation set')
|
||||
for val_epoch in range(1, val_epochs+1):
|
||||
batcher_train.init_offset()
|
||||
train_gru(model=self.model, batcher=batcher_train, ltrain_index=l_train_index, lytr=l_train_target,
|
||||
tinit=tinit, logfile=logfile, criterion=self.criterion, optim=self.optim,
|
||||
epoch=epoch, method_name=method_name, opt=self.options, ltrain_posteriors=None,
|
||||
ltrain_bert=None)
|
||||
self.is_trained = True
|
||||
|
||||
# Generate document embeddings in order to fit an SVM to recast them as vector for Posterior Probabilities
|
||||
lX = self._get_doc_embeddings(lX)
|
||||
# Fit a ''multi-lingual'' SVM on the generated doc embeddings
|
||||
self.posteriorEmbedder.fit(lX, ly)
|
||||
return self
|
||||
|
||||
def transform(self, lX, batch_size=64):
|
||||
lX = self._get_doc_embeddings(lX)
|
||||
return self.posteriorEmbedder.predict_proba(lX)
|
||||
|
||||
def fit_transform(self, lX, ly, lV=None):
|
||||
# TODO
|
||||
return 0
|
||||
|
||||
def _get_doc_embeddings(self, lX, batch_size=64):
|
||||
assert self.is_trained, 'Model is not trained, cannot call transform before fitting the model!'
|
||||
print('Generating document embeddings via GRU')
|
||||
lX = {}
|
||||
ly = {}
|
||||
batcher_transform = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs,
|
||||
lpad=self.multilingual_index.l_pad())
|
||||
|
||||
l_devel_index = self.multilingual_index.l_devel_index()
|
||||
l_devel_target = self.multilingual_index.l_devel_target()
|
||||
|
||||
for idx, (batch, post, bert_emb, target, lang) in enumerate(
|
||||
batcher_transform.batchify(l_devel_index, None, None, l_devel_target)):
|
||||
if lang not in lX.keys():
|
||||
lX[lang] = self.model.get_embeddings(batch, lang)
|
||||
ly[lang] = target.cpu().detach().numpy()
|
||||
else:
|
||||
lX[lang] = np.concatenate((lX[lang], self.model.get_embeddings(batch, lang)), axis=0)
|
||||
ly[lang] = np.concatenate((ly[lang], target.cpu().detach().numpy()), axis=0)
|
||||
|
||||
return lX
|
||||
|
||||
# loads the MUSE embeddings if requested, or returns empty dictionaries otherwise
|
||||
def _load_pretrained_embeddings(self, we_path, langs):
|
||||
lpretrained = lpretrained_vocabulary = self._none_dict(langs) # TODO ?
|
||||
lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1)
|
||||
lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs}
|
||||
return lpretrained, lpretrained_vocabulary
|
||||
|
||||
def _none_dict(self, langs):
|
||||
return {l:None for l in langs}
|
||||
|
||||
# instantiates the net, initializes the model parameters, and sets embeddings trainable if requested
|
||||
def _init_Net(self, xavier_uniform=True):
|
||||
model = RNNMultilingualClassifier(
|
||||
output_size=self.nC,
|
||||
hidden_size=self.hidden_size,
|
||||
lvocab_size=self.multilingual_index.l_vocabsize(),
|
||||
learnable_length=0,
|
||||
lpretrained=self.multilingual_index.l_embeddings(),
|
||||
drop_embedding_range=self.multilingual_index.sup_range,
|
||||
drop_embedding_prop=self.sup_drop,
|
||||
post_probabilities=self.posteriors
|
||||
)
|
||||
return model.cuda()
|
||||
|
||||
|
||||
class DocEmbedderList:
|
||||
|
||||
def __init__(self, *embedder_list, aggregation='concat'):
|
||||
assert aggregation in {'concat', 'mean'}, 'unknown aggregation mode, valid are "concat" and "mean"'
|
||||
if len(embedder_list)==0: embedder_list=[]
|
||||
if len(embedder_list) == 0:
|
||||
embedder_list = []
|
||||
self.embedders = embedder_list
|
||||
self.aggregation = aggregation
|
||||
print(f'Aggregation mode: {self.aggregation}')
|
||||
|
||||
def fit(self, lX, ly, lV=None):
|
||||
def fit(self, lX, ly, lV=None, tfidf=None):
|
||||
for transformer in self.embedders:
|
||||
transformer.fit(lX,ly,lV)
|
||||
_lX = lX
|
||||
if transformer.requires_tfidf:
|
||||
_lX = tfidf
|
||||
transformer.fit(_lX, ly, lV)
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
def transform(self, lX, tfidf=None):
|
||||
if self.aggregation == 'concat':
|
||||
return self.transform_concat(lX)
|
||||
return self.transform_concat(lX, tfidf)
|
||||
elif self.aggregation == 'mean':
|
||||
return self.transform_mean(lX)
|
||||
return self.transform_mean(lX, tfidf)
|
||||
|
||||
def transform_concat(self, lX):
|
||||
def transform_concat(self, lX, tfidf):
|
||||
if len(self.embedders) == 1:
|
||||
if self.embedders[0].requires_tfidf:
|
||||
lX = tfidf
|
||||
return self.embedders[0].transform(lX)
|
||||
|
||||
some_sparse = False
@@ -234,7 +473,10 @@ class DocEmbedderList:
|
||||
lZparts = {l: [] for l in langs}
|
||||
for transformer in self.embedders:
|
||||
lZ = transformer.transform(lX)
|
||||
_lX = lX
|
||||
if transformer.requires_tfidf:
|
||||
_lX = tfidf
|
||||
lZ = transformer.transform(_lX)
|
||||
for l in langs:
|
||||
Z = lZ[l]
|
||||
some_sparse = some_sparse or issparse(Z)
@@ -243,22 +485,28 @@ class DocEmbedderList:
hstacker = hstack if some_sparse else np.hstack
|
||||
return {l: hstacker(lZparts[l]) for l in langs}
|
||||
|
||||
def transform_mean(self, lX):
|
||||
def transform_mean(self, lX, tfidf):
|
||||
if len(self.embedders) == 1:
|
||||
return self.embedders[0].transform(lX)
|
||||
|
||||
langs = sorted(lX.keys())
|
||||
|
||||
lZparts = {l: None for l in langs}
|
||||
|
||||
# min_dim = min([transformer._get_output_dim() for transformer in self.embedders])
|
||||
min_dim = 300
|
||||
min_dim = 73 # TODO <---- this should be the number of target classes
|
||||
|
||||
for transformer in self.embedders:
|
||||
lZ = transformer.transform(lX)
|
||||
_lX = lX
|
||||
if transformer.requires_tfidf:
|
||||
_lX = tfidf
|
||||
lZ = transformer.transform(_lX)
|
||||
nC = min([lZ[lang].shape[1] for lang in langs])
|
||||
for l in langs:
|
||||
Z = lZ[l]
|
||||
if Z.shape[1] > min_dim:
|
||||
print(f'Space Z matrix has more dimensions ({Z.shape[1]}) than the smallest representation {min_dim}.'
|
||||
print(
|
||||
f'Space Z matrix has more dimensions ({Z.shape[1]}) than the smallest representation {min_dim}.'
|
||||
f'Applying PCA(n_components={min_dim})')
|
||||
pca = PCA(n_components=min_dim)
|
||||
Z = pca.fit(Z).transform(Z)
@@ -268,12 +516,11 @@ class DocEmbedderList:
lZparts[l] += Z
|
||||
|
||||
n_transformers = len(self.embedders)
|
||||
nC = min([lZparts[lang].shape[1] for lang in langs])
|
||||
|
||||
return {l: lZparts[l] / n_transformers for l in langs}
|
||||
|
||||
def fit_transform(self, lX, ly, lV=None):
|
||||
return self.fit(lX, ly, lV).transform(lX)
|
||||
def fit_transform(self, lX, ly, lV=None, tfidf=None):
|
||||
return self.fit(lX, ly, lV, tfidf).transform(lX, tfidf)
|
||||
|
||||
def best_params(self):
|
||||
return {'todo'}
@@ -283,11 +530,13 @@ class DocEmbedderList:
|
||||
|
||||
class FeatureSet2Posteriors:
|
||||
def __init__(self, transformer, l2=True, n_jobs=-1):
|
||||
def __init__(self, transformer, requires_tfidf=False, l2=True, n_jobs=-1):
|
||||
self.transformer = transformer
|
||||
self.l2 = l2
|
||||
self.n_jobs = n_jobs
|
||||
self.prob_classifier = MetaClassifier(SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
|
||||
self.prob_classifier = MetaClassifier(
|
||||
SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
|
||||
self.requires_tfidf = requires_tfidf
|
||||
|
||||
def fit(self, lX, ly, lV=None):
|
||||
if lV is None and hasattr(self.transformer, 'lV'):
@@ -314,7 +563,7 @@ class FeatureSet2Posteriors:
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Meta-Classifier
|
||||
# Meta-Classifier (aka second-tier learner)
|
||||
# ------------------------------------------------------------------
|
||||
class MetaClassifier:
@@ -354,8 +603,9 @@ class MetaClassifier:
def best_params(self):
|
||||
return self.model.best_params()
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Ensembling
|
||||
# Ensembling (aka Funnelling)
|
||||
# ------------------------------------------------------------------
|
||||
class Funnelling:
|
||||
def __init__(self,
@@ -368,14 +618,16 @@ class Funnelling:
self.n_jobs = meta.n_jobs
|
||||
|
||||
def fit(self, lX, ly):
|
||||
lX = self.vectorizer.fit_transform(lX, ly)
|
||||
tfidf_lX = self.vectorizer.fit_transform(lX, ly)
|
||||
lV = self.vectorizer.vocabulary()
|
||||
lZ = self.first_tier.fit_transform(lX, ly, lV)
|
||||
print('## Fitting first-tier learners!')
|
||||
lZ = self.first_tier.fit_transform(lX, ly, lV, tfidf=tfidf_lX)
|
||||
print('## Fitting meta-learner!')
|
||||
self.meta.fit(lZ, ly)
|
||||
|
||||
def predict(self, lX, ly=None):
|
||||
lX = self.vectorizer.transform(lX)
|
||||
lZ = self.first_tier.transform(lX)
|
||||
tfidf_lX = self.vectorizer.transform(lX)
|
||||
lZ = self.first_tier.transform(lX, tfidf=tfidf_lX)
|
||||
ly_ = self.meta.predict(lZ)
|
||||
return ly_
@@ -394,7 +646,6 @@ class Voting:
classifier.fit(lX, ly, lV)
|
||||
|
||||
def predict(self, lX, ly=None):
|
||||
|
||||
lP = {l: [] for l in lX.keys()}
|
||||
for classifier in self.prob_classifiers:
|
||||
lPi = classifier.predict_proba(lX)
@@ -419,7 +670,6 @@ def load_muse_embeddings(we_path, langs, n_jobs=-1):
|
||||
|
||||
def word_class_embedding_matrix(X, Y, max_label_space=300):
|
||||
print('computing supervised embeddings...')
|
||||
WCE = supervised_embeddings_tfidf(X, Y)
|
||||
WCE = zscores(WCE, axis=0)
@@ -434,8 +684,6 @@ def word_class_embedding_matrix(X, Y, max_label_space=300):
|
||||
|
||||
def XdotM(X, M, sif):
|
||||
# return X.dot(M)
|
||||
print(f'X={X.shape}, M={M.shape}')
|
||||
E = X.dot(M)
|
||||
if sif:
|
||||
print("removing pc...")
@@ -447,3 +695,134 @@ def _normalize(lX, l2=True):
return {l: normalize(X) for l, X in lX.items()} if l2 else lX
|
||||
|
||||
|
||||
class BatchGRU:
|
||||
def __init__(self, batchsize, batches_per_epoch, languages, lpad, max_pad_length=500):
|
||||
self.batchsize = batchsize
|
||||
self.batches_per_epoch = batches_per_epoch
|
||||
self.languages = languages
|
||||
self.lpad=lpad
|
||||
self.max_pad_length=max_pad_length
|
||||
self.init_offset()
|
||||
|
||||
def init_offset(self):
|
||||
self.offset = {lang: 0 for lang in self.languages}
|
||||
|
||||
def batchify(self, l_index, l_post, l_bert, llabels):
|
||||
langs = self.languages
|
||||
l_num_samples = {l:len(l_index[l]) for l in langs}
|
||||
|
||||
max_samples = max(l_num_samples.values())
|
||||
n_batches = max_samples // self.batchsize + 1 * (max_samples % self.batchsize > 0)
|
||||
if self.batches_per_epoch != -1 and self.batches_per_epoch < n_batches:
|
||||
n_batches = self.batches_per_epoch
|
||||
|
||||
for b in range(n_batches):
|
||||
for lang in langs:
|
||||
index, labels = l_index[lang], llabels[lang]
|
||||
offset = self.offset[lang]
|
||||
if offset >= l_num_samples[lang]:
|
||||
offset = 0
|
||||
limit = offset+self.batchsize
|
||||
|
||||
batch_slice = slice(offset, limit)
|
||||
batch = index[batch_slice]
|
||||
batch_labels = labels[batch_slice].toarray()
|
||||
|
||||
post = None
|
||||
bert_emb = None
|
||||
|
||||
batch = pad(batch, pad_index=self.lpad[lang], max_pad_length=self.max_pad_length)
|
||||
batch = torch.LongTensor(batch).cuda()
|
||||
target = torch.FloatTensor(batch_labels).cuda()
|
||||
|
||||
self.offset[lang] = limit
|
||||
|
||||
yield batch, post, bert_emb, target, lang
|
||||
|
||||
|
||||
def pad(index_list, pad_index, max_pad_length=None):
|
||||
pad_length = np.max([len(index) for index in index_list])
|
||||
if max_pad_length is not None:
|
||||
pad_length = min(pad_length, max_pad_length)
|
||||
for i,indexes in enumerate(index_list):
|
||||
index_list[i] = [pad_index]*(pad_length-len(indexes)) + indexes[:pad_length]
|
||||
return index_list
|
||||
|
||||
|
||||
def train_gru(model, batcher, ltrain_index, lytr, tinit, logfile, criterion, optim, epoch, method_name, opt,
|
||||
ltrain_posteriors=None, ltrain_bert=None, log_interval=10):
|
||||
_dataset_path = opt.dataset.split('/')[-1].split('_')
|
||||
dataset_id = _dataset_path[0] + _dataset_path[-1]
|
||||
|
||||
loss_history = []
|
||||
model.train()
|
||||
for idx, (batch, post, bert_emb, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, ltrain_bert, lytr)):
|
||||
optim.zero_grad()
|
||||
loss = criterion(model(batch, post, bert_emb, lang), target)
|
||||
loss.backward()
|
||||
clip_gradient(model)
|
||||
optim.step()
|
||||
loss_history.append(loss.item())
|
||||
|
||||
if idx % log_interval == 0:
|
||||
interval_loss = np.mean(loss_history[-log_interval:])
|
||||
print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, '
|
||||
f'Training Loss: {interval_loss:.6f}')
|
||||
|
||||
mean_loss = np.mean(interval_loss)
|
||||
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time.time() - tinit)
|
||||
return mean_loss
|
||||
|
||||
|
||||
def test_gru(model, batcher, ltest_index, ltest_posteriors, lte_bert, lyte, tinit, epoch, logfile, criterion, measure_prefix):
|
||||
loss_history = []
|
||||
model.eval()
|
||||
langs = sorted(ltest_index.keys())
|
||||
predictions = {l: [] for l in langs}
|
||||
yte_stacked = {l: [] for l in langs}
|
||||
batcher.init_offset()
|
||||
for batch, post, bert_emb, target, lang in tqdm(batcher.batchify(ltest_index, ltest_posteriors, lte_bert, lyte),
|
||||
desc='evaluation: '):
|
||||
logits = model(batch, post, bert_emb, lang)
|
||||
loss = criterion(logits, target).item()
|
||||
prediction = predict(logits)
|
||||
predictions[lang].append(prediction)
|
||||
yte_stacked[lang].append(target.detach().cpu().numpy())
|
||||
loss_history.append(loss)
|
||||
|
||||
ly = {l:np.vstack(yte_stacked[l]) for l in langs}
|
||||
ly_ = {l:np.vstack(predictions[l]) for l in langs}
|
||||
l_eval = evaluate(ly, ly_)
|
||||
metrics = []
|
||||
for lang in langs:
|
||||
macrof1, microf1, macrok, microk = l_eval[lang]
|
||||
metrics.append([macrof1, microf1, macrok, microk])
|
||||
if measure_prefix == 'te':
|
||||
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
|
||||
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
|
||||
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
|
||||
|
||||
mean_loss = np.mean(loss_history)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time.time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time.time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time.time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time.time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time.time() - tinit)
|
||||
|
||||
return Mf1
|
||||
|
||||
|
||||
def clip_gradient(model, clip_value=1e-1):
|
||||
params = list(filter(lambda p: p.grad is not None, model.parameters()))
|
||||
for p in params:
|
||||
p.grad.data.clamp_(-clip_value, clip_value)
|
||||
|
||||
|
||||
def init_logfile_nn(method_name, opt):
|
||||
logfile = CSVLog(opt.logfile_gru, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse'])
|
||||
logfile.set_default('dataset', opt.dataset)
|
||||
logfile.set_default('run', opt.seed)
|
||||
logfile.set_default('method', method_name)
|
||||
assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} ' \
|
||||
f'and run {opt.seed} already calculated'
|
||||
return logfile
@@ -12,6 +12,7 @@ from time import time
from tqdm import tqdm
|
||||
from util.evaluation import evaluate
|
||||
from util.file import get_file_name
|
||||
# import pickle
|
||||
|
||||
allowed_nets = {'rnn'}
@@ -34,7 +35,8 @@ def init_Net(nC, multilingual_index, xavier_uniform=True):
drop_embedding_range=multilingual_index.sup_range,
|
||||
drop_embedding_prop=opt.sup_drop,
|
||||
post_probabilities=opt.posteriors,
|
||||
only_post=only_post
|
||||
only_post=only_post,
|
||||
bert_embeddings=opt.mbert
|
||||
)
|
||||
|
||||
# weight initialization
@@ -59,6 +61,8 @@ def set_method_name():
method_name += f'-WCE'
|
||||
if opt.posteriors:
|
||||
method_name += f'-Posteriors'
|
||||
if opt.mbert:
|
||||
method_name += f'-mBert'
|
||||
if (opt.pretrained or opt.supervised) and opt.tunable:
|
||||
method_name += '-(trainable)'
|
||||
else:
@@ -77,7 +81,8 @@ def init_logfile(method_name, opt):
logfile.set_default('dataset', opt.dataset)
|
||||
logfile.set_default('run', opt.seed)
|
||||
logfile.set_default('method', method_name)
|
||||
assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} and run {opt.seed} already calculated'
|
||||
assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} ' \
|
||||
f'and run {opt.seed} already calculated'
|
||||
return logfile
@@ -90,124 +95,21 @@ def load_pretrained_embeddings(we_path, langs):
return lpretrained, lpretrained_vocabulary
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------------------------
|
||||
def main():
|
||||
|
||||
method_name = set_method_name()
|
||||
logfile = init_logfile(method_name, opt)
|
||||
|
||||
# Loading the dataset
|
||||
data = MultilingualDataset.load(opt.dataset)
|
||||
data.set_view(languages=['de', 'fr']) #, 'it', 'en']) # 'sv', 'da', 'es', 'it'])
|
||||
data.show_dimensions()
|
||||
langs = data.langs()
|
||||
l_devel_raw, l_devel_target = data.training(target_as_csr=True)
|
||||
l_test_raw, l_test_target = data.test(target_as_csr=True)
|
||||
|
||||
# Loading the MUSE pretrained embeddings (only if requested)
|
||||
lpretrained, lpretrained_vocabulary = load_pretrained_embeddings(opt.we_path, langs)
|
||||
# lpretrained_vocabulary = none_dict(langs) # do not keep track of words known in pretrained embeddings vocabulary that are also present in test set
|
||||
|
||||
# Data preparation: indexing / splitting / embedding matrices (pretrained + supervised) / posterior probs
|
||||
multilingual_index = MultilingualIndex()
|
||||
multilingual_index.index(l_devel_raw, l_devel_target, l_test_raw, lpretrained_vocabulary)
|
||||
multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=opt.seed)
|
||||
multilingual_index.embedding_matrices(lpretrained, opt.supervised)
|
||||
if opt.posteriors:
|
||||
lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=5000, store_posteriors=True) #stored_post=True) #opt.svm_max_docs)
|
||||
else:
|
||||
lPtr, lPva, lPte = None, None, None
|
||||
|
||||
# just_test = False
|
||||
# if just_test:
|
||||
#
|
||||
# model = torch.load(
|
||||
# '../checkpoint/rnn(H512)-Muse-WCE-Posteriors-(trainable)-jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle')
|
||||
# criterion = torch.nn.BCEWithLogitsLoss().cuda()
|
||||
#
|
||||
# # batcher_train = Batch(opt.batch_size, batches_per_epoch=10, languages=langs, lpad=multilingual_index.l_pad())
|
||||
#
|
||||
# batcher_eval = Batch(opt.batch_size, batches_per_epoch=-1, languages=langs, lpad=multilingual_index.l_pad())
|
||||
# l_test_index = multilingual_index.l_test_index()
|
||||
# epoch = 1
|
||||
# tinit = time()
|
||||
# test(model, batcher_eval, l_test_index, lPte, l_test_target, tinit, epoch, logfile, criterion, 'te')
|
||||
# exit('Loaded')
|
||||
|
||||
# Model initialization
|
||||
model = init_Net(data.num_categories(), multilingual_index)
|
||||
|
||||
optim = init_optimizer(model, lr=opt.lr)
|
||||
criterion = torch.nn.BCEWithLogitsLoss().cuda()
|
||||
lr_scheduler = StepLR(optim, step_size=25, gamma=0.5)
|
||||
batcher_train = Batch(opt.batch_size, batches_per_epoch=10, languages=langs, lpad=multilingual_index.l_pad())
|
||||
batcher_eval = Batch(opt.batch_size, batches_per_epoch=-1, languages=langs, lpad=multilingual_index.l_pad())
|
||||
|
||||
tinit = time()
|
||||
create_if_not_exist(opt.checkpoint_dir)
|
||||
early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience, checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}')
|
||||
|
||||
l_train_index, l_train_target = multilingual_index.l_train()
|
||||
l_val_index, l_val_target = multilingual_index.l_val()
|
||||
l_test_index = multilingual_index.l_test_index()
|
||||
|
||||
print('-'*80)
|
||||
print('Start training')
|
||||
for epoch in range(1, opt.nepochs + 1):
|
||||
train(model, batcher_train, l_train_index, lPtr, l_train_target, tinit, logfile, criterion, optim, epoch, method_name)
|
||||
lr_scheduler.step() # reduces the learning rate
|
||||
|
||||
# validation
|
||||
macrof1 = test(model, batcher_eval, l_val_index, lPva, l_val_target, tinit, epoch, logfile, criterion, 'va')
|
||||
early_stop(macrof1, epoch)
|
||||
if opt.test_each>0:
|
||||
if (opt.plotmode and (epoch==1 or epoch%opt.test_each==0)) or (not opt.plotmode and epoch%opt.test_each==0 and epoch<opt.nepochs):
|
||||
test(model, batcher_eval, l_test_index, lPte, l_test_target, tinit, epoch, logfile, criterion, 'te')
|
||||
|
||||
if early_stop.STOP:
|
||||
print('[early-stop] STOP')
|
||||
if not opt.plotmode: # with plotmode activated, early-stop is ignored
|
||||
break
|
||||
|
||||
# training is over
|
||||
# restores the best model according to the Mf1 of the validation set (only when plotmode==False)
|
||||
# stoptime = early_stop.stop_time - tinit
|
||||
# stopepoch = early_stop.best_epoch
|
||||
# logfile.add_row(epoch=stopepoch, measure=f'early-stop', value=early_stop.best_score, timelapse=stoptime)
|
||||
|
||||
if opt.plotmode==False:
|
||||
print('-' * 80)
|
||||
print('Training over. Performing final evaluation')
|
||||
|
||||
# torch.cuda.empty_cache()
|
||||
model = early_stop.restore_checkpoint()
|
||||
|
||||
if opt.val_epochs>0:
|
||||
print(f'running last {opt.val_epochs} training epochs on the validation set')
|
||||
for val_epoch in range(1, opt.val_epochs + 1):
|
||||
batcher_train.init_offset()
|
||||
train(model, batcher_train, l_val_index, lPva, l_val_target, tinit, logfile, criterion, optim, epoch+val_epoch, method_name)
|
||||
|
||||
# final test
|
||||
print('Training complete: testing')
|
||||
test(model, batcher_eval, l_test_index, lPte, l_test_target, tinit, epoch, logfile, criterion, 'te')
|
||||
|
||||
|
||||
def get_lr(optimizer):
|
||||
for param_group in optimizer.param_groups:
|
||||
return param_group['lr']
|
||||
|
||||
|
||||
def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile, criterion, optim, epoch, method_name):
|
||||
def train(model, batcher, ltrain_index, ltrain_posteriors, ltrain_bert, lytr, tinit, logfile, criterion, optim, epoch, method_name):
|
||||
_dataset_path = opt.dataset.split('/')[-1].split('_')
|
||||
dataset_id = _dataset_path[0] + _dataset_path[-1]
|
||||
|
||||
loss_history = []
|
||||
model.train()
|
||||
for idx, (batch, post, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, lytr)):
|
||||
for idx, (batch, post, bert_emb, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, ltrain_bert, lytr)):
|
||||
optim.zero_grad()
|
||||
_out = model(batch,post, lang)
|
||||
loss = criterion(model(batch, post, lang), target)
|
||||
# _out = model(batch, post, bert_emb, lang)
|
||||
loss = criterion(model(batch, post, bert_emb, lang), target)
|
||||
loss.backward()
|
||||
clip_gradient(model)
|
||||
optim.step()
@@ -222,7 +124,7 @@ def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile,
return mean_loss
|
||||
|
||||
|
||||
def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logfile, criterion, measure_prefix):
|
||||
def test(model, batcher, ltest_index, ltest_posteriors, lte_bert, lyte, tinit, epoch, logfile, criterion, measure_prefix):
|
||||
|
||||
loss_history = []
|
||||
model.eval()
@@ -230,8 +132,8 @@ def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logf
predictions = {l:[] for l in langs}
|
||||
yte_stacked = {l:[] for l in langs}
|
||||
batcher.init_offset()
|
||||
for batch, post, target, lang in tqdm(batcher.batchify(ltest_index, ltest_posteriors, lyte), desc='evaluation: '):
|
||||
logits = model(batch, post, lang)
|
||||
for batch, post, bert_emb, target, lang in tqdm(batcher.batchify(ltest_index, ltest_posteriors, lte_bert, lyte), desc='evaluation: '):
|
||||
logits = model(batch, post, bert_emb, lang)
|
||||
loss = criterion(logits, target).item()
|
||||
prediction = predict(logits)
|
||||
predictions[lang].append(prediction)
@@ -260,6 +162,122 @@ def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logf
return Mf1
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------------------------
|
||||
def main():
|
||||
DEBUGGING = False
|
||||
|
||||
method_name = set_method_name()
|
||||
logfile = init_logfile(method_name, opt)
|
||||
|
||||
# Loading the dataset
|
||||
data = MultilingualDataset.load(opt.dataset)
|
||||
# data.set_view(languages=['it', 'fr']) # Testing with less langs
|
||||
data.show_dimensions()
|
||||
langs = data.langs()
|
||||
l_devel_raw, l_devel_target = data.training(target_as_csr=True)
|
||||
l_test_raw, l_test_target = data.test(target_as_csr=True)
|
||||
|
||||
# Loading the MUSE pretrained embeddings (only if requested)
|
||||
lpretrained, lpretrained_vocabulary = load_pretrained_embeddings(opt.we_path, langs)
|
||||
# lpretrained_vocabulary = none_dict(langs) # do not keep track of words known in pretrained embeddings vocabulary that are also present in test set
|
||||
|
||||
# Data preparation: indexing / splitting / embedding matrices (pretrained + supervised) / posterior probs
|
||||
multilingual_index = MultilingualIndex()
|
||||
multilingual_index.index(l_devel_raw, l_devel_target, l_test_raw, lpretrained_vocabulary)
|
||||
multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=opt.seed)
|
||||
multilingual_index.embedding_matrices(lpretrained, opt.supervised)
|
||||
if opt.posteriors:
|
||||
if DEBUGGING:
|
||||
import pickle
|
||||
with open('/home/andreapdr/funneling_pdr/dumps/posteriors_jrc_run0.pickle', 'rb') as infile:
|
||||
data_post = pickle.load(infile)
|
||||
lPtr = data_post[0]
|
||||
lPva = data_post[1]
|
||||
lPte = data_post[2]
|
||||
print('## DEBUGGING MODE: loaded dumped posteriors for jrc run0')
|
||||
else:
|
||||
lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=5000)
|
||||
else:
|
||||
lPtr, lPva, lPte = None, None, None
|
||||
|
||||
if opt.mbert:
|
||||
_dataset_path = opt.dataset.split('/')[-1].split('_')
|
||||
_model_folder = _dataset_path[0] + '_' + _dataset_path[-1].replace('.pickle', '')
|
||||
# print(f'Model Folder: {_model_folder}')
|
||||
|
||||
if DEBUGGING:
|
||||
with open('/home/andreapdr/funneling_pdr/dumps/mBert_jrc_run0.pickle', 'rb') as infile:
|
||||
data_embed = pickle.load(infile)
|
||||
tr_bert_embeddings = data_embed[0]
|
||||
va_bert_embeddings = data_embed[1]
|
||||
te_bert_embeddings = data_embed[2]
|
||||
print('## DEBUGGING MODE: loaded dumped mBert embeddings for jrc run0')
|
||||
else:
|
||||
tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings \
|
||||
= multilingual_index.bert_embeddings(f'/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-{_model_folder}/')
|
||||
else:
|
||||
tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings = None, None, None
|
||||
|
||||
# Model initialization
|
||||
model = init_Net(data.num_categories(), multilingual_index)
|
||||
|
||||
optim = init_optimizer(model, lr=opt.lr)
|
||||
criterion = torch.nn.BCEWithLogitsLoss().cuda()
|
||||
lr_scheduler = StepLR(optim, step_size=25, gamma=0.5)
|
||||
batcher_train = Batch(opt.batch_size, batches_per_epoch=10, languages=langs, lpad=multilingual_index.l_pad())
|
||||
batcher_eval = Batch(opt.batch_size, batches_per_epoch=-1, languages=langs, lpad=multilingual_index.l_pad())
|
||||
|
||||
tinit = time()
|
||||
create_if_not_exist(opt.checkpoint_dir)
|
||||
early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience,
|
||||
checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}')
|
||||
|
||||
l_train_index, l_train_target = multilingual_index.l_train()
|
||||
l_val_index, l_val_target = multilingual_index.l_val()
|
||||
l_test_index = multilingual_index.l_test_index()
|
||||
|
||||
print('-'*80)
|
||||
print('Start training')
|
||||
for epoch in range(1, opt.nepochs + 1):
|
||||
train(model, batcher_train, l_train_index, lPtr, tr_bert_embeddings, l_train_target, tinit, logfile, criterion, optim, epoch, method_name)
|
||||
lr_scheduler.step() # reduces the learning rate
|
||||
|
||||
# validation
|
||||
macrof1 = test(model, batcher_eval, l_val_index, lPva, va_bert_embeddings, l_val_target, tinit, epoch, logfile, criterion, 'va')
|
||||
early_stop(macrof1, epoch)
|
||||
if opt.test_each>0:
|
||||
if (opt.plotmode and (epoch==1 or epoch%opt.test_each==0)) or (not opt.plotmode and epoch%opt.test_each==0 and epoch<opt.nepochs):
|
||||
test(model, batcher_eval, l_test_index, lPte, l_test_target, tinit, epoch, logfile, criterion, 'te')
|
||||
|
||||
if early_stop.STOP:
|
||||
print('[early-stop] STOP')
|
||||
if not opt.plotmode: # with plotmode activated, early-stop is ignored
|
||||
break
|
||||
|
||||
# training is over
|
||||
# restores the best model according to the Mf1 of the validation set (only when plotmode==False)
|
||||
# stoptime = early_stop.stop_time - tinit
|
||||
# stopepoch = early_stop.best_epoch
|
||||
# logfile.add_row(epoch=stopepoch, measure=f'early-stop', value=early_stop.best_score, timelapse=stoptime)
|
||||
|
||||
if opt.plotmode==False:
|
||||
print('-' * 80)
|
||||
print('Training over. Performing final evaluation')
|
||||
|
||||
# torch.cuda.empty_cache()
|
||||
model = early_stop.restore_checkpoint()
|
||||
|
||||
if opt.val_epochs>0:
|
||||
print(f'running last {opt.val_epochs} training epochs on the validation set')
|
||||
for val_epoch in range(1, opt.val_epochs + 1):
|
||||
batcher_train.init_offset()
|
||||
train(model, batcher_train, l_val_index, lPva, va_bert_embeddings, l_val_target, tinit, logfile, criterion, optim, epoch+val_epoch, method_name)
|
||||
|
||||
# final test
|
||||
print('Training complete: testing')
|
||||
test(model, batcher_eval, l_test_index, lPte, te_bert_embeddings, l_test_target, tinit, epoch, logfile, criterion, 'te')
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------------------------
|
||||
if __name__ == '__main__':
@@ -281,8 +299,6 @@ if __name__ == '__main__':
'language used to train the calibrated SVMs (only used if --posteriors is active)')
|
||||
parser.add_argument('--log-interval', type=int, default=10, metavar='int', help='how many batches to wait before printing training status')
|
||||
parser.add_argument('--log-file', type=str, default='../log/log.csv', metavar='str', help='path to the log csv file')
|
||||
# parser.add_argument('--pickle-dir', type=str, default='../pickles', metavar='str', help=f'if set, specifies the path where to '
|
||||
# f'save/load the dataset pickled (set to None if you prefer not to retain the pickle file)')
|
||||
parser.add_argument('--test-each', type=int, default=0, metavar='int', help='how many epochs to wait before invoking test (default: 0, only at the end)')
|
||||
parser.add_argument('--checkpoint-dir', type=str, default='../checkpoint', metavar='str', help='path to the directory containing checkpoints')
|
||||
parser.add_argument('--net', type=str, default='rnn', metavar='str', help=f'net, one in {allowed_nets}')
@@ -299,7 +315,9 @@ if __name__ == '__main__':
'(default 300)')
|
||||
parser.add_argument('--force', action='store_true', default=False, help='do not check if this experiment has already been run')
|
||||
parser.add_argument('--tunable', action='store_true', default=False,
|
||||
help='pretrained embeddings are tunable from the begining (default False, i.e., static)')
|
||||
help='pretrained embeddings are tunable from the beginning (default False, i.e., static)')
|
||||
parser.add_argument('--mbert', action='store_true', default=False,
|
||||
help='use mBert embeddings')
|
||||
|
||||
opt = parser.parse_args()
@@ -3,7 +3,7 @@ from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
|
||||
import numpy as np
|
||||
import torch
|
||||
from util.common import clip_gradient, predict
|
||||
from util.common import predict
|
||||
from time import time
|
||||
from util.csv_log import CSVLog
|
||||
from util.evaluation import evaluate
@@ -12,6 +12,7 @@ from torch.optim.lr_scheduler import StepLR
from sklearn.model_selection import train_test_split
|
||||
from copy import deepcopy
|
||||
import argparse
|
||||
# from torch.utils.tensorboard import SummaryWriter
|
||||
|
||||
|
||||
def check_sentences(sentences):
@@ -69,11 +70,14 @@ def get_dataset_name(datapath):
if id_split in possible_splits:
|
||||
dataset_name = splitted[0].split('/')[-1]
|
||||
return f'{dataset_name}_run{id_split}'
|
||||
elif splitted[-2].split('.')[0] == 'full':
|
||||
dataset_name = splitted[0].split('/')[-1]
|
||||
return f'{dataset_name}_fullrun'
|
||||
|
||||
|
||||
def load_datasets(datapath):
|
||||
data = MultilingualDataset.load(datapath)
|
||||
# data.set_view(languages=['it'], categories=[0, 1, 2, 3, 4]) # Testing with less langs
|
||||
# data.set_view(languages=['it']) #, categories=[0, 1, 2, 3, 4]) # Testing with less langs
|
||||
data.show_dimensions()
|
||||
|
||||
l_devel_raw, l_devel_target = data.training(target_as_csr=False)
@@ -82,7 +86,8 @@ def load_datasets(datapath):
return l_devel_raw, l_devel_target, l_test_raw, l_test_target
|
||||
|
||||
|
||||
def do_tokenization(l_dataset, max_len=512):
|
||||
def do_tokenization(l_dataset, max_len=512, verbose=True):
|
||||
if verbose:
|
||||
print('# Starting Tokenization ...')
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
|
||||
langs = l_dataset.keys()
@@ -91,7 +96,6 @@ def do_tokenization(l_dataset, max_len=512):
l_tokenized[lang] = tokenizer(l_dataset[lang],
|
||||
truncation=True,
|
||||
max_length=max_len,
|
||||
# add_special_tokens=True,
|
||||
padding='max_length')
|
||||
return l_tokenized
@@ -162,7 +166,7 @@ def check_param_grad_status(model):
print('#' * 50)
|
||||
|
||||
|
||||
def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile, val_step=False, val_dataloader=None, lang_ids=None):
|
||||
def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile, writer):
|
||||
_dataset_path = opt.dataset.split('/')[-1].split('_')
|
||||
dataset_id = _dataset_path[0] + _dataset_path[-1]
@@ -179,6 +183,10 @@ def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit,
optim.step()
|
||||
loss_history.append(loss.item())
|
||||
|
||||
if writer is not None:
|
||||
_n_step = (epoch - 1) * (len(train_dataloader)) + idx
|
||||
writer.add_scalar('Loss_step/Train', loss, _n_step)
|
||||
|
||||
# Check tokenized sentences consistency
|
||||
# check_sentences(batch.cpu())
@@ -187,16 +195,12 @@ def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit,
print(
|
||||
f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
|
||||
|
||||
# if val_step and idx % 100 == 0:
|
||||
# macrof1 = test(model, val_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va')
|
||||
# early_stop
|
||||
|
||||
mean_loss = np.mean(interval_loss)
|
||||
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
|
||||
return mean_loss
|
||||
|
||||
|
||||
def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, measure_prefix):
|
||||
def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, measure_prefix, writer):
|
||||
print('# Validating model ...')
|
||||
loss_history = []
|
||||
model.eval()
@@ -229,6 +233,8 @@ def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, mea
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
|
||||
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
|
||||
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
|
||||
if writer is not None:
|
||||
writer.add_scalars('Eval Metrics', {'Mf1': Mf1, 'mF1': mF1, 'MK': MK, 'mk':mk}, epoch)
|
||||
|
||||
mean_loss = np.mean(loss_history)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
@@ -281,6 +287,7 @@ def main():
va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=True)
|
||||
te_dataloader = DataLoader(te_dataset, batch_size=2, shuffle=False)
|
||||
|
||||
|
||||
# Initializing model
|
||||
nC = tr_dataset.get_nclasses()
|
||||
model = get_model(nC)
@@ -289,29 +296,31 @@ def main():
optim = init_optimizer(model, lr=opt.lr)
|
||||
lr_scheduler = StepLR(optim, step_size=25, gamma=0.1)
|
||||
early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience,
|
||||
checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_dataset_name(opt.dataset)}')
|
||||
# lr_scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optim, num_warmup_steps= , num_training_steps=)
|
||||
# print(model)
|
||||
checkpoint=f'/home/andreapdr/funneling_pdr/hug_checkpoint/{method_name}-{get_dataset_name(opt.dataset)}',
|
||||
is_bert=True)
|
||||
|
||||
# Freezing encoder
|
||||
# model = freeze_encoder(model)
|
||||
check_param_grad_status(model)
|
||||
|
||||
# Tensorboard logger
|
||||
# writer = SummaryWriter('../log/tensorboard_logs/')
|
||||
|
||||
# Training loop
|
||||
tinit = time()
|
||||
lang_ids = va_dataset.lang_ids
|
||||
for epoch in range(1, opt.nepochs + 1):
|
||||
print('# Start Training ...')
|
||||
train(model, tr_dataloader, epoch, criterion, optim, method_name, tinit, logfile)
|
||||
train(model, tr_dataloader, epoch, criterion, optim, method_name, tinit, logfile, writer=None)
|
||||
lr_scheduler.step() # reduces the learning rate
|
||||
|
||||
# Validation
|
||||
macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va')
|
||||
macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va', writer=None)
|
||||
early_stop(macrof1, epoch)
|
||||
if opt.test_each > 0:
|
||||
if (opt.plotmode and (epoch == 1 or epoch % opt.test_each == 0)) or (
|
||||
not opt.plotmode and epoch % opt.test_each == 0 and epoch < opt.nepochs):
|
||||
test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te')
|
||||
test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te', writer=None)
|
||||
|
||||
if early_stop.STOP:
|
||||
print('[early-stop] STOP')
@@ -323,16 +332,19 @@ def main():
print('Training over. Performing final evaluation')
|
||||
|
||||
model = early_stop.restore_checkpoint()
|
||||
model = model.cuda()
|
||||
|
||||
if opt.val_epochs > 0:
|
||||
print(f'running last {opt.val_epochs} training epochs on the validation set')
|
||||
for val_epoch in range(1, opt.val_epochs + 1):
|
||||
train(model, va_dataloader, epoch + val_epoch, criterion, optim, method_name, tinit, logfile)
|
||||
train(model, va_dataloader, epoch + val_epoch, criterion, optim, method_name, tinit, logfile, writer=None)
|
||||
|
||||
# final test
|
||||
print('Training complete: testing')
|
||||
test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te')
|
||||
test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te', writer=None)
|
||||
|
||||
# writer.flush()
|
||||
# writer.close()
|
||||
exit('Code Executed!')
|
||||
|
||||
|
||||
|
|
@@ -372,6 +384,7 @@ if __name__ == '__main__':
|
|||
# Testing different parameters ...
|
||||
opt.weight_decay = 0.01
|
||||
opt.lr = 1e-5
|
||||
opt.patience = 5
|
||||
|
||||
main()
|
||||
# TODO: refactor .cuda() -> .to(device) in order to check if the process is faster on CPU given the bigger batch size
|
||||
|
|
@@ -0,0 +1,110 @@
|
|||
from main_mbert import *
|
||||
import pickle
|
||||
|
||||
|
||||
class ExtractorDataset(Dataset):
|
||||
"""
|
||||
data: dict of lang specific tokenized data
|
||||
labels: dict of lang specific targets
|
||||
"""
|
||||
|
||||
def __init__(self, data):
|
||||
self.langs = data.keys()
|
||||
self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)}
|
||||
|
||||
for i, lang in enumerate(self.langs):
|
||||
_data = data[lang]['input_ids']
|
||||
_data = np.array(_data)
|
||||
_lang_value = np.full(len(_data), self.lang_ids[lang])
|
||||
|
||||
if i == 0:
|
||||
self.data = _data
|
||||
self.lang_index = _lang_value
|
||||
else:
|
||||
self.data = np.vstack((self.data, _data))
|
||||
self.lang_index = np.concatenate((self.lang_index, _lang_value))
|
||||
|
||||
def __len__(self):
|
||||
return len(self.data)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
x = self.data[idx]
|
||||
lang = self.lang_index[idx]
|
||||
|
||||
return x, lang
|
||||
|
||||
def get_lang_ids(self):
|
||||
return self.lang_ids
|
||||
|
||||
|
||||
def feature_extractor(data, lang_ids, model_path='/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run0/'):
|
||||
print('# Feature Extractor Mode...')
|
||||
from transformers import BertConfig
|
||||
config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True, num_labels=300)
|
||||
model = BertForSequenceClassification.from_pretrained(model_path,
|
||||
config=config).cuda()
|
||||
|
||||
"""
|
||||
Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for
|
||||
the output of each layer) of shape (batch_size, sequence_length, hidden_size)
|
||||
"""
|
||||
all_batch_embeddings = {}
|
||||
id2lang = {v:k for k,v in lang_ids.items()}
|
||||
with torch.no_grad():
|
||||
for batch, target, lang_idx in data:
|
||||
out = model(batch.cuda())
|
||||
last_hidden_state = out[1][-1]
|
||||
batch_embeddings = last_hidden_state[:, 0, :]
|
||||
for i, l_idx in enumerate(lang_idx.numpy()):
|
||||
if id2lang[l_idx] not in all_batch_embeddings.keys():
|
||||
all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy()
|
||||
else:
|
||||
all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]],
|
||||
batch_embeddings[i].detach().cpu().numpy()))
|
||||
|
||||
return all_batch_embeddings, id2lang
|
||||
|
||||
|
||||
def main():
|
||||
print('Running main ...')
|
||||
print(f'Model path: {opt.modelpath}\nDataset path: {opt.dataset}')
|
||||
DATAPATH = opt.dataset
|
||||
MAX_LEN = 512
|
||||
|
||||
l_devel_raw, l_devel_target, l_test_raw, l_test_target = load_datasets(DATAPATH)
|
||||
l_tokenized_tr = do_tokenization(l_devel_raw, max_len=MAX_LEN)
|
||||
l_tokenized_te = do_tokenization(l_test_raw, max_len=MAX_LEN)
|
||||
|
||||
tr_dataset = TrainingDataset(l_tokenized_tr, l_devel_target)
|
||||
tr_lang_ids = tr_dataset.lang_ids
|
||||
|
||||
te_dataset = TrainingDataset(l_tokenized_te, l_test_target)
|
||||
te_lang_ids = te_dataset.lang_ids
|
||||
|
||||
tr_dataloader = DataLoader(tr_dataset, batch_size=64, shuffle=False) # Shuffle False to extract doc embeddings
|
||||
te_dataloader = DataLoader(te_dataset, batch_size=64, shuffle=False) # Shuffle False to extract doc
|
||||
|
||||
tr_all_batch_embeddings, id2lang_tr = feature_extractor(tr_dataloader, tr_lang_ids, opt.modelpath) # Extracting doc embed for devel
|
||||
with open(f'{opt.modelpath}/TR_embed_{get_dataset_name(opt.dataset)}.pkl', 'wb') as outfile:
|
||||
pickle.dump((tr_all_batch_embeddings, id2lang_tr), outfile)
|
||||
|
||||
te_all_batch_embeddings, id2lang_te = feature_extractor(te_dataloader, te_lang_ids, opt.modelpath) # Extracting doc embed for test
|
||||
with open(f'{opt.modelpath}/TE_embed_{get_dataset_name(opt.dataset)}.pkl', 'wb') as outfile:
|
||||
pickle.dump((te_all_batch_embeddings, id2lang_te), outfile)
|
||||
|
||||
exit('Extraction completed!')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description='mBert model document embedding extractor')
|
||||
|
||||
parser.add_argument('--dataset', type=str,
|
||||
default='/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle',
|
||||
metavar='datasetpath', help=f'path to the pickled dataset')
|
||||
parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)')
|
||||
parser.add_argument('--modelpath', type=str, default='/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run0',
|
||||
metavar='modelpath', help=f'path to pre-trained mBert model')
|
||||
opt = parser.parse_args()
|
||||
|
||||
main()
|
||||
|
||||
|
|
@@ -2,102 +2,41 @@ import os
|
|||
from dataset_builder import MultilingualDataset
|
||||
from learning.transformers import *
|
||||
from util.evaluation import *
|
||||
from optparse import OptionParser
|
||||
from util.file import exists
|
||||
from util.results import PolylingualClassificationResults
|
||||
from sklearn.svm import SVC
|
||||
|
||||
|
||||
parser = OptionParser(usage="usage: %prog datapath [options]")
|
||||
|
||||
parser.add_option("-o", "--output", dest="output",
|
||||
help="Result file", type=str, default='multiModal_log.csv')
|
||||
|
||||
parser.add_option("-P", "--posteriors", dest="posteriors", action='store_true',
|
||||
help="Add posterior probabilities to the document embedding representation", default=False)
|
||||
|
||||
parser.add_option("-S", "--supervised", dest="supervised", action='store_true',
|
||||
help="Add supervised (Word-Class Embeddings) to the document embedding representation", default=False)
|
||||
|
||||
parser.add_option("-U", "--pretrained", dest="pretrained", action='store_true',
|
||||
help="Add pretrained MUSE embeddings to the document embedding representation", default=False)
|
||||
|
||||
parser.add_option("--l2", dest="l2", action='store_true',
|
||||
help="Activates l2 normalization as a post-processing for the document embedding views", default=False)
|
||||
|
||||
parser.add_option("--allprob", dest="allprob", action='store_true',
|
||||
help="All views are generated as posterior probabilities. This affects the supervised and pretrained "
|
||||
"embeddings, for which a calibrated classifier is generated, which generates the posteriors", default=False)
|
||||
|
||||
parser.add_option("--feat-weight", dest="feat_weight",
|
||||
help="Term weighting function to weight the averaged embeddings", type=str, default='tfidf')
|
||||
|
||||
parser.add_option("-w", "--we-path", dest="we_path",
|
||||
help="Path to the MUSE polylingual word embeddings", default='../embeddings')
|
||||
|
||||
parser.add_option("-s", "--set_c", dest="set_c",type=float,
|
||||
help="Set the C parameter", default=1)
|
||||
|
||||
parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
|
||||
help="Optimize hyperparameters", default=False)
|
||||
|
||||
parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,
|
||||
help="Number of parallel jobs (default is -1, all)", default=-1)
|
||||
|
||||
parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
|
||||
help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
|
||||
default=300)
|
||||
|
||||
parser.add_option("-r", "--remove-pc", dest="sif", action='store_true',
|
||||
help="Remove common component when computing dot product of word embedding matrices", default=False)
|
||||
|
||||
parser.add_option("-z", "--zscore", dest="zscore", action='store_true',
|
||||
help="Z-score normalize matrices (WCE and MUSE)", default=False)
|
||||
|
||||
parser.add_option("-a", "--agg", dest="agg", action='store_true',
|
||||
help="Set aggregation function of the common Z-space to average (Default: concatenation)", default=False)
|
||||
|
||||
|
||||
|
||||
def get_learner(calibrate=False, kernel='linear'):
|
||||
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, gamma='auto')
|
||||
|
||||
def get_params():
|
||||
if not op.optimc:
|
||||
return None
|
||||
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
|
||||
kernel = 'rbf'
|
||||
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
|
||||
|
||||
|
||||
#######################################################################################################################
|
||||
|
||||
from util.common import *
|
||||
from util.parser_options import *
|
||||
|
||||
if __name__ == '__main__':
|
||||
(op, args) = parser.parse_args()
|
||||
|
||||
assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)'
|
||||
dataset = args[0]
|
||||
dataset = op.dataset
|
||||
assert exists(dataset), 'Unable to find file '+str(dataset)
|
||||
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
|
||||
assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
|
||||
assert op.posteriors or op.supervised or op.pretrained or op.mbert or op.gruViewGenerator, \
|
||||
'empty set of document embeddings is not allowed'
|
||||
assert not (op.gruWCE or op.gruMUSE) or op.gruViewGenerator, 'Initializing a Gated Recurrent embedding layer ' \
'(--gruMuse/--gruWce) requires explicit initialization of the GRU View Generator (-G)'
|
||||
|
||||
l2 = op.l2
|
||||
|
||||
dataset_file = os.path.basename(dataset)
|
||||
|
||||
results = PolylingualClassificationResults('../log/' + op.output)
|
||||
allprob = 'Prob' if op.allprob else ''
|
||||
result_id = f'{dataset_file}_ProbPost={op.posteriors}_{allprob}WCE={op.supervised}(PCA={op.max_labels_S})_{allprob}' \
|
||||
f'MUSE={op.pretrained}_weight={op.feat_weight}_l2={l2}_zscore={op.zscore}{"_optimC" if op.optimc else ""}'
|
||||
print(f'{result_id}')
|
||||
|
||||
# renaming arguments to be printed on log
|
||||
method_name, dataset_name = get_method_name(dataset, op.posteriors, op.supervised, op.pretrained, op.mbert,
|
||||
op.gruViewGenerator, op.gruMUSE, op.gruWCE, op.agg, op.allprob)
|
||||
print(f'Method: gFun{method_name}\nDataset: {dataset_name}')
|
||||
print('-'*50)
|
||||
exit()
|
||||
|
||||
# set z-score range - with slice(0, 0) the mean is 0 and the std is 1, so normalization has no effect
|
||||
standardize_range = slice(0, 0)
|
||||
if op.zscore:
|
||||
standardize_range = None
|
||||
|
||||
# load dataset
|
||||
data = MultilingualDataset.load(dataset)
|
||||
# data.set_view(languages=['fr', 'it'])
|
||||
# data.set_view(languages=['fr', 'it']) # TODO: DEBUG SETTING
|
||||
data.show_dimensions()
|
||||
lXtr, lytr = data.training()
|
||||
lXte, lyte = data.test()
|
||||
|
|
@@ -108,68 +47,96 @@ if __name__ == '__main__':
|
|||
# feature weighting (for word embeddings average)
|
||||
feat_weighting = FeatureWeight(op.feat_weight, agg='mean')
|
||||
|
||||
# # document embedding modules
|
||||
# document embedding modules aka View Generators
|
||||
doc_embedder = DocEmbedderList(aggregation='mean' if op.agg else 'concat')
|
||||
|
||||
# init View Generators
|
||||
if op.posteriors:
|
||||
doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True, kernel='linear'), l2=l2))
|
||||
"""
|
||||
View Generator (-X): casts document representations encoded via TFIDF into posterior probabilities by means
of a set of SVMs.
|
||||
"""
|
||||
doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True,
|
||||
kernel='linear',
|
||||
C=op.set_c), l2=l2))
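# Illustrative, standalone sketch of the -X view described above: a calibrated one-vs-rest SVM is
# fitted on the TFIDF matrix of a single language, and its per-class posterior probabilities become a
# |C|-dimensional document view. Names (Xtr_tfidf, Ytr, Xte_tfidf) are placeholders, not variables of this script.
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

def posterior_view(Xtr_tfidf, Ytr, Xte_tfidf):
    first_tier = OneVsRestClassifier(CalibratedClassifierCV(LinearSVC(), cv=3))
    first_tier.fit(Xtr_tfidf, Ytr)               # one calibrated binary SVM per class
    return first_tier.predict_proba(Xte_tfidf)   # (n_docs, n_classes) posterior view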
|
||||
|
||||
if op.supervised:
|
||||
"""
|
||||
View Generator (-W): generates document representations via Word-Class Embeddings (WCE).
Document embeddings are obtained via a weighted sum of the embeddings of the document's constituent terms.
|
||||
"""
|
||||
wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, sif=op.sif)
|
||||
if op.allprob:
|
||||
wce = FeatureSet2Posteriors(wce, l2=l2)
|
||||
wce = FeatureSet2Posteriors(wce, requires_tfidf=True, l2=l2)
|
||||
doc_embedder.append(wce)
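# Rough sketch of the Word-Class Embedding idea behind the -W view: every term gets a |C|-dimensional
# profile of term-class association, and a document view is the tfidf-weighted combination of the
# profiles of its terms. X (n_docs x n_terms) and Y (n_docs x n_classes) are assumed to be dense
# numpy arrays here for simplicity; this is not the WordClassEmbedder implementation itself.
import numpy as np

def wce_document_view(X, Y):
    term_class = X.T.dot(Y)                                            # (n_terms, n_classes)
    row_sums = np.maximum(term_class.sum(axis=1, keepdims=True), 1e-9)
    term_class = term_class / row_sums                                 # normalized class profiles per term
    return X.dot(term_class)                                           # (n_docs, n_classes) document view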
|
||||
|
||||
if op.pretrained:
|
||||
"""
|
||||
View Generator (-M): generates document representations via MUSE embeddings (fastText multilingual word
embeddings). Document embeddings are obtained via a weighted sum of the embeddings of the document's constituent terms.
|
||||
"""
|
||||
muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif)
|
||||
if op.allprob:
|
||||
muse = FeatureSet2Posteriors(muse, l2=l2)
|
||||
muse = FeatureSet2Posteriors(muse, requires_tfidf=True, l2=l2)
|
||||
doc_embedder.append(muse)
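# Minimal sketch of the -M view: a document is embedded as the tfidf-weighted average of the MUSE vectors
# of its terms. E is a (n_terms x dim) embedding matrix aligned with the tfidf vocabulary; both names are
# placeholders, not objects of this script.
import numpy as np

def muse_document_view(X_tfidf, E):
    weighted_sum = np.asarray(X_tfidf.dot(E))                                   # (n_docs, dim)
    weights = np.maximum(np.asarray(X_tfidf.sum(axis=1)).reshape(-1, 1), 1e-9)
    return weighted_sum / weights                                               # weighted average per document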
|
||||
|
||||
if op.gruViewGenerator:
|
||||
"""
|
||||
View Generator (-G): generates document embeddings by means of Gated Recurrent Units. The model can be
initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, etc.). Such
document embeddings are then cast into vectors of posterior probabilities via a set of SVMs.
NB: --allprob has no effect on this view generator, since its output is already encoded as posterior probabilities.
|
||||
"""
|
||||
rnn_embedder = RecurrentEmbedder(pretrained=op.gruMUSE, supervised=op.gruWCE, multilingual_dataset=data,
|
||||
options=op, model_path=op.gru_path)
|
||||
doc_embedder.append(rnn_embedder)
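# Bare-bones sketch of the recurrent (-G) view generator: a GRU reads the embedded token sequence and its
# final hidden state acts as the document embedding, which is then mapped to |C| scores. Sizes are
# illustrative defaults, not the ones used by RecurrentEmbedder.
import torch
import torch.nn as nn

class TinyGRUEncoder(nn.Module):
    def __init__(self, vocab_size=30000, emb_dim=300, hidden_size=512, n_classes=300):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim)
        self.gru = nn.GRU(emb_dim, hidden_size, batch_first=True)
        self.classify = nn.Linear(hidden_size, n_classes)

    def forward(self, token_ids):                  # token_ids: (batch, seq_len) of int64
        _, h_n = self.gru(self.embed(token_ids))   # h_n: (1, batch, hidden_size)
        return self.classify(h_n.squeeze(0))       # (batch, n_classes) logits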
|
||||
|
||||
if op.mbert:
|
||||
"""
|
||||
View generator (-B): generates document embedding via mBERT model.
|
||||
"""
|
||||
mbert = MBertEmbedder(path_to_model=op.bert_path,
|
||||
nC=data.num_categories())
|
||||
if op.allprob:
|
||||
mbert = FeatureSet2Posteriors(mbert, l2=l2)
|
||||
doc_embedder.append(mbert)
|
||||
|
||||
# metaclassifier
|
||||
meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
|
||||
meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'), meta_parameters=get_params(), standardize_range=standardize_range)
|
||||
meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf', C=op.set_c),
|
||||
meta_parameters=get_params(op.optimc), standardize_range=standardize_range)
|
||||
|
||||
# ensembling the modules
|
||||
classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)
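# The gist of the funnelling step wired up above, in a hypothetical standalone form: the per-language views
# are stacked into a single language-agnostic space on which one meta SVM is trained. l_views maps each
# language to its (n_docs x z_dim) view matrix and l_labels to its (n_docs x n_classes) targets; both are
# placeholders, not the actual MetaClassifier API.
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

def fit_meta_classifier(l_views, l_labels):
    langs = sorted(l_views.keys())
    Z = np.vstack([l_views[lang] for lang in langs])        # shared Z-space across languages
    Y = np.vstack([l_labels[lang] for lang in langs])
    meta = OneVsRestClassifier(SVC(kernel='rbf', gamma='auto'))
    return meta.fit(Z, Y)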
|
||||
|
||||
print('# Fitting ...')
|
||||
print('\n# Fitting Funnelling Architecture...')
|
||||
tinit = time.time()
|
||||
classifier.fit(lXtr, lytr)
|
||||
exec_time = time.time() - tinit  # avoid shadowing the imported time module
|
||||
|
||||
print('\n# Evaluating ...')
|
||||
l_eval = evaluate_method(classifier, lXte, lyte)
|
||||
|
||||
# renaming arguments to be printed on log
|
||||
_id = ''
|
||||
_id_conf = [op.posteriors, op.supervised, op.pretrained]
|
||||
_id_name = ['+P', '+W', '+M']
|
||||
for i, conf in enumerate(_id_conf):
|
||||
if conf:
|
||||
_id += _id_name[i]
|
||||
_id = _id.lstrip('+')
|
||||
_id = _id if not op.agg else _id + '_mean'
|
||||
_id = _id if not op.allprob else _id + '_allprob'
|
||||
|
||||
_dataset_path = dataset.split('/')[-1].split('_')
|
||||
dataset_id = _dataset_path[0] + _dataset_path[-1]
|
||||
|
||||
metrics = []
|
||||
for lang in lXte.keys():
|
||||
macrof1, microf1, macrok, microk = l_eval[lang]
|
||||
metrics.append([macrof1, microf1, macrok, microk])
|
||||
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
|
||||
results.add_row(method='MultiModal',
|
||||
learner='svm',
|
||||
learner='SVM',
|
||||
optimp=op.optimc,
|
||||
sif=op.sif,
|
||||
zscore=op.zscore,
|
||||
l2=op.l2,
|
||||
wescaler=op.feat_weight,
|
||||
pca=op.max_labels_S,
|
||||
id=_id,
|
||||
dataset=dataset_id,
|
||||
time='todo',
|
||||
id=method_name,
|
||||
dataset=dataset_name,
|
||||
time=exec_time,
|
||||
lang=lang,
|
||||
macrof1=macrof1,
|
||||
microf1=microf1,
|
||||
macrok=macrok,
|
||||
microk=microk,
|
||||
notes='')
|
||||
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
|
||||
print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3))
|
||||
|
|
|
|||
|
|
@@ -0,0 +1,49 @@
|
|||
import os
|
||||
from dataset_builder import MultilingualDataset
|
||||
from optparse import OptionParser
|
||||
from util.file import exists
|
||||
import numpy as np
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
|
||||
parser = OptionParser(usage="usage: %prog datapath [options]")
|
||||
|
||||
(op, args) = parser.parse_args()
|
||||
assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)'
|
||||
dataset = args[0]
|
||||
assert exists(dataset), 'Unable to find file '+str(dataset)
|
||||
|
||||
dataset_file = os.path.basename(dataset)
|
||||
|
||||
data = MultilingualDataset.load(dataset)
|
||||
data.set_view(languages=['it'])
|
||||
data.show_dimensions()
|
||||
lXtr, lytr = data.training()
|
||||
lXte, lyte = data.test()
|
||||
|
||||
vect_lXtr = dict()
|
||||
vectorizer = CountVectorizer()
|
||||
vect_lXtr['it'] = vectorizer.fit_transform(lXtr['it'])
|
||||
# print(type(vect_lXtr['it']))
|
||||
|
||||
corr = vect_lXtr['it'].T.dot(lytr['it'])
|
||||
# print(corr.shape)
|
||||
sum_correlated_class = corr.sum(axis=0)
|
||||
print(len(sum_correlated_class))
|
||||
print(sum_correlated_class.max())
|
||||
|
||||
|
||||
w2idx = vectorizer.vocabulary_
|
||||
idx2w = {v:k for k,v in w2idx.items()}
|
||||
|
||||
word_tot_corr = corr.sum(axis=1)
|
||||
print(word_tot_corr.shape)
|
||||
dict_word_tot_corr = {v:k for k,v in enumerate(word_tot_corr)}
|
||||
|
||||
sorted_word_tot_corr = np.sort(word_tot_corr)
|
||||
sorted_word_tot_corr = sorted_word_tot_corr[len(sorted_word_tot_corr)-200:]
|
||||
|
||||
top_idx = [dict_word_tot_corr[k] for k in sorted_word_tot_corr]
|
||||
print([idx2w[idx] for idx in top_idx])
|
||||
print([elem for elem in top_idx])
|
||||
print(corr[8709])
|
||||
print('Finished...')
|
||||
|
|
@@ -8,7 +8,8 @@ from models.helpers import *
|
|||
class RNNMultilingualClassifier(nn.Module):
|
||||
|
||||
def __init__(self, output_size, hidden_size, lvocab_size, learnable_length, lpretrained=None,
|
||||
drop_embedding_range=None, drop_embedding_prop=0, post_probabilities=True, only_post=False):
|
||||
drop_embedding_range=None, drop_embedding_prop=0, post_probabilities=True, only_post=False,
|
||||
bert_embeddings=False):
|
||||
|
||||
super(RNNMultilingualClassifier, self).__init__()
|
||||
self.output_size = output_size
|
||||
|
|
@@ -16,6 +17,7 @@ class RNNMultilingualClassifier(nn.Module):
|
|||
self.drop_embedding_range = drop_embedding_range
|
||||
self.drop_embedding_prop = drop_embedding_prop
|
||||
self.post_probabilities = post_probabilities
|
||||
self.bert_embeddings = bert_embeddings
|
||||
assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: wrong range'
|
||||
|
||||
self.lpretrained_embeddings = nn.ModuleDict()
|
||||
|
|
@@ -56,19 +58,24 @@ class RNNMultilingualClassifier(nn.Module):
|
|||
|
||||
if only_post:
|
||||
self.label = nn.Linear(output_size, output_size)
|
||||
elif post_probabilities:
|
||||
elif post_probabilities and not bert_embeddings:
|
||||
self.label = nn.Linear(ff2 + output_size, output_size)
|
||||
elif bert_embeddings and not post_probabilities:
|
||||
self.label = nn.Linear(ff2 + 768, output_size)
|
||||
elif post_probabilities and bert_embeddings:
|
||||
self.label = nn.Linear(ff2 + output_size + 768, output_size)
|
||||
else:
|
||||
self.label = nn.Linear(ff2, output_size)
|
||||
|
||||
|
||||
def forward(self, input, post, lang):
|
||||
def forward(self, input, post, bert_embed, lang):
|
||||
if self.only_post:
|
||||
doc_embedding = post
|
||||
else:
|
||||
doc_embedding = self.transform(input, lang)
|
||||
if self.post_probabilities:
|
||||
doc_embedding = torch.cat([doc_embedding, post], dim=1)
|
||||
if self.bert_embeddings:
|
||||
doc_embedding = torch.cat([doc_embedding, bert_embed], dim=1)
|
||||
|
||||
logits = self.label(doc_embedding)
|
||||
return logits
|
||||
|
|
@@ -94,3 +101,14 @@ class RNNMultilingualClassifier(nn.Module):
|
|||
self.lpretrained_embeddings[l].requires_grad = True
|
||||
self.lpretrained_embeddings[l].weight.requires_grad = True
|
||||
|
||||
def get_embeddings(self, input, lang):
|
||||
batch_size = input.shape[0]
|
||||
input = embed(self, input, lang)
|
||||
input = embedding_dropout(input, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop,
|
||||
training=self.training)
|
||||
input = input.permute(1, 0, 2)
|
||||
h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size).cuda())
|
||||
output, _ = self.rnn(input, h_0)
|
||||
output = output[-1, :, :]
|
||||
return output.cpu().detach().numpy()
|
||||
|
||||
|
|
|
|||
|
|
@@ -0,0 +1,249 @@
|
|||
from copy import deepcopy
|
||||
import torch
|
||||
from torch.utils.data import Dataset
|
||||
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, BertConfig
|
||||
from sklearn.model_selection import train_test_split
|
||||
from util.evaluation import *
|
||||
from time import time
|
||||
|
||||
|
||||
def predict(logits, classification_type='multilabel'):
|
||||
if classification_type == 'multilabel':
|
||||
prediction = torch.sigmoid(logits) > 0.5
|
||||
elif classification_type == 'singlelabel':
|
||||
prediction = torch.argmax(logits, dim=1).view(-1, 1)
|
||||
else:
|
||||
print('unknown classification type')
|
||||
|
||||
return prediction.detach().cpu().numpy()
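# A quick illustration of the multilabel branch above (toy values): logits whose sigmoid exceeds 0.5
# are mapped to positive labels, one decision per (document, class) pair.
#   >>> predict(torch.tensor([[2.0, -1.0, 0.3], [-0.5, 0.1, -2.0]]))
#   array([[ True, False,  True],
#          [False,  True, False]])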
|
||||
|
||||
|
||||
class TrainingDataset(Dataset):
|
||||
"""
|
||||
data: dict of lang specific tokenized data
|
||||
labels: dict of lang specific targets
|
||||
"""
|
||||
|
||||
def __init__(self, data, labels):
|
||||
self.langs = data.keys()
|
||||
self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)}
|
||||
|
||||
for i, lang in enumerate(self.langs):
|
||||
_data = data[lang]['input_ids']
|
||||
_data = np.array(_data)
|
||||
_labels = labels[lang]
|
||||
_lang_value = np.full(len(_data), self.lang_ids[lang])
|
||||
|
||||
if i == 0:
|
||||
self.data = _data
|
||||
self.labels = _labels
|
||||
self.lang_index = _lang_value
|
||||
else:
|
||||
self.data = np.vstack((self.data, _data))
|
||||
self.labels = np.vstack((self.labels, _labels))
|
||||
self.lang_index = np.concatenate((self.lang_index, _lang_value))
|
||||
|
||||
def __len__(self):
|
||||
return len(self.data)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
x = self.data[idx]
|
||||
y = self.labels[idx]
|
||||
lang = self.lang_index[idx]
|
||||
|
||||
return x, torch.tensor(y, dtype=torch.float), lang
|
||||
|
||||
def get_lang_ids(self):
|
||||
return self.lang_ids
|
||||
|
||||
def get_nclasses(self):
|
||||
if hasattr(self, 'labels'):
|
||||
return len(self.labels[0])
|
||||
else:
|
||||
print('Method called before init!')
|
||||
|
||||
|
||||
class ExtractorDataset(Dataset):
|
||||
"""
|
||||
data: dict of lang specific tokenized data
|
||||
labels: dict of lang specific targets
|
||||
"""
|
||||
|
||||
def __init__(self, data):
|
||||
self.langs = data.keys()
|
||||
self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)}
|
||||
|
||||
for i, lang in enumerate(self.langs):
|
||||
_data = data[lang]['input_ids']
|
||||
_data = np.array(_data)
|
||||
_lang_value = np.full(len(_data), self.lang_ids[lang])
|
||||
|
||||
if i == 0:
|
||||
self.data = _data
|
||||
self.lang_index = _lang_value
|
||||
else:
|
||||
self.data = np.vstack((self.data, _data))
|
||||
self.lang_index = np.concatenate((self.lang_index, _lang_value))
|
||||
|
||||
def __len__(self):
|
||||
return len(self.data)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
x = self.data[idx]
|
||||
lang = self.lang_index[idx]
|
||||
|
||||
return x, lang
|
||||
|
||||
def get_lang_ids(self):
|
||||
return self.lang_ids
|
||||
|
||||
|
||||
def get_model(n_out):
|
||||
print('# Initializing model ...')
|
||||
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=n_out)
|
||||
return model
|
||||
|
||||
|
||||
def init_optimizer(model, lr, weight_decay=0):
|
||||
no_decay = ['bias', 'LayerNorm.weight']
|
||||
optimizer_grouped_parameters = [
|
||||
{'params': [p for n, p in model.named_parameters()
|
||||
if not any(nd in n for nd in no_decay)],
|
||||
'weight_decay': weight_decay},
|
||||
{'params': [p for n, p in model.named_parameters()
|
||||
if any(nd in n for nd in no_decay)],
|
||||
'weight_decay': 0.0}  # bias and LayerNorm parameters are exempt from weight decay
|
||||
]
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
|
||||
return optimizer
|
||||
|
||||
|
||||
def get_lr(optimizer):
|
||||
for param_group in optimizer.param_groups:
|
||||
return param_group['lr']
|
||||
|
||||
|
||||
def get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop, max_val, seed):
|
||||
l_split_va = deepcopy(l_tokenized_tr)
|
||||
l_split_val_target = {l: [] for l in l_tokenized_tr.keys()}
|
||||
l_split_tr = deepcopy(l_tokenized_tr)
|
||||
l_split_tr_target = {l: [] for l in l_tokenized_tr.keys()}
|
||||
|
||||
for lang in l_tokenized_tr.keys():
|
||||
val_size = int(min(len(l_tokenized_tr[lang]['input_ids']) * val_prop, max_val))
|
||||
l_split_tr[lang]['input_ids'], l_split_va[lang]['input_ids'], l_split_tr_target[lang], l_split_val_target[
|
||||
lang] = \
|
||||
train_test_split(l_tokenized_tr[lang]['input_ids'], l_devel_target[lang], test_size=val_size,
|
||||
random_state=seed, shuffle=True)
|
||||
|
||||
return l_split_tr, l_split_tr_target, l_split_va, l_split_val_target
|
||||
|
||||
|
||||
def do_tokenization(l_dataset, max_len=512, verbose=True):
|
||||
if verbose:
|
||||
print('# Starting Tokenization ...')
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
|
||||
langs = l_dataset.keys()
|
||||
l_tokenized = {}
|
||||
for lang in langs:
|
||||
l_tokenized[lang] = tokenizer(l_dataset[lang],
|
||||
truncation=True,
|
||||
max_length=max_len,
|
||||
padding='max_length')
|
||||
return l_tokenized
|
||||
|
||||
|
||||
def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile, log_interval=10):
|
||||
# _dataset_path = opt.dataset.split('/')[-1].split('_')
|
||||
# dataset_id = _dataset_path[0] + _dataset_path[-1]
|
||||
dataset_id = 'TODO fix this!'
|
||||
|
||||
loss_history = []
|
||||
model.train()
|
||||
|
||||
for idx, (batch, target, lang_idx) in enumerate(train_dataloader):
|
||||
optim.zero_grad()
|
||||
out = model(batch.cuda())
|
||||
logits = out[0]
|
||||
loss = criterion(logits, target.cuda())
|
||||
loss.backward()
|
||||
# clip_gradient(model)
|
||||
optim.step()
|
||||
loss_history.append(loss.item())
|
||||
|
||||
if idx % log_interval == 0:
|
||||
interval_loss = np.mean(loss_history[-log_interval:])  # mean loss over the most recent logging interval
|
||||
print(
|
||||
f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
|
||||
|
||||
mean_loss = np.mean(loss_history)  # mean training loss over the whole epoch
|
||||
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
|
||||
return mean_loss
|
||||
|
||||
|
||||
def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, measure_prefix):
|
||||
print('# Validating model ...')
|
||||
loss_history = []
|
||||
model.eval()
|
||||
langs = lang_ids.keys()
|
||||
id_2_lang = {v: k for k, v in lang_ids.items()}
|
||||
predictions = {l: [] for l in langs}
|
||||
yte_stacked = {l: [] for l in langs}
|
||||
|
||||
for batch, target, lang_idx in test_dataloader:
|
||||
out = model(batch.cuda())
|
||||
logits = out[0]
|
||||
loss = criterion(logits, target.cuda()).item()
|
||||
prediction = predict(logits)
|
||||
loss_history.append(loss)
|
||||
|
||||
# Assigning prediction to dict in predictions and yte_stacked according to lang_idx
|
||||
for i, pred in enumerate(prediction):
|
||||
lang_pred = id_2_lang[lang_idx.numpy()[i]]
|
||||
predictions[lang_pred].append(pred)
|
||||
yte_stacked[lang_pred].append(target[i].detach().cpu().numpy())
|
||||
|
||||
ly = {l: np.vstack(yte_stacked[l]) for l in langs}
|
||||
ly_ = {l: np.vstack(predictions[l]) for l in langs}
|
||||
l_eval = evaluate(ly, ly_)
|
||||
metrics = []
|
||||
for lang in langs:
|
||||
macrof1, microf1, macrok, microk = l_eval[lang]
|
||||
metrics.append([macrof1, microf1, macrok, microk])
|
||||
if measure_prefix == 'te':
|
||||
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
|
||||
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
|
||||
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
|
||||
|
||||
mean_loss = np.mean(loss_history)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit)
|
||||
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit)
|
||||
|
||||
return Mf1
|
||||
|
||||
|
||||
def feature_extractor(data, lang_ids, model):
|
||||
print('# Feature Extractor Mode...')
|
||||
"""
|
||||
Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for
|
||||
the output of each layer) of shape (batch_size, sequence_length, hidden_size)
|
||||
"""
|
||||
all_batch_embeddings = {}
|
||||
id2lang = {v: k for k, v in lang_ids.items()}
|
||||
with torch.no_grad():
|
||||
for batch, lang_idx in data:
|
||||
# for batch, target, lang_idx in data:
|
||||
out = model(batch.cuda())
|
||||
last_hidden_state = out[1][-1]
|
||||
batch_embeddings = last_hidden_state[:, 0, :]
|
||||
for i, l_idx in enumerate(lang_idx.numpy()):
|
||||
if id2lang[l_idx] not in all_batch_embeddings.keys():
|
||||
all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy()
|
||||
else:
|
||||
all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]],
|
||||
batch_embeddings[i].detach().cpu().numpy()))
|
||||
|
||||
return all_batch_embeddings, id2lang
|
||||
|
|
@@ -0,0 +1,16 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
#dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
|
||||
#logfile=../log/log_FunBert_jrc.csv
|
||||
#
|
||||
#runs='0 1 2 3 4'
|
||||
#for run in $runs
|
||||
#do
|
||||
# dataset=$dataset_path$run.pickle
|
||||
# python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile #--tunable
|
||||
#done
|
||||
|
||||
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle
|
||||
logfile=../log/log_FunBert_fulljrc_static.csv
|
||||
|
||||
python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile
|
||||
|
|
@@ -0,0 +1,16 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
#dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
|
||||
#logfile=../log/log_FunBert_rcv_static.csv
|
||||
#
|
||||
#runs='0 1 2 3 4'
|
||||
#for run in $runs
|
||||
#do
|
||||
# dataset=$dataset_path$run.pickle
|
||||
# python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile
|
||||
#done
|
||||
|
||||
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
|
||||
logfile=../log/log_FunBert_fullrcv_static.csv
|
||||
|
||||
python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile
|
||||
|
|
@@ -0,0 +1,15 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
#dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
|
||||
#logfile=../log/log_mBert_jrc_NEW.csv
|
||||
#
|
||||
#runs='0 1 2 3 4'
|
||||
#for run in $runs
|
||||
#do
|
||||
# dataset=$dataset_path$run.pickle
|
||||
# python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50
|
||||
#done
|
||||
|
||||
logfile=../log/log_mBert_fulljrc.csv
|
||||
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle
|
||||
python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50
|
||||
|
|
@@ -1,11 +1,15 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
|
||||
logfile=../log/log_Mbert_rcv.csv
|
||||
#dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
|
||||
#logfile=../log/log_mBert_rcv_NEW.csv
|
||||
#
|
||||
#runs='0 1 2 3 4'
|
||||
#for run in $runs
|
||||
#do
|
||||
# dataset=$dataset_path$run.pickle
|
||||
# python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50
|
||||
#done
|
||||
|
||||
runs='1 2 3 4 5 6 7 8 9'
|
||||
for run in $runs
|
||||
do
|
||||
dataset=$dataset_path$run.pickle
|
||||
python new_mbert.py --dataset $dataset --log-file $logfile --nepochs=5 --weight_decay=0.01 --lr=1e-5
|
||||
done
|
||||
logfile=../log/log_mBert_fullrcv.csv
|
||||
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
|
||||
python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=30 --patience 3
|
||||
|
|
@@ -1,15 +1,14 @@
|
|||
import subprocess
|
||||
import warnings
|
||||
import time
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.model_selection import train_test_split
|
||||
from embeddings.supervised import get_supervised_embeddings
|
||||
from learning.transformers import PosteriorProbabilitiesEmbedder, TfidfVectorizerMultilingual
|
||||
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
||||
# from learning.transformers import PosteriorProbabilitiesEmbedder, TfidfVectorizerMultilingual
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
import torch
|
||||
from scipy.sparse import vstack, issparse
|
||||
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
||||
|
||||
|
||||
def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
|
||||
|
|
@@ -161,12 +160,13 @@ class Index:
|
|||
def none_dict(langs):
|
||||
return {l:None for l in langs}
|
||||
|
||||
|
||||
class MultilingualIndex:
|
||||
def __init__(self): #, add_language_trace=False):
|
||||
self.l_index = {}
|
||||
self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
|
||||
# self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True, max_features=25000)
|
||||
# self.add_language_trace=add_language_trace
|
||||
# self.add_language_trace = add_language_trace
|
||||
|
||||
def index(self, l_devel_raw, l_devel_target, l_test_raw, l_pretrained_vocabulary):
|
||||
self.langs = sorted(l_devel_raw.keys())
|
||||
|
|
@@ -184,6 +184,8 @@ class MultilingualIndex:
|
|||
for l,index in self.l_index.items():
|
||||
index.train_val_split(val_prop, max_val, seed=seed)
|
||||
|
||||
|
||||
|
||||
def embedding_matrices(self, lpretrained, supervised):
|
||||
lXtr = self.get_lXtr() if supervised else none_dict(self.langs)
|
||||
lYtr = self.l_train_target() if supervised else none_dict(self.langs)
|
||||
|
|
@@ -191,52 +193,133 @@ class MultilingualIndex:
|
|||
index.compose_embedding_matrix(lpretrained[l], supervised, lXtr[l], lYtr[l])
|
||||
self.sup_range = index.wce_range
|
||||
|
||||
# experimental... does it make sense to keep track of the language? i.e., to inform the network which
# language the data comes from...
|
||||
# if self.add_language_trace and pretrained_embeddings is not None:
|
||||
# print('adding language trace')
|
||||
# lang_trace = torch.zeros(size=(vocabsize, len(self.langs)))
|
||||
# lang_trace[:,i]=1
|
||||
# pretrained_embeddings = torch.cat([pretrained_embeddings, lang_trace], dim=1)
|
||||
# TODO circular import with transformers --> when generating posterior prob, we import PosteriorProbabilitiesEmbedder which is defined in transformers
|
||||
# def posterior_probabilities(self, max_training_docs_by_lang=5000, store_posteriors=False, stored_post=False):
|
||||
# # choose a maximum of "max_training_docs_by_lang" for training the calibrated SVMs
|
||||
# timeit = time.time()
|
||||
# lXtr = {l:Xtr for l,Xtr in self.get_lXtr().items()}
|
||||
# lYtr = {l:Ytr for l,Ytr in self.l_train_target().items()}
|
||||
# if not stored_post:
|
||||
# for l in self.langs:
|
||||
# n_elements = lXtr[l].shape[0]
|
||||
# if n_elements > max_training_docs_by_lang:
|
||||
# choice = np.random.permutation(n_elements)[:max_training_docs_by_lang]
|
||||
# lXtr[l] = lXtr[l][choice]
|
||||
# lYtr[l] = lYtr[l][choice]
|
||||
#
|
||||
# # train the posterior probabilities embedder
|
||||
# print('[posteriors] training a calibrated SVM')
|
||||
# learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto')
|
||||
# prob_embedder = PosteriorProbabilitiesEmbedder(learner, l2=False)
|
||||
# prob_embedder.fit(lXtr, lYtr)
|
||||
#
|
||||
# # transforms the training, validation, and test sets into posterior probabilities
|
||||
# print('[posteriors] generating posterior probabilities')
|
||||
# lPtr = prob_embedder.transform(self.get_lXtr())
|
||||
# lPva = prob_embedder.transform(self.get_lXva())
|
||||
# lPte = prob_embedder.transform(self.get_lXte())
|
||||
# # NB: Check splits indices !
|
||||
# if store_posteriors:
|
||||
# import pickle
|
||||
# with open('../dumps/posteriors_fulljrc.pkl', 'wb') as outfile:
|
||||
# pickle.dump([lPtr, lPva, lPte], outfile)
|
||||
# print(f'Successfully dumped posteriors!')
|
||||
# else:
|
||||
# import pickle
|
||||
# with open('../dumps/posteriors_fulljrc.pkl', 'rb') as infile:
|
||||
# lPtr, lPva, lPte = pickle.load(infile)
|
||||
# print(f'Successfully loaded stored posteriors!')
|
||||
# print(f'[posteriors] done in {time.time() - timeit}')
|
||||
# return lPtr, lPva, lPte
|
||||
|
||||
def bert_embeddings(self, bert_path, max_len=512, batch_size=64, stored_embeddings=False):
|
||||
show_gpu('GPU memory before initializing mBert model:')
|
||||
# TODO: load dumped embeddings?
|
||||
from main_mbert_extractor import do_tokenization, ExtractorDataset, DataLoader
|
||||
from transformers import BertConfig, BertForSequenceClassification
|
||||
|
||||
print('[mBERT] generating mBERT doc embeddings')
|
||||
lXtr_raw = self.get_raw_lXtr()
|
||||
lXva_raw = self.get_raw_lXva()
|
||||
lXte_raw = self.get_raw_lXte()
|
||||
|
||||
print('# Tokenizing datasets')
|
||||
l_tokenized_tr = do_tokenization(lXtr_raw, max_len=max_len, verbose=False)
|
||||
tr_dataset = ExtractorDataset(l_tokenized_tr)
|
||||
tr_lang_ids = tr_dataset.lang_ids
|
||||
tr_dataloader = DataLoader(tr_dataset, batch_size=batch_size, shuffle=False)
|
||||
|
||||
l_tokenized_va = do_tokenization(lXva_raw, max_len=max_len, verbose=False)
|
||||
va_dataset = ExtractorDataset(l_tokenized_va)
|
||||
va_lang_ids = va_dataset.lang_ids
|
||||
va_dataloader = DataLoader(va_dataset, batch_size=batch_size, shuffle=False)
|
||||
|
||||
l_tokenized_te = do_tokenization(lXte_raw, max_len=max_len, verbose=False)
|
||||
te_dataset = ExtractorDataset(l_tokenized_te)
|
||||
te_lang_ids = te_dataset.lang_ids
|
||||
te_dataloader = DataLoader(te_dataset, batch_size=batch_size, shuffle=False)
|
||||
|
||||
num_labels = self.l_index[self.langs[0]].val_target.shape[1]
|
||||
config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True,
|
||||
num_labels=num_labels)
|
||||
model = BertForSequenceClassification.from_pretrained(bert_path,
|
||||
config=config).cuda()
|
||||
print('# Extracting document embeddings')
|
||||
tr_bert_embeddings, id2lang_tr = self.do_bert_embeddings(model, tr_dataloader, tr_lang_ids, verbose=False)
|
||||
va_bert_embeddings, id2lang_va = self.do_bert_embeddings(model, va_dataloader, va_lang_ids, verbose=False)
|
||||
te_bert_embeddings, id2lang_te = self.do_bert_embeddings(model, te_dataloader, te_lang_ids, verbose=False)
|
||||
|
||||
show_gpu('GPU memory after running the mBert model:')
|
||||
# Freeing GPU's memory
|
||||
import gc
|
||||
del model, tr_dataloader, va_dataloader, te_dataloader
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
show_gpu('GPU memory after clearing cache:')
|
||||
return tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings
|
||||
|
||||
|
||||
def posterior_probabilities(self, max_training_docs_by_lang=5000, store_posteriors=False, stored_post=False):
|
||||
# choose a maximum of "max_training_docs_by_lang" for training the calibrated SVMs
|
||||
timeit = time.time()
|
||||
lXtr = {l:Xtr for l,Xtr in self.get_lXtr().items()}
|
||||
lYtr = {l:Ytr for l,Ytr in self.l_train_target().items()}
|
||||
if not stored_post:
|
||||
for l in self.langs:
|
||||
n_elements = lXtr[l].shape[0]
|
||||
if n_elements > max_training_docs_by_lang:
|
||||
choice = np.random.permutation(n_elements)[:max_training_docs_by_lang]
|
||||
lXtr[l] = lXtr[l][choice]
|
||||
lYtr[l] = lYtr[l][choice]
|
||||
|
||||
# train the posterior probabilities embedder
|
||||
print('[posteriors] training a calibrated SVM')
|
||||
learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto')
|
||||
prob_embedder = PosteriorProbabilitiesEmbedder(learner, l2=False)
|
||||
prob_embedder.fit(lXtr, lYtr)
|
||||
|
||||
# transforms the training, validation, and test sets into posterior probabilities
|
||||
print('[posteriors] generating posterior probabilities')
|
||||
lPtr = prob_embedder.transform(self.get_lXtr())
|
||||
lPva = prob_embedder.transform(self.get_lXva())
|
||||
lPte = prob_embedder.transform(self.get_lXte())
|
||||
# NB: Check splits indices !
|
||||
if store_posteriors:
|
||||
import pickle
|
||||
with open('../dumps/posteriors_fulljrc.pkl', 'wb') as outfile:
|
||||
pickle.dump([lPtr, lPva, lPte], outfile)
|
||||
print(f'Successfully dumped posteriors!')
|
||||
else:
import pickle
with open('../dumps/posteriors_fulljrc.pkl', 'rb') as infile:
lPtr, lPva, lPte = pickle.load(infile)
print(f'Successfully loaded stored posteriors!')
print(f'[posteriors] done in {time.time() - timeit}')
return lPtr, lPva, lPte
@staticmethod
def do_bert_embeddings(model, data, lang_ids, verbose=True):
if verbose:
print('# Feature Extractor Mode...')
all_batch_embeddings = {}
id2lang = {v: k for k, v in lang_ids.items()}
with torch.no_grad():
for batch, lang_idx in data:
out = model(batch.cuda())
last_hidden_state = out[1][-1]
batch_embeddings = last_hidden_state[:, 0, :]
for i, l_idx in enumerate(lang_idx.numpy()):
if id2lang[l_idx] not in all_batch_embeddings.keys():
all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy()
else:
all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]],
batch_embeddings[i].detach().cpu().numpy()))
return all_batch_embeddings, id2lang
|
||||
|
||||
def get_raw_lXtr(self):
|
||||
lXtr_raw = {k:[] for k in self.langs}
|
||||
lYtr_raw = {k: [] for k in self.langs}
|
||||
for lang in self.langs:
|
||||
lXtr_raw[lang] = self.l_index[lang].train_raw
|
||||
lYtr_raw[lang] = self.l_index[lang].train_raw
|
||||
return lXtr_raw
|
||||
|
||||
def get_raw_lXva(self):
|
||||
lXva_raw = {k: [] for k in self.langs}
|
||||
for lang in self.langs:
|
||||
lXva_raw[lang] = self.l_index[lang].val_raw
|
||||
|
||||
return lXva_raw
|
||||
|
||||
def get_raw_lXte(self):
|
||||
lXte_raw = {k: [] for k in self.langs}
|
||||
for lang in self.langs:
|
||||
lXte_raw[lang] = self.l_index[lang].test_raw
|
||||
|
||||
return lXte_raw
|
||||
|
||||
def get_lXtr(self):
|
||||
if not hasattr(self, 'lXtr'):
|
||||
|
|
@@ -277,6 +360,12 @@ class MultilingualIndex:
|
|||
def l_test_index(self):
|
||||
return {l: index.test_index for l, index in self.l_index.items()}
|
||||
|
||||
def l_devel_index(self):
|
||||
return {l: index.devel_index for l, index in self.l_index.items()}
|
||||
|
||||
def l_devel_target(self):
|
||||
return {l: index.devel_target for l, index in self.l_index.items()}
|
||||
|
||||
def l_train(self):
|
||||
return self.l_train_index(), self.l_train_target()
|
||||
|
||||
|
|
@@ -284,7 +373,6 @@ class MultilingualIndex:
|
|||
return self.l_val_index(), self.l_val_target()
|
||||
|
||||
|
||||
|
||||
class Batch:
|
||||
def __init__(self, batchsize, batches_per_epoch, languages, lpad, max_pad_length=500):
|
||||
self.batchsize = batchsize
|
||||
|
|
@@ -297,7 +385,7 @@ class Batch:
|
|||
def init_offset(self):
|
||||
self.offset = {lang: 0 for lang in self.languages}
|
||||
|
||||
def batchify(self, l_index, l_post, llabels):
|
||||
def batchify(self, l_index, l_post, l_bert, llabels): # TODO: add bert embedding here...
|
||||
langs = self.languages
|
||||
l_num_samples = {l:len(l_index[l]) for l in langs}
|
||||
|
||||
|
|
@@ -322,6 +410,10 @@ class Batch:
|
|||
if l_post is not None:
|
||||
post = torch.FloatTensor(l_post[lang][batch_slice]).cuda()
|
||||
|
||||
bert_emb = None
|
||||
if l_bert is not None:
|
||||
bert_emb = torch.FloatTensor(l_bert[lang][batch_slice]).cuda()
|
||||
|
||||
batch = pad(batch, pad_index=self.lpad[lang], max_pad_length=self.max_pad_length)
|
||||
|
||||
batch = torch.LongTensor(batch).cuda()
|
||||
|
|
@@ -329,7 +421,7 @@ class Batch:
|
|||
|
||||
self.offset[lang] = limit
|
||||
|
||||
yield batch, post, target, lang
|
||||
yield batch, post, bert_emb, target, lang
|
||||
|
||||
|
||||
def batchify(l_index, l_post, llabels, batchsize, lpad, max_pad_length=500):
|
||||
|
|
@@ -384,7 +476,81 @@ def count_parameters(model):
|
|||
return sum(p.numel() for p in model.parameters() if p.requires_grad)
|
||||
|
||||
|
||||
def show_gpu(msg):
|
||||
"""
|
||||
ref: https://discuss.pytorch.org/t/access-gpu-memory-usage-in-pytorch/3192/4
|
||||
"""
|
||||
|
||||
def query(field):
|
||||
return (subprocess.check_output(
|
||||
['nvidia-smi', f'--query-gpu={field}',
|
||||
'--format=csv,nounits,noheader'],
|
||||
encoding='utf-8'))
|
||||
|
||||
def to_int(result):
|
||||
return int(result.strip().split('\n')[0])
|
||||
|
||||
used = to_int(query('memory.used'))
|
||||
total = to_int(query('memory.total'))
|
||||
pct = used / total
|
||||
print('\n' + msg, f'{100 * pct:2.1f}% ({used} out of {total})')
|
||||
|
||||
|
||||
class TfidfVectorizerMultilingual:
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
self.kwargs = kwargs
|
||||
|
||||
def fit(self, lX, ly=None):
|
||||
self.langs = sorted(lX.keys())
|
||||
self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs}
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs}
|
||||
|
||||
def fit_transform(self, lX, ly=None):
|
||||
return self.fit(lX, ly).transform(lX)
|
||||
|
||||
def vocabulary(self, l=None):
|
||||
if l is None:
|
||||
return {l: self.vectorizer[l].vocabulary_ for l in self.langs}
|
||||
else:
|
||||
return self.vectorizer[l].vocabulary_
|
||||
|
||||
def get_analyzer(self, l=None):
|
||||
if l is None:
|
||||
return {l: self.vectorizer[l].build_analyzer() for l in self.langs}
|
||||
else:
|
||||
return self.vectorizer[l].build_analyzer()
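# Usage sketch (toy data): one TfidfVectorizer is fitted per language and applied to the
# language-specific document lists; keys of the input dict are language codes.
#   >>> lX = {'en': ['a small text', 'another text'], 'it': ['un piccolo testo']}
#   >>> vec = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
#   >>> lX_tfidf = vec.fit_transform(lX)    # {'en': sparse (2 x V_en), 'it': sparse (1 x V_it)}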
|
||||
|
||||
|
||||
def get_learner(calibrate=False, kernel='linear', C=1):
|
||||
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=C, random_state=1, gamma='auto', verbose=False)
|
||||
|
||||
|
||||
def get_params(optimc=False):
|
||||
if not optimc:
|
||||
return None
|
||||
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
|
||||
kernel = 'rbf'
|
||||
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
|
||||
|
||||
|
||||
def get_method_name(dataset, posteriors, supervised, pretrained, mbert, gru,
|
||||
gruMUSE, gruWCE, agg, allprob):
|
||||
_id = '-'
|
||||
_id_conf = [posteriors, supervised, pretrained, mbert, gru]
|
||||
_id_name = ['X', 'W', 'M', 'B', 'G']
|
||||
for i, conf in enumerate(_id_conf):
|
||||
if conf:
|
||||
_id += _id_name[i]
|
||||
_id = _id if not gruMUSE else _id + '_muse'
|
||||
_id = _id if not gruWCE else _id + '_wce'
|
||||
_id = _id if not agg else _id + '_mean'
|
||||
_id = _id if not allprob else _id + '_allprob'
|
||||
|
||||
_dataset_path = dataset.split('/')[-1].split('_')
|
||||
dataset_id = _dataset_path[0] + _dataset_path[-1]
|
||||
return _id, dataset_id
|
||||
|
||||
|
|
|
|||
|
|
@@ -1,12 +1,13 @@
|
|||
#adapted from https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py
|
||||
import torch
|
||||
from transformers import BertForSequenceClassification
|
||||
from time import time
|
||||
from util.file import create_if_not_exist
|
||||
import warnings
|
||||
|
||||
class EarlyStopping:
|
||||
|
||||
def __init__(self, model, optimizer, patience=20, verbose=True, checkpoint='./checkpoint.pt'):
|
||||
def __init__(self, model, optimizer, patience=20, verbose=True, checkpoint='./checkpoint.pt', is_bert=False):
|
||||
# set patience to 0 or -1 to avoid stopping, but still keeping track of the best value and model parameters
|
||||
self.patience_limit = patience
|
||||
self.patience = patience
|
||||
|
|
@@ -18,6 +19,7 @@ class EarlyStopping:
|
|||
self.model = model
|
||||
self.optimizer = optimizer
|
||||
self.STOP = False
|
||||
self.is_bert = is_bert
|
||||
|
||||
def __call__(self, watch_score, epoch):
|
||||
|
||||
|
|
@@ -30,6 +32,11 @@ class EarlyStopping:
|
|||
self.stop_time = time()
|
||||
if self.checkpoint:
|
||||
self.print(f'[early-stop] improved, saving model in {self.checkpoint}')
|
||||
if self.is_bert:
|
||||
print(f'Serializing Huggingface model...')
|
||||
create_if_not_exist(self.checkpoint)
|
||||
self.model.save_pretrained(self.checkpoint)
|
||||
else:
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore")
|
||||
torch.save(self.model, self.checkpoint)
|
||||
|
|
@@ -54,6 +61,9 @@ class EarlyStopping:
|
|||
|
||||
def restore_checkpoint(self):
|
||||
print(f'restoring best model from epoch {self.best_epoch}...')
|
||||
if self.is_bert:
|
||||
return BertForSequenceClassification.from_pretrained(self.checkpoint)
|
||||
else:
|
||||
return torch.load(self.checkpoint)
|
||||
|
||||
def print(self, msg):
|
||||
|
|
|
|||
|
|
@@ -5,18 +5,21 @@ from sklearn.metrics import f1_score
|
|||
import numpy as np
|
||||
import time
|
||||
|
||||
|
||||
def evaluation_metrics(y, y_):
|
||||
if len(y.shape)==len(y_.shape)==1 and len(np.unique(y))>2: #single-label
|
||||
raise NotImplementedError()#return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro')
|
||||
else: #the metrics I implemented assume multiclass multilabel classification as binary classifiers
|
||||
return macroF1(y, y_), microF1(y, y_), macroK(y, y_), microK(y, y_)
|
||||
|
||||
|
||||
def soft_evaluation_metrics(y, y_):
|
||||
if len(y.shape)==len(y_.shape)==1 and len(np.unique(y))>2: #single-label
|
||||
raise NotImplementedError()#return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro')
|
||||
else: #the metrics I implemented assume multiclass multilabel classification as binary classifiers
|
||||
return smoothmacroF1(y, y_), smoothmicroF1(y, y_), smoothmacroK(y, y_), smoothmicroK(y, y_)
|
||||
|
||||
|
||||
def evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1):
|
||||
print('evaluation (n_jobs={})'.format(n_jobs))
|
||||
if n_jobs == 1:
|
||||
|
|
@@ -26,6 +29,7 @@ def evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1):
|
|||
evals = Parallel(n_jobs=n_jobs)(delayed(metrics)(ly_true[lang], ly_pred[lang]) for lang in langs)
|
||||
return {lang: evals[i] for i, lang in enumerate(langs)}
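# Usage sketch (toy values): evaluate() expects one binary indicator matrix per language and returns,
# per language, the tuple (macro-F1, micro-F1, macro-K, micro-K).
#   >>> ly_true = {'en': np.array([[1, 0], [0, 1]]), 'it': np.array([[1, 1]])}
#   >>> ly_pred = {'en': np.array([[1, 0], [0, 0]]), 'it': np.array([[1, 0]])}
#   >>> evaluate(ly_true, ly_pred, n_jobs=1)   # -> {'en': (..., ..., ..., ...), 'it': (...)}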
|
||||
|
||||
|
||||
def average_results(l_eval, show=True):
|
||||
metrics = []
|
||||
for lang in l_eval.keys():
|
||||
|
|
@@ -60,6 +64,7 @@ def evaluate_method(polylingual_method, lX, ly, predictor=None, soft=False, retu
|
|||
else:
|
||||
return eval_
|
||||
|
||||
|
||||
def evaluate_single_lang(polylingual_method, X, y, lang, predictor=None, soft=False):
|
||||
print('prediction for test in a single language')
|
||||
if predictor is None:
|
||||
|
|
@@ -72,6 +77,7 @@ def evaluate_single_lang(polylingual_method, X, y, lang, predictor=None, soft=Fa
|
|||
ly_ = predictor({lang:X})
|
||||
return metrics(y, ly_[lang])
|
||||
|
||||
|
||||
def get_binary_counters(polylingual_method, lX, ly, predictor=None):
|
||||
print('prediction for test')
|
||||
assert set(lX.keys()) == set(ly.keys()), 'inconsistent dictionaries in evaluate'
|
||||
|
|
@@ -87,6 +93,7 @@ def get_binary_counters(polylingual_method, lX, ly, predictor=None):
|
|||
evals = Parallel(n_jobs=n_jobs)(delayed(binary_counters)(ly[lang], ly_[lang]) for lang in langs)
|
||||
return {lang: evals[i] for i, lang in enumerate(langs)}
|
||||
|
||||
|
||||
def binary_counters(y, y_):
|
||||
y = np.reshape(y, (-1))
|
||||
assert y.shape==y_.shape and len(y.shape)==1, 'error, binary vector expected'
|
||||
|
|
|
|||
|
|
@@ -0,0 +1,91 @@
|
|||
from optparse import OptionParser
|
||||
|
||||
parser = OptionParser(usage="usage: %prog datapath [options]")
|
||||
|
||||
parser.add_option("-d", dest='dataset', type=str, metavar='datasetpath', help=f'path to the pickled dataset')
|
||||
|
||||
parser.add_option("-o", "--output", dest="output",
|
||||
help="Result file", type=str, default='../log/multiModal_log.csv')
|
||||
|
||||
parser.add_option("-X", "--posteriors", dest="posteriors", action='store_true',
|
||||
help="Add posterior probabilities to the document embedding representation", default=False)
|
||||
|
||||
parser.add_option("-W", "--supervised", dest="supervised", action='store_true',
|
||||
help="Add supervised (Word-Class Embeddings) to the document embedding representation", default=False)
|
||||
|
||||
parser.add_option("-M", "--pretrained", dest="pretrained", action='store_true',
|
||||
help="Add pretrained MUSE embeddings to the document embedding representation", default=False)
|
||||
|
||||
parser.add_option("-B", "--mbert", dest="mbert", action='store_true',
|
||||
help="Add multilingual Bert (mBert) document embedding representation", default=False)
|
||||
|
||||
parser.add_option('-G', dest='gruViewGenerator', action='store_true',
|
||||
help="Add document embedding generated via recurrent net (GRU)", default=False)
|
||||
|
||||
parser.add_option("--l2", dest="l2", action='store_true',
|
||||
help="Activates l2 normalization as a post-processing for the document embedding views",
|
||||
default=False)
|
||||
|
||||
parser.add_option("--allprob", dest="allprob", action='store_true',
|
||||
help="All views are generated as posterior probabilities. This affects the supervised and pretrained"
|
||||
"embeddings, for which a calibrated classifier is generated, which generates the posteriors",
|
||||
default=False)
|
||||
|
||||
parser.add_option("--feat-weight", dest="feat_weight",
|
||||
help="Term weighting function to weight the averaged embeddings", type=str, default='tfidf')
|
||||
|
||||
parser.add_option("-w", "--we-path", dest="we_path",
|
||||
help="Path to the MUSE polylingual word embeddings", default='../embeddings')
|
||||
|
||||
parser.add_option("-s", "--set_c", dest="set_c", type=float,
|
||||
help="Set the C parameter", default=1)
|
||||
|
||||
parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
|
||||
help="Optimize hyperparameters", default=False)
|
||||
|
||||
parser.add_option("-j", "--n_jobs", dest="n_jobs", type=int,
|
||||
help="Number of parallel jobs (default is -1, all)", default=-1)
|
||||
|
||||
parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
|
||||
help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
|
||||
default=300)
|
||||
|
||||
parser.add_option("-r", "--remove-pc", dest="sif", action='store_true',
|
||||
help="Remove common component when computing dot product of word embedding matrices", default=False)
|
||||
|
||||
parser.add_option("-z", "--zscore", dest="zscore", action='store_true',
|
||||
help="Z-score normalize matrices (WCE and MUSE)", default=False)
|
||||
|
||||
parser.add_option("-a", "--agg", dest="agg", action='store_true',
|
||||
help="Set aggregation function of the common Z-space to average (Default: concatenation)",
|
||||
default=False)
|
||||
|
||||
# ------------------------------------------------------------------------------------
|
||||
|
||||
parser.add_option('--hidden', type=int, default=512, metavar='int',
|
||||
help='hidden lstm size (default: 512)')
|
||||
|
||||
parser.add_option('--sup-drop', type=float, default=0.5, metavar='[0.0, 1.0]',
|
||||
help='dropout probability for the supervised matrix (default: 0.5)')
|
||||
|
||||
parser.add_option('--tunable', action='store_true', default=False,
|
||||
help='pretrained embeddings are tunable from the beginning (default False, i.e., static)')
|
||||
|
||||
parser.add_option('--logfile_gru', dest='logfile_gru', default='../log/log_gru_viewgenerator.csv')
|
||||
|
||||
parser.add_option('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)')
|
||||
|
||||
parser.add_option('--force', action='store_true', default=False,
|
||||
help='do not check if this experiment has already been run')
|
||||
|
||||
parser.add_option('--gruMuse', dest='gruMUSE', action='store_true', default=False,
|
||||
help='Deploy MUSE embedding as embedding layer of the GRU View Generator')
|
||||
|
||||
parser.add_option('--gruWce', dest='gruWCE', action='store_true', default=False,
|
||||
help='Deploy WCE embedding as embedding layer of the GRU View Generator')
|
||||
|
||||
parser.add_option('--gru-path', dest='gru_path', default=None,
|
||||
help='Set the path to a pretrained GRU model (aka, -G view generator)')
|
||||
|
||||
parser.add_option('--bert-path', dest='bert_path', default=None,
|
||||
help='Set the path to a pretrained mBERT model (aka, -B view generator)')
|
||||
|
|
@@ -9,7 +9,7 @@ class StandardizeTransformer:
|
|||
self.range = range
|
||||
|
||||
def fit(self, X):
|
||||
print('fitting Standardizer')
|
||||
print('fitting Standardizer...')
|
||||
std=np.std(X, axis=self.axis, ddof=1)
|
||||
self.std = np.clip(std, 1e-5, None)
|
||||
self.mean = np.mean(X, axis=self.axis)
|
||||
|
|
@@ -21,7 +21,6 @@ class StandardizeTransformer:
|
|||
self.std = ones
|
||||
self.mean = zeros
|
||||
self.yetfit=True
|
||||
print('done\n')
|
||||
return self
|
||||
|
||||
def transform(self, X):
|
||||
|
|
|
|||