First commit

andrea 2020-10-22 15:11:49 +02:00
parent 4a7a594a41
commit 90f24dab8e
21 changed files with 1541 additions and 400 deletions


@@ -121,11 +121,10 @@ class MultilingualDataset:
print("Lang {}, Xtr={}, ytr={}, Xte={}, yte={}".format(lang, shape(Xtr), self.cat_view(Ytr).shape, shape(Xte), self.cat_view(Yte).shape))
def show_category_prevalences(self):
#pass
nC = self.num_categories()
accum_tr = np.zeros(nC, dtype=np.int)
accum_te = np.zeros(nC, dtype=np.int)
in_langs = np.zeros(nC, dtype=np.int)  # count languages with at least one positive example (per category)
for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items():
if lang not in self.langs(): continue
prev_train = np.sum(self.cat_view(Ytr), axis=0)


@@ -47,7 +47,6 @@ class FastTextWikiNews(Vectors):
class FastTextMUSE(PretrainedEmbeddings):
def __init__(self, path, lang, limit=None):
super().__init__()
print(f'Loading fastText pretrained vectors for language {lang} from {path}')
assert os.path.exists(path), print(f'pre-trained vectors not found in {path}')
self.embed = FastTextWikiNews(path, lang, max_vectors=limit)

src/extract_features.sh Normal file

@@ -0,0 +1,14 @@
#!/usr/bin/env bash
dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run#
runs='1 2 3 4 5 6 7 8 9'
for run in $runs
do
dataset=$dataset_path$run.pickle
modelpath=/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run$run
python main_mbert_extractor.py --dataset $dataset --modelpath $modelpath
done
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle
python main_mbert_extractor.py --dataset $dataset --modelpath $modelpath


@@ -133,7 +133,8 @@ class MonolingualClassifier:
self.model = OneVsRestClassifier(self.learner, n_jobs=self.n_jobs)
else:
self.model = self.learner
raise NotImplementedError('not working as a base-classifier for funneling if there are gaps in '
'the labels across languages')
# parameter optimization?
if self.parameters:
@@ -141,7 +142,8 @@ class MonolingualClassifier:
self.model = GridSearchCV(self.model, param_grid=self.parameters, refit=True, cv=5, n_jobs=self.n_jobs,
error_score=0, verbose=10)
# print(f'fitting: {self.model} on matrices of shape X={X.shape} Y={y.shape}')
print(f'fitting: Mono-lingual Classifier on matrices of shape X={X.shape} Y={y.shape}')
self.model.fit(X, y)
if isinstance(self.model, GridSearchCV):
self.best_params_ = self.model.best_params_


@@ -1,65 +1,39 @@
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
#from data.text_preprocessor import NLTKStemTokenizer
from data.tsr_function__ import get_tsr_matrix, get_supervised_matrix, pointwise_mutual_information, information_gain, \
gain_ratio, gss
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader
from data.tsr_function__ import get_tsr_matrix, get_supervised_matrix, pointwise_mutual_information, information_gain
from embeddings.embeddings import FastTextMUSE
from embeddings.supervised import supervised_embeddings_tfidf, zscores
from learning.learners import NaivePolylingualClassifier, MonolingualClassifier, _joblib_transform_multiling
import time
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
from scipy.sparse import issparse, vstack, hstack
from scipy.sparse import hstack
from util_transformers.StandardizeTransformer import StandardizeTransformer
from util.SIF_embed import remove_pc
from sklearn.preprocessing import normalize
from sklearn.svm import SVC
from scipy.sparse import csr_matrix
from models.mBert import *
from models.lstm_class import *
from util.csv_log import CSVLog
from util.file import get_file_name
from util.early_stop import EarlyStopping
from util.common import *
import time
# ------------------------------------------------------------------
# Data Processing
# ------------------------------------------------------------------
class TfidfVectorizerMultilingual:
def __init__(self, **kwargs):
self.kwargs=kwargs
def fit(self, lX, ly=None):
self.langs = sorted(lX.keys())
self.vectorizer={l:TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs}
# tokenizer=NLTKStemTokenizer(l, verbose=True),
return self
def transform(self, lX):
return {l:self.vectorizer[l].transform(lX[l]) for l in self.langs}
def fit_transform(self, lX, ly=None):
return self.fit(lX,ly).transform(lX)
def vocabulary(self, l=None):
if l is None:
return {l:self.vectorizer[l].vocabulary_ for l in self.langs}
else:
return self.vectorizer[l].vocabulary_
def get_analyzer(self, l=None):
if l is None:
return {l:self.vectorizer[l].build_analyzer() for l in self.langs}
else:
return self.vectorizer[l].build_analyzer()
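Editor's note (not part of the commit): a minimal usage sketch of the TfidfVectorizerMultilingual wrapper shown above, assuming it is importable from this module; the language codes and documents are made up for illustration.

docs = {
    'en': ['the cat sat on the mat', 'dogs bark at night'],
    'it': ['il gatto dorme sul tappeto', 'i cani abbaiano di notte'],
}
vect = TfidfVectorizerMultilingual(sublinear_tf=True)  # kwargs are forwarded to each per-language TfidfVectorizer
lX = vect.fit_transform(docs)   # {'en': sparse tf-idf matrix, 'it': sparse tf-idf matrix}
lV = vect.vocabulary()          # {'en': {term: index, ...}, 'it': {...}}
print(lX['en'].shape, len(lV['it']))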
class FeatureWeight:
def __init__(self, weight='tfidf', agg='mean'):
assert weight in ['tfidf', 'pmi', 'ig'] or callable(
weight), 'weight should either be "tfidf" or a callable function'
assert agg in ['mean', 'max'], 'aggregation function should either be "mean" or "max"'
self.weight = weight
self.agg = agg
self.fitted = False
if weight == 'pmi':
self.weight = pointwise_mutual_information
elif weight == 'ig':
self.weight = information_gain
@@ -91,8 +65,10 @@ class FeatureWeight:
return self.fit(lX, ly).transform(lX)
# ------------------------------------------------------------------
# Document Embeddings
# View Generators (aka first-tier learners)
# ------------------------------------------------------------------
class PosteriorProbabilitiesEmbedder:
def __init__(self, first_tier_learner, first_tier_parameters=None, l2=True, n_jobs=-1):
@@ -103,9 +79,13 @@ class PosteriorProbabilitiesEmbedder:
self.doc_projector = NaivePolylingualClassifier(
self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs
)
self.requires_tfidf = True
def fit(self, lX, lY, lV=None):
print('fitting the projectors... {}'.format(lX.keys()))
def fit(self, lX, lY, lV=None, called_by_viewgen=False):
if not called_by_viewgen:
# Avoid printing if method is called by another View Gen (e.g., GRU ViewGen)
print('### Posterior Probabilities View Generator (X)')
print('fitting the projectors... {}'.format(lX.keys()))
self.doc_projector.fit(lX, lY)
return self
@@ -124,7 +104,7 @@ class PosteriorProbabilitiesEmbedder:
return self.doc_projector.predict(lX)
def predict_proba(self, lX, ly=None):
print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} documents')
return self.doc_projector.predict_proba(lX)
def _get_output_dim(self):
@@ -134,19 +114,22 @@ class PosteriorProbabilitiesEmbedder:
class MuseEmbedder:
def __init__(self, path, lV=None, l2=True, n_jobs=-1, featureweight=FeatureWeight(), sif=False):
self.path = path
self.lV = lV
self.l2 = l2
self.n_jobs = n_jobs
self.featureweight = featureweight
self.sif = sif
self.requires_tfidf = True
def fit(self, lX, ly, lV=None):
assert lV is not None or self.lV is not None, 'lV not specified'
print('### MUSE View Generator (M)')
print(f'Loading fastText pretrained vectors for languages {list(lX.keys())}...')
self.langs = sorted(lX.keys())
self.MUSE = load_muse_embeddings(self.path, self.langs, self.n_jobs)
lWordList = {l: self._get_wordlist_from_word2index(lV[l]) for l in self.langs}
self.MUSE = {l: Muse.extract(lWordList[l]).numpy() for l, Muse in self.MUSE.items()}
self.featureweight.fit(lX, ly)
return self
@@ -175,16 +158,19 @@ class WordClassEmbedder:
def __init__(self, l2=True, n_jobs=-1, max_label_space=300, featureweight=FeatureWeight(), sif=False):
self.n_jobs = n_jobs
self.l2 = l2
self.max_label_space = max_label_space
self.featureweight = featureweight
self.sif = sif
self.requires_tfidf = True
def fit(self, lX, ly, lV=None):
print('### WCE View Generator (M)')
print('Computing supervised embeddings...')
self.langs = sorted(lX.keys())
WCE = Parallel(n_jobs=self.n_jobs)(
delayed(word_class_embedding_matrix)(lX[lang], ly[lang], self.max_label_space) for lang in self.langs
)
self.lWCE = {l: WCE[i] for i, l in enumerate(self.langs)}
self.featureweight.fit(lX, ly)
return self
@@ -192,7 +178,7 @@ class WordClassEmbedder:
lWCE = self.lWCE
lX = self.featureweight.transform(lX)
XdotWCE = Parallel(n_jobs=self.n_jobs)(
delayed(XdotM)(lX[lang], lWCE[lang], self.sif) for lang in self.langs
)
lwce = {l: XdotWCE[i] for i, l in enumerate(self.langs)}
lwce = _normalize(lwce, self.l2)
@@ -202,31 +188,284 @@ class WordClassEmbedder:
return self.fit(lX, ly).transform(lX)
def _get_output_dim(self):
return 73  # TODO !
class MBertEmbedder:
def __init__(self, doc_embed_path=None, patience=10, checkpoint_dir='../hug_checkpoint/', path_to_model=None,
nC=None):
self.doc_embed_path = doc_embed_path
self.patience = patience
self.checkpoint_dir = checkpoint_dir
self.fitted = False
self.requires_tfidf = False
if path_to_model is None and nC is not None:
self.model = None
else:
config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True,
num_labels=nC)
self.model = BertForSequenceClassification.from_pretrained(path_to_model, config=config).cuda()
self.fitted = True
def fit(self, lX, ly, lV=None, seed=0, nepochs=200, lr=1e-5, val_epochs=1):
print('### mBERT View Generator (B)')
if self.fitted is True:
print('Bert model already fitted!')
return self
print('Fine-tune mBert on the given dataset.')
l_tokenized_tr = do_tokenization(lX, max_len=512)
l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = get_tr_val_split(l_tokenized_tr, ly,
val_prop=0.2, max_val=2000,
seed=seed) # TODO: seed
tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target)
va_dataset = TrainingDataset(l_split_va, l_split_val_target)
tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True)
va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=True)
nC = tr_dataset.get_nclasses()
model = get_model(nC)
model = model.cuda()
criterion = torch.nn.BCEWithLogitsLoss().cuda()
optim = init_optimizer(model, lr=lr, weight_decay=0.01)
lr_scheduler = StepLR(optim, step_size=25, gamma=0.1)
early_stop = EarlyStopping(model, optimizer=optim, patience=self.patience,
checkpoint=self.checkpoint_dir,
is_bert=True)
# Training loop
logfile = '../log/log_mBert_extractor.csv'
method_name = 'mBert_feature_extractor'
tinit = time()
lang_ids = va_dataset.lang_ids
for epoch in range(1, nepochs + 1):
print('# Start Training ...')
train(model, tr_dataloader, epoch, criterion, optim, method_name, tinit, logfile)
lr_scheduler.step() # reduces the learning rate # TODO arg epoch?
# Validation
macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va')
early_stop(macrof1, epoch)
if early_stop.STOP:
print('[early-stop] STOP')
break
model = early_stop.restore_checkpoint()
self.model = model.cuda()
if val_epochs > 0:
print(f'running last {val_epochs} training epochs on the validation set')
for val_epoch in range(1, val_epochs + 1):
train(self.model, va_dataloader, epoch + val_epoch, criterion, optim, method_name, tinit, logfile)
self.fitted = True
return self
def transform(self, lX):
assert self.fitted is True, 'Calling transform without any initialized model! - call init first or on init' \
'pass the "path_to_model" arg.'
print('Obtaining document embeddings from pretrained mBert ')
l_tokenized_X = do_tokenization(lX, max_len=512, verbose=True)
feat_dataset = ExtractorDataset(l_tokenized_X)
feat_lang_ids = feat_dataset.lang_ids
dataloader = DataLoader(feat_dataset, batch_size=64)
all_batch_embeddings, id2lang = feature_extractor(dataloader, feat_lang_ids, self.model)
return all_batch_embeddings
def fit_transform(self, lX, ly, lV=None):
return self.fit(lX, ly).transform(lX)
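Editor's note (not part of the commit): a rough sketch of how the MBertEmbedder view generator above might be driven end to end, assuming a CUDA device, the helpers imported at the top of this file, and made-up raw documents and label matrices; nC is passed explicitly so that no pretrained checkpoint is loaded and fit() performs the fine-tuning.

import numpy as np

lX_raw = {'en': ['first document ...', 'second document ...'],
          'it': ['primo documento ...', 'secondo documento ...']}
ly = {'en': np.array([[1, 0], [0, 1]]), 'it': np.array([[0, 1], [1, 0]])}

bert_vg = MBertEmbedder(patience=5, checkpoint_dir='../hug_checkpoint/', nC=2)
bert_vg.fit(lX_raw, ly, nepochs=10)   # fine-tunes bert-base-multilingual-cased on the toy data
lZ = bert_vg.transform(lX_raw)        # {'en': document-embedding matrix, 'it': ...}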
class RecurrentEmbedder:
def __init__(self, pretrained, supervised, multilingual_dataset, options, concat=False, lr=1e-3,
we_path='../embeddings', hidden_size=512, sup_drop=0.5, posteriors=False, patience=10,
test_each=0, checkpoint_dir='../checkpoint', model_path=None):
self.pretrained = pretrained
self.supervised = supervised
self.concat = concat
self.requires_tfidf = False
self.multilingual_dataset = multilingual_dataset
self.model = None
self.we_path = we_path
self.langs = multilingual_dataset.langs()
self.hidden_size = hidden_size
self.sup_drop = sup_drop
self.posteriors = posteriors
self.patience = patience
self.checkpoint_dir = checkpoint_dir
self.test_each = test_each
self.options = options
self.seed = options.seed
self.is_trained = False
## INIT MODEL for training
self.lXtr, self.lytr = self.multilingual_dataset.training(target_as_csr=True)
self.lXte, self.lyte = self.multilingual_dataset.test(target_as_csr=True)
self.nC = self.lyte[self.langs[0]].shape[1]
lpretrained, lpretrained_vocabulary = self._load_pretrained_embeddings(self.we_path, self.langs)
self.multilingual_index = MultilingualIndex()
self.multilingual_index.index(self.lXtr, self.lytr, self.lXte, lpretrained_vocabulary)
self.multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=self.seed)
self.multilingual_index.embedding_matrices(lpretrained, self.supervised)
if model_path is not None:
self.is_trained = True
self.model = torch.load(model_path)
else:
self.model = self._init_Net()
self.optim = init_optimizer(self.model, lr=lr)
self.criterion = torch.nn.BCEWithLogitsLoss().cuda()
self.lr_scheduler = StepLR(self.optim, step_size=25, gamma=0.5)
self.early_stop = EarlyStopping(self.model, optimizer=self.optim, patience=self.patience,
checkpoint=f'{self.checkpoint_dir}/gru_viewgen_-{get_file_name(self.options.dataset)}')
# Init SVM in order to recast (vstacked) document embeddings to vectors of Posterior Probabilities
self.posteriorEmbedder = MetaClassifier(
SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=options.n_jobs)
def fit(self, lX, ly, lV=None, batch_size=64, nepochs=200, val_epochs=1):
print('### Gated Recurrent Unit View Generator (G)')
if not self.is_trained:
# Batchify input
self.multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=self.seed)
l_train_index, l_train_target = self.multilingual_index.l_train()
l_val_index, l_val_target = self.multilingual_index.l_val()
l_test_index = self.multilingual_index.l_test_index()
batcher_train = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs,
lpad=self.multilingual_index.l_pad())
batcher_eval = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs,
lpad=self.multilingual_index.l_pad())
# Train loop
print('Start training')
method_name = 'gru_view_generator'
logfile = init_logfile_nn(method_name, self.options)
tinit = time.time()
for epoch in range(1, nepochs + 1):
train_gru(model=self.model, batcher=batcher_train, ltrain_index=l_train_index, lytr=l_train_target,
tinit=tinit, logfile=logfile, criterion=self.criterion, optim=self.optim,
epoch=epoch, method_name=method_name, opt=self.options, ltrain_posteriors=None,
ltrain_bert=None)
self.lr_scheduler.step() # reduces the learning rate # TODO arg epoch?
# validation step
macrof1 = test_gru(self.model, batcher_eval, l_val_index, None, None, l_val_target, tinit, epoch,
logfile, self.criterion, 'va')
self.early_stop(macrof1, epoch)
if self.test_each > 0:
test_gru(self.model, batcher_eval, l_test_index, None, None, self.lyte, tinit, epoch,
logfile, self.criterion, 'te')
if self.early_stop.STOP:
print('[early-stop] STOP')
print('Restoring best model...')
break
self.model = self.early_stop.restore_checkpoint()
print(f'running last {val_epochs} training epochs on the validation set')
for val_epoch in range(1, val_epochs+1):
batcher_train.init_offset()
train_gru(model=self.model, batcher=batcher_train, ltrain_index=l_train_index, lytr=l_train_target,
tinit=tinit, logfile=logfile, criterion=self.criterion, optim=self.optim,
epoch=epoch, method_name=method_name, opt=self.options, ltrain_posteriors=None,
ltrain_bert=None)
self.is_trained = True
# Generate document embeddings in order to fit an SVM to recast them as vector for Posterior Probabilities
lX = self._get_doc_embeddings(lX)
# Fit a ''multi-lingual'' SVM on the generated doc embeddings
self.posteriorEmbedder.fit(lX, ly)
return self
def transform(self, lX, batch_size=64):
lX = self._get_doc_embeddings(lX)
return self.posteriorEmbedder.predict_proba(lX)
def fit_transform(self, lX, ly, lV=None):
# TODO
return 0
def _get_doc_embeddings(self, lX, batch_size=64):
assert self.is_trained, 'Model is not trained, cannot call transform before fitting the model!'
print('Generating document embeddings via GRU')
lX = {}
ly = {}
batcher_transform = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs,
lpad=self.multilingual_index.l_pad())
l_devel_index = self.multilingual_index.l_devel_index()
l_devel_target = self.multilingual_index.l_devel_target()
for idx, (batch, post, bert_emb, target, lang) in enumerate(
batcher_transform.batchify(l_devel_index, None, None, l_devel_target)):
if lang not in lX.keys():
lX[lang] = self.model.get_embeddings(batch, lang)
ly[lang] = target.cpu().detach().numpy()
else:
lX[lang] = np.concatenate((lX[lang], self.model.get_embeddings(batch, lang)), axis=0)
ly[lang] = np.concatenate((ly[lang], target.cpu().detach().numpy()), axis=0)
return lX
# loads the MUSE embeddings if requested, or returns empty dictionaries otherwise
def _load_pretrained_embeddings(self, we_path, langs):
lpretrained = lpretrained_vocabulary = self._none_dict(langs) # TODO ?
lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1)
lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs}
return lpretrained, lpretrained_vocabulary
def _none_dict(self, langs):
return {l:None for l in langs}
# instantiates the net, initializes the model parameters, and sets embeddings trainable if requested
def _init_Net(self, xavier_uniform=True):
model = RNNMultilingualClassifier(
output_size=self.nC,
hidden_size=self.hidden_size,
lvocab_size=self.multilingual_index.l_vocabsize(),
learnable_length=0,
lpretrained=self.multilingual_index.l_embeddings(),
drop_embedding_range=self.multilingual_index.sup_range,
drop_embedding_prop=self.sup_drop,
post_probabilities=self.posteriors
)
return model.cuda()
class DocEmbedderList:
def __init__(self, *embedder_list, aggregation='concat'):
assert aggregation in {'concat', 'mean'}, 'unknown aggregation mode, valid are "concat" and "mean"'
if len(embedder_list) == 0:
embedder_list = []
self.embedders = embedder_list
self.aggregation = aggregation
print(f'Aggregation mode: {self.aggregation}')
def fit(self, lX, ly, lV=None):
def fit(self, lX, ly, lV=None, tfidf=None):
for transformer in self.embedders:
transformer.fit(lX,ly,lV)
_lX = lX
if transformer.requires_tfidf:
_lX = tfidf
transformer.fit(_lX, ly, lV)
return self
def transform(self, lX):
def transform(self, lX, tfidf=None):
if self.aggregation == 'concat':
return self.transform_concat(lX)
return self.transform_concat(lX, tfidf)
elif self.aggregation == 'mean':
return self.transform_mean(lX)
return self.transform_mean(lX, tfidf)
def transform_concat(self, lX):
def transform_concat(self, lX, tfidf):
if len(self.embedders) == 1:
if self.embedders[0].requires_tfidf:
lX = tfidf
return self.embedders[0].transform(lX)
some_sparse = False
@@ -234,32 +473,41 @@ class DocEmbedderList:
lZparts = {l: [] for l in langs}
for transformer in self.embedders:
lZ = transformer.transform(lX)
_lX = lX
if transformer.requires_tfidf:
_lX = tfidf
lZ = transformer.transform(_lX)
for l in langs:
Z = lZ[l]
some_sparse = some_sparse or issparse(Z)
lZparts[l].append(Z)
hstacker = hstack if some_sparse else np.hstack
return {l: hstacker(lZparts[l]) for l in langs}
def transform_mean(self, lX):
def transform_mean(self, lX, tfidf):
if len(self.embedders) == 1:
return self.embedders[0].transform(lX)
langs = sorted(lX.keys())
lZparts = {l: None for l in langs}
# min_dim = min([transformer._get_output_dim() for transformer in self.embedders])
min_dim = 300
min_dim = 73  # TODO <---- this should be the number of target classes
for transformer in self.embedders:
lZ = transformer.transform(lX)
_lX = lX
if transformer.requires_tfidf:
_lX = tfidf
lZ = transformer.transform(_lX)
nC = min([lZ[lang].shape[1] for lang in langs])
for l in langs:
Z = lZ[l]
if Z.shape[1] > min_dim:
print(
f'Space Z matrix has more dimensions ({Z.shape[1]}) than the smallest representation {min_dim}.'
f'Applying PCA(n_components={min_dim})')
pca = PCA(n_components=min_dim)
Z = pca.fit(Z).transform(Z)
if lZparts[l] is None:
@@ -268,12 +516,11 @@ class DocEmbedderList:
lZparts[l] += Z
n_transformers = len(self.embedders)
nC = min([lZparts[lang].shape[1] for lang in langs])
return {l: lZparts[l] / n_transformers for l in langs}
def fit_transform(self, lX, ly, lV=None):
return self.fit(lX, ly, lV).transform(lX)
def fit_transform(self, lX, ly, lV=None, tfidf=None):
return self.fit(lX, ly, lV, tfidf).transform(lX, tfidf)
def best_params(self):
return {'todo'}
@@ -283,11 +530,13 @@ class DocEmbedderList:
class FeatureSet2Posteriors:
def __init__(self, transformer, l2=True, n_jobs=-1):
def __init__(self, transformer, requires_tfidf=False, l2=True, n_jobs=-1):
self.transformer = transformer
self.l2 = l2
self.n_jobs = n_jobs
self.prob_classifier = MetaClassifier(
SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
self.requires_tfidf = requires_tfidf
def fit(self, lX, ly, lV=None):
if lV is None and hasattr(self.transformer, 'lV'):
@@ -314,12 +563,12 @@ class FeatureSet2Posteriors:
# ------------------------------------------------------------------
# Meta-Classifier (aka second-tier learner)
# ------------------------------------------------------------------
class MetaClassifier:
def __init__(self, meta_learner, meta_parameters=None, n_jobs=-1, standardize_range=None):
self.n_jobs = n_jobs
self.model = MonolingualClassifier(base_learner=meta_learner, parameters=meta_parameters, n_jobs=n_jobs)
self.standardize_range = standardize_range
@@ -354,34 +603,37 @@ class MetaClassifier:
def best_params(self):
return self.model.best_params()
# ------------------------------------------------------------------
# Ensembling (aka Funnelling)
# ------------------------------------------------------------------
class Funnelling:
def __init__(self,
vectorizer: TfidfVectorizerMultilingual,
first_tier: DocEmbedderList,
meta: MetaClassifier):
self.vectorizer = vectorizer
self.first_tier = first_tier
self.meta = meta
self.n_jobs = meta.n_jobs
def fit(self, lX, ly):
lX = self.vectorizer.fit_transform(lX, ly)
tfidf_lX = self.vectorizer.fit_transform(lX, ly)
lV = self.vectorizer.vocabulary()
lZ = self.first_tier.fit_transform(lX, ly, lV)
print('## Fitting first-tier learners!')
lZ = self.first_tier.fit_transform(lX, ly, lV, tfidf=tfidf_lX)
print('## Fitting meta-learner!')
self.meta.fit(lZ, ly)
def predict(self, lX, ly=None):
lX = self.vectorizer.transform(lX)
lZ = self.first_tier.transform(lX)
tfidf_lX = self.vectorizer.transform(lX)
lZ = self.first_tier.transform(lX, tfidf=tfidf_lX)
ly_ = self.meta.predict(lZ)
return ly_
def best_params(self):
return {'1st-tier': self.first_tier.best_params(),
'meta': self.meta.best_params()}
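Editor's note (not part of the commit): a minimal sketch of how the classes above might be wired together, assuming they are importable from this module, that lX_tr maps language codes to lists of raw documents and ly_tr to aligned label matrices (lX_te likewise for test documents), and that the learners and their parameters are purely illustrative.

from sklearn.svm import SVC

vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
first_tier = DocEmbedderList(
    PosteriorProbabilitiesEmbedder(SVC(kernel='linear', probability=True)),
    MuseEmbedder(path='../embeddings'),
    aggregation='concat'
)
meta = MetaClassifier(meta_learner=SVC(kernel='rbf', probability=True))
funnelling = Funnelling(vectorizer=vectorizer, first_tier=first_tier, meta=meta)
funnelling.fit(lX_tr, ly_tr)         # first tier produces the views, the meta-classifier is trained on them
ly_pred = funnelling.predict(lX_te)  # per-language predictions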
class Voting:
@@ -394,15 +646,14 @@ class Voting:
classifier.fit(lX, ly, lV)
def predict(self, lX, ly=None):
lP = {l: [] for l in lX.keys()}
for classifier in self.prob_classifiers:
lPi = classifier.predict_proba(lX)
for l in lX.keys():
lP[l].append(lPi[l])
lP = {l: np.stack(Plist).mean(axis=0) for l, Plist in lP.items()}
ly = {l: P > 0.5 for l, P in lP.items()}
return ly
@@ -419,7 +670,6 @@ def load_muse_embeddings(we_path, langs, n_jobs=-1):
def word_class_embedding_matrix(X, Y, max_label_space=300):
print('computing supervised embeddings...')
WCE = supervised_embeddings_tfidf(X, Y)
WCE = zscores(WCE, axis=0)
@@ -433,9 +683,7 @@ def word_class_embedding_matrix(X, Y, max_label_space=300):
return WCE
def XdotM(X, M, sif):
# return X.dot(M)
print(f'X={X.shape}, M={M.shape}')
E = X.dot(M)
if sif:
print("removing pc...")
@@ -444,6 +692,137 @@ def XdotM(X, M, sif):
def _normalize(lX, l2=True):
return {l: normalize(X) for l, X in lX.items()} if l2 else lX
class BatchGRU:
def __init__(self, batchsize, batches_per_epoch, languages, lpad, max_pad_length=500):
self.batchsize = batchsize
self.batches_per_epoch = batches_per_epoch
self.languages = languages
self.lpad=lpad
self.max_pad_length=max_pad_length
self.init_offset()
def init_offset(self):
self.offset = {lang: 0 for lang in self.languages}
def batchify(self, l_index, l_post, l_bert, llabels):
langs = self.languages
l_num_samples = {l:len(l_index[l]) for l in langs}
max_samples = max(l_num_samples.values())
n_batches = max_samples // self.batchsize + 1 * (max_samples % self.batchsize > 0)
if self.batches_per_epoch != -1 and self.batches_per_epoch < n_batches:
n_batches = self.batches_per_epoch
for b in range(n_batches):
for lang in langs:
index, labels = l_index[lang], llabels[lang]
offset = self.offset[lang]
if offset >= l_num_samples[lang]:
offset = 0
limit = offset+self.batchsize
batch_slice = slice(offset, limit)
batch = index[batch_slice]
batch_labels = labels[batch_slice].toarray()
post = None
bert_emb = None
batch = pad(batch, pad_index=self.lpad[lang], max_pad_length=self.max_pad_length)
batch = torch.LongTensor(batch).cuda()
target = torch.FloatTensor(batch_labels).cuda()
self.offset[lang] = limit
yield batch, post, bert_emb, target, lang
def pad(index_list, pad_index, max_pad_length=None):
pad_length = np.max([len(index) for index in index_list])
if max_pad_length is not None:
pad_length = min(pad_length, max_pad_length)
for i,indexes in enumerate(index_list):
index_list[i] = [pad_index]*(pad_length-len(indexes)) + indexes[:pad_length]
return index_list
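Editor's note (not part of the commit): pad() left-pads every index list to the length of the longest sequence in the batch (capped at max_pad_length), mutating and returning the list; a small worked example:

batch = [[5, 6], [7, 8, 9]]
padded = pad(batch, pad_index=0, max_pad_length=500)
# padded == [[0, 5, 6], [7, 8, 9]]   (shorter sequences are left-padded with pad_index)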
def train_gru(model, batcher, ltrain_index, lytr, tinit, logfile, criterion, optim, epoch, method_name, opt,
ltrain_posteriors=None, ltrain_bert=None, log_interval=10):
_dataset_path = opt.dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1]
loss_history = []
model.train()
for idx, (batch, post, bert_emb, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, ltrain_bert, lytr)):
optim.zero_grad()
loss = criterion(model(batch, post, bert_emb, lang), target)
loss.backward()
clip_gradient(model)
optim.step()
loss_history.append(loss.item())
if idx % log_interval == 0:
interval_loss = np.mean(loss_history[-log_interval:])
print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, '
f'Training Loss: {interval_loss:.6f}')
mean_loss = np.mean(interval_loss)
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time.time() - tinit)
return mean_loss
def test_gru(model, batcher, ltest_index, ltest_posteriors, lte_bert, lyte, tinit, epoch, logfile, criterion, measure_prefix):
loss_history = []
model.eval()
langs = sorted(ltest_index.keys())
predictions = {l: [] for l in langs}
yte_stacked = {l: [] for l in langs}
batcher.init_offset()
for batch, post, bert_emb, target, lang in tqdm(batcher.batchify(ltest_index, ltest_posteriors, lte_bert, lyte),
desc='evaluation: '):
logits = model(batch, post, bert_emb, lang)
loss = criterion(logits, target).item()
prediction = predict(logits)
predictions[lang].append(prediction)
yte_stacked[lang].append(target.detach().cpu().numpy())
loss_history.append(loss)
ly = {l:np.vstack(yte_stacked[l]) for l in langs}
ly_ = {l:np.vstack(predictions[l]) for l in langs}
l_eval = evaluate(ly, ly_)
metrics = []
for lang in langs:
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
if measure_prefix == 'te':
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
mean_loss = np.mean(loss_history)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time.time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time.time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time.time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time.time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time.time() - tinit)
return Mf1
def clip_gradient(model, clip_value=1e-1):
params = list(filter(lambda p: p.grad is not None, model.parameters()))
for p in params:
p.grad.data.clamp_(-clip_value, clip_value)
def init_logfile_nn(method_name, opt):
logfile = CSVLog(opt.logfile_gru, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse'])
logfile.set_default('dataset', opt.dataset)
logfile.set_default('run', opt.seed)
logfile.set_default('method', method_name)
assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} ' \
f'and run {opt.seed} already calculated'
return logfile


@@ -12,6 +12,7 @@ from time import time
from tqdm import tqdm
from util.evaluation import evaluate
from util.file import get_file_name
# import pickle
allowed_nets = {'rnn'}
@@ -34,7 +35,8 @@ def init_Net(nC, multilingual_index, xavier_uniform=True):
drop_embedding_range=multilingual_index.sup_range,
drop_embedding_prop=opt.sup_drop,
post_probabilities=opt.posteriors,
only_post=only_post,
bert_embeddings=opt.mbert
)
# weight initialization
@@ -59,8 +61,10 @@ def set_method_name():
method_name += f'-WCE'
if opt.posteriors:
method_name += f'-Posteriors'
if opt.mbert:
method_name += f'-mBert'
if (opt.pretrained or opt.supervised) and opt.tunable:
method_name += '-(trainable)'
else:
method_name += '-(static)'
if opt.learnable > 0:
@@ -77,7 +81,8 @@ def init_logfile(method_name, opt):
logfile.set_default('dataset', opt.dataset)
logfile.set_default('run', opt.seed)
logfile.set_default('method', method_name)
assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} ' \
f'and run {opt.seed} already calculated'
return logfile
@@ -90,15 +95,83 @@ def load_pretrained_embeddings(we_path, langs):
return lpretrained, lpretrained_vocabulary
def get_lr(optimizer):
for param_group in optimizer.param_groups:
return param_group['lr']
def train(model, batcher, ltrain_index, ltrain_posteriors, ltrain_bert, lytr, tinit, logfile, criterion, optim, epoch, method_name):
_dataset_path = opt.dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1]
loss_history = []
model.train()
for idx, (batch, post, bert_emb, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, ltrain_bert, lytr)):
optim.zero_grad()
# _out = model(batch, post, bert_emb, lang)
loss = criterion(model(batch, post, bert_emb, lang), target)
loss.backward()
clip_gradient(model)
optim.step()
loss_history.append(loss.item())
if idx % opt.log_interval == 0:
interval_loss = np.mean(loss_history[-opt.log_interval:])
print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
mean_loss = np.mean(interval_loss)
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
return mean_loss
def test(model, batcher, ltest_index, ltest_posteriors, lte_bert, lyte, tinit, epoch, logfile, criterion, measure_prefix):
loss_history = []
model.eval()
langs = sorted(ltest_index.keys())
predictions = {l:[] for l in langs}
yte_stacked = {l:[] for l in langs}
batcher.init_offset()
for batch, post, bert_emb, target, lang in tqdm(batcher.batchify(ltest_index, ltest_posteriors, lte_bert, lyte), desc='evaluation: '):
logits = model(batch, post, bert_emb, lang)
loss = criterion(logits, target).item()
prediction = predict(logits)
predictions[lang].append(prediction)
yte_stacked[lang].append(target.detach().cpu().numpy())
loss_history.append(loss)
ly = {l:np.vstack(yte_stacked[l]) for l in langs}
ly_ = {l:np.vstack(predictions[l]) for l in langs}
l_eval = evaluate(ly, ly_)
metrics = []
for lang in langs:
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
if measure_prefix == 'te':
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
mean_loss = np.mean(loss_history)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit)
return Mf1
# ----------------------------------------------------------------------------------------------------------------------
def main():
DEBUGGING = False
method_name = set_method_name()
logfile = init_logfile(method_name, opt)
# Loading the dataset
data = MultilingualDataset.load(opt.dataset)
data.set_view(languages=['de', 'fr']) #, 'it', 'en']) # 'sv', 'da', 'es', 'it'])
# data.set_view(languages=['it', 'fr']) # Testing with less langs
data.show_dimensions()
langs = data.langs()
l_devel_raw, l_devel_target = data.training(target_as_csr=True)
@@ -114,25 +187,36 @@ def main():
multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=opt.seed)
multilingual_index.embedding_matrices(lpretrained, opt.supervised)
if opt.posteriors:
lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=5000, store_posteriors=True) #stored_post=True) #opt.svm_max_docs)
if DEBUGGING:
import pickle
with open('/home/andreapdr/funneling_pdr/dumps/posteriors_jrc_run0.pickle', 'rb') as infile:
data_post = pickle.load(infile)
lPtr = data_post[0]
lPva = data_post[1]
lPte = data_post[2]
print('## DEBUGGING MODE: loaded dumped posteriors for jrc run0')
else:
lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=5000)
else:
lPtr, lPva, lPte = None, None, None
# just_test = False
# if just_test:
#
# model = torch.load(
# '../checkpoint/rnn(H512)-Muse-WCE-Posteriors-(trainable)-jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle')
# criterion = torch.nn.BCEWithLogitsLoss().cuda()
#
# # batcher_train = Batch(opt.batch_size, batches_per_epoch=10, languages=langs, lpad=multilingual_index.l_pad())
#
# batcher_eval = Batch(opt.batch_size, batches_per_epoch=-1, languages=langs, lpad=multilingual_index.l_pad())
# l_test_index = multilingual_index.l_test_index()
# epoch = 1
# tinit = time()
# test(model, batcher_eval, l_test_index, lPte, l_test_target, tinit, epoch, logfile, criterion, 'te')
# exit('Loaded')
if opt.mbert:
_dataset_path = opt.dataset.split('/')[-1].split('_')
_model_folder = _dataset_path[0] + '_' + _dataset_path[-1].replace('.pickle', '')
# print(f'Model Folder: {_model_folder}')
if DEBUGGING:
with open('/home/andreapdr/funneling_pdr/dumps/mBert_jrc_run0.pickle', 'rb') as infile:
data_embed = pickle.load(infile)
tr_bert_embeddings = data_embed[0]
va_bert_embeddings = data_embed[1]
te_bert_embeddings = data_embed[2]
print('## DEBUGGING MODE: loaded dumped mBert embeddings for jrc run0')
else:
tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings \
= multilingual_index.bert_embeddings(f'/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-{_model_folder}/')
else:
tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings = None, None, None
# Model initialization
model = init_Net(data.num_categories(), multilingual_index)
@@ -141,11 +225,12 @@ def main():
criterion = torch.nn.BCEWithLogitsLoss().cuda()
lr_scheduler = StepLR(optim, step_size=25, gamma=0.5)
batcher_train = Batch(opt.batch_size, batches_per_epoch=10, languages=langs, lpad=multilingual_index.l_pad())
batcher_eval = Batch(opt.batch_size, batches_per_epoch=-1, languages=langs, lpad=multilingual_index.l_pad())
tinit = time()
create_if_not_exist(opt.checkpoint_dir)
early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience,
checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}')
l_train_index, l_train_target = multilingual_index.l_train()
l_val_index, l_val_target = multilingual_index.l_val()
@@ -154,11 +239,11 @@ def main():
print('-'*80)
print('Start training')
for epoch in range(1, opt.nepochs + 1):
train(model, batcher_train, l_train_index, lPtr, l_train_target, tinit, logfile, criterion, optim, epoch, method_name)
train(model, batcher_train, l_train_index, lPtr, tr_bert_embeddings, l_train_target, tinit, logfile, criterion, optim, epoch, method_name)
lr_scheduler.step() # reduces the learning rate
# validation
macrof1 = test(model, batcher_eval, l_val_index, lPva, l_val_target, tinit, epoch, logfile, criterion, 'va')
macrof1 = test(model, batcher_eval, l_val_index, lPva, va_bert_embeddings, l_val_target, tinit, epoch, logfile, criterion, 'va')
early_stop(macrof1, epoch)
if opt.test_each>0:
if (opt.plotmode and (epoch==1 or epoch%opt.test_each==0)) or (not opt.plotmode and epoch%opt.test_each==0 and epoch<opt.nepochs):
@@ -186,78 +271,11 @@ def main():
print(f'running last {opt.val_epochs} training epochs on the validation set')
for val_epoch in range(1, opt.val_epochs + 1):
batcher_train.init_offset()
train(model, batcher_train, l_val_index, lPva, l_val_target, tinit, logfile, criterion, optim, epoch+val_epoch, method_name)
train(model, batcher_train, l_val_index, lPva, va_bert_embeddings, l_val_target, tinit, logfile, criterion, optim, epoch+val_epoch, method_name)
# final test
print('Training complete: testing')
test(model, batcher_eval, l_test_index, lPte, l_test_target, tinit, epoch, logfile, criterion, 'te')
test(model, batcher_eval, l_test_index, lPte, te_bert_embeddings, l_test_target, tinit, epoch, logfile, criterion, 'te')
def get_lr(optimizer):
for param_group in optimizer.param_groups:
return param_group['lr']
def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile, criterion, optim, epoch, method_name):
_dataset_path = opt.dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1]
loss_history = []
model.train()
for idx, (batch, post, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, lytr)):
optim.zero_grad()
_out = model(batch,post, lang)
loss = criterion(model(batch, post, lang), target)
loss.backward()
clip_gradient(model)
optim.step()
loss_history.append(loss.item())
if idx % opt.log_interval == 0:
interval_loss = np.mean(loss_history[-opt.log_interval:])
print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
mean_loss = np.mean(interval_loss)
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
return mean_loss
def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logfile, criterion, measure_prefix):
loss_history = []
model.eval()
langs = sorted(ltest_index.keys())
predictions = {l:[] for l in langs}
yte_stacked = {l:[] for l in langs}
batcher.init_offset()
for batch, post, target, lang in tqdm(batcher.batchify(ltest_index, ltest_posteriors, lyte), desc='evaluation: '):
logits = model(batch, post, lang)
loss = criterion(logits, target).item()
prediction = predict(logits)
predictions[lang].append(prediction)
yte_stacked[lang].append(target.detach().cpu().numpy())
loss_history.append(loss)
ly = {l:np.vstack(yte_stacked[l]) for l in langs}
ly_ = {l:np.vstack(predictions[l]) for l in langs}
l_eval = evaluate(ly, ly_)
metrics = []
for lang in langs:
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
if measure_prefix=='te':
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
mean_loss = np.mean(loss_history)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit)
return Mf1
# ----------------------------------------------------------------------------------------------------------------------
@@ -281,8 +299,6 @@ if __name__ == '__main__':
'language used to train the calibrated SVMs (only used if --posteriors is active)')
parser.add_argument('--log-interval', type=int, default=10, metavar='int', help='how many batches to wait before printing training status')
parser.add_argument('--log-file', type=str, default='../log/log.csv', metavar='str', help='path to the log csv file')
# parser.add_argument('--pickle-dir', type=str, default='../pickles', metavar='str', help=f'if set, specifies the path where to '
# f'save/load the dataset pickled (set to None if you prefer not to retain the pickle file)')
parser.add_argument('--test-each', type=int, default=0, metavar='int', help='how many epochs to wait before invoking test (default: 0, only at the end)')
parser.add_argument('--checkpoint-dir', type=str, default='../checkpoint', metavar='str', help='path to the directory containing checkpoints')
parser.add_argument('--net', type=str, default='rnn', metavar='str', help=f'net, one in {allowed_nets}')
@@ -299,7 +315,9 @@ if __name__ == '__main__':
'(default 300)')
parser.add_argument('--force', action='store_true', default=False, help='do not check if this experiment has already been run')
parser.add_argument('--tunable', action='store_true', default=False,
help='pretrained embeddings are tunable from the beginning (default False, i.e., static)')
parser.add_argument('--mbert', action='store_true', default=False,
help='use mBert embeddings')
opt = parser.parse_args()


@@ -3,7 +3,7 @@ from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch
from util.common import clip_gradient, predict
from util.common import predict
from time import time
from util.csv_log import CSVLog
from util.evaluation import evaluate
@@ -12,6 +12,7 @@ from torch.optim.lr_scheduler import StepLR
from sklearn.model_selection import train_test_split
from copy import deepcopy
import argparse
# from torch.utils.tensorboard import SummaryWriter
def check_sentences(sentences):
@@ -69,11 +70,14 @@ def get_dataset_name(datapath):
if id_split in possible_splits:
dataset_name = splitted[0].split('/')[-1]
return f'{dataset_name}_run{id_split}'
elif splitted[-2].split('.')[0] == 'full':
dataset_name = splitted[0].split('/')[-1]
return f'{dataset_name}_fullrun'
def load_datasets(datapath):
data = MultilingualDataset.load(datapath)
# data.set_view(languages=['it']) #, categories=[0, 1, 2, 3, 4]) # Testing with less langs
data.show_dimensions()
l_devel_raw, l_devel_target = data.training(target_as_csr=False)
@@ -82,8 +86,9 @@ def load_datasets(datapath):
return l_devel_raw, l_devel_target, l_test_raw, l_test_target
def do_tokenization(l_dataset, max_len=512):
print('# Starting Tokenization ...')
def do_tokenization(l_dataset, max_len=512, verbose=True):
if verbose:
print('# Starting Tokenization ...')
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
langs = l_dataset.keys()
l_tokenized = {}
@@ -91,7 +96,6 @@ def do_tokenization(l_dataset, max_len=512):
l_tokenized[lang] = tokenizer(l_dataset[lang],
truncation=True,
max_length=max_len,
# add_special_tokens=True,
padding='max_length')
return l_tokenized
@@ -162,7 +166,7 @@ def check_param_grad_status(model):
print('#' * 50)
def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile, val_step=False, val_dataloader=None, lang_ids=None):
def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile, writer):
_dataset_path = opt.dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1]
@@ -179,6 +183,10 @@ def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit,
optim.step()
loss_history.append(loss.item())
if writer is not None:
_n_step = (epoch - 1) * (len(train_dataloader)) + idx
writer.add_scalar('Loss_step/Train', loss, _n_step)
# Check tokenized sentences consistency
# check_sentences(batch.cpu())
@@ -187,16 +195,12 @@ def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit,
print(
f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
# if val_step and idx % 100 == 0:
# macrof1 = test(model, val_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va')
# early_stop
mean_loss = np.mean(interval_loss)
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
return mean_loss
def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, measure_prefix):
def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, measure_prefix, writer):
print('# Validating model ...')
loss_history = []
model.eval()
@@ -229,6 +233,8 @@ def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, mea
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
if writer is not None:
writer.add_scalars('Eval Metrics', {'Mf1': Mf1, 'mF1': mF1, 'MK': MK, 'mk':mk}, epoch)
mean_loss = np.mean(loss_history)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
@@ -281,6 +287,7 @@ def main():
va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=True)
te_dataloader = DataLoader(te_dataset, batch_size=2, shuffle=False)
# Initializing model
nC = tr_dataset.get_nclasses()
model = get_model(nC)
@ -289,29 +296,31 @@ def main():
optim = init_optimizer(model, lr=opt.lr) optim = init_optimizer(model, lr=opt.lr)
lr_scheduler = StepLR(optim, step_size=25, gamma=0.1) lr_scheduler = StepLR(optim, step_size=25, gamma=0.1)
early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience, early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience,
checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_dataset_name(opt.dataset)}') checkpoint=f'/home/andreapdr/funneling_pdr/hug_checkpoint/{method_name}-{get_dataset_name(opt.dataset)}',
# lr_scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optim, num_warmup_steps= , num_training_steps=) is_bert=True)
# print(model)
# Freezing encoder # Freezing encoder
# model = freeze_encoder(model) # model = freeze_encoder(model)
check_param_grad_status(model) check_param_grad_status(model)
# Tensorboard logger
# writer = SummaryWriter('../log/tensorboard_logs/')
# Training loop # Training loop
tinit = time() tinit = time()
lang_ids = va_dataset.lang_ids lang_ids = va_dataset.lang_ids
for epoch in range(1, opt.nepochs + 1): for epoch in range(1, opt.nepochs + 1):
print('# Start Training ...') print('# Start Training ...')
train(model, tr_dataloader, epoch, criterion, optim, method_name, tinit, logfile) train(model, tr_dataloader, epoch, criterion, optim, method_name, tinit, logfile, writer=None)
lr_scheduler.step() # reduces the learning rate lr_scheduler.step() # reduces the learning rate
# Validation # Validation
macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va') macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va', writer=None)
early_stop(macrof1, epoch) early_stop(macrof1, epoch)
if opt.test_each > 0: if opt.test_each > 0:
if (opt.plotmode and (epoch == 1 or epoch % opt.test_each == 0)) or ( if (opt.plotmode and (epoch == 1 or epoch % opt.test_each == 0)) or (
not opt.plotmode and epoch % opt.test_each == 0 and epoch < opt.nepochs): not opt.plotmode and epoch % opt.test_each == 0 and epoch < opt.nepochs):
test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te') test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te', writer=None)
if early_stop.STOP: if early_stop.STOP:
print('[early-stop] STOP') print('[early-stop] STOP')
@ -323,16 +332,19 @@ def main():
print('Training over. Performing final evaluation') print('Training over. Performing final evaluation')
model = early_stop.restore_checkpoint() model = early_stop.restore_checkpoint()
model = model.cuda()
if opt.val_epochs > 0: if opt.val_epochs > 0:
print(f'running last {opt.val_epochs} training epochs on the validation set') print(f'running last {opt.val_epochs} training epochs on the validation set')
for val_epoch in range(1, opt.val_epochs + 1): for val_epoch in range(1, opt.val_epochs + 1):
train(model, va_dataloader, epoch + val_epoch, criterion, optim, method_name, tinit, logfile) train(model, va_dataloader, epoch + val_epoch, criterion, optim, method_name, tinit, logfile, writer=None)
# final test # final test
print('Training complete: testing') print('Training complete: testing')
test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te') test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te', writer=None)
# writer.flush()
# writer.close()
exit('Code Executed!') exit('Code Executed!')
@ -372,6 +384,7 @@ if __name__ == '__main__':
# Testing different parameters ... # Testing different parameters ...
opt.weight_decay = 0.01 opt.weight_decay = 0.01
opt.lr = 1e-5 opt.lr = 1e-5
opt.patience = 5
main() main()
# TODO: refactor .cuda() -> .to(device) in order to check if the process is faster on CPU given the bigger batch size # TODO: refactor .cuda() -> .to(device) in order to check if the process is faster on CPU given the bigger batch size

110
src/main_mbert_extractor.py Normal file
View File

@ -0,0 +1,110 @@
from main_mbert import *
import pickle
class ExtractorDataset(Dataset):
"""
data: dict of language-specific tokenized data (no labels: this dataset is used only for feature extraction)
"""
def __init__(self, data):
self.langs = data.keys()
self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)}
for i, lang in enumerate(self.langs):
_data = data[lang]['input_ids']
_data = np.array(_data)
_lang_value = np.full(len(_data), self.lang_ids[lang])
if i == 0:
self.data = _data
self.lang_index = _lang_value
else:
self.data = np.vstack((self.data, _data))
self.lang_index = np.concatenate((self.lang_index, _lang_value))
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
x = self.data[idx]
lang = self.lang_index[idx]
return x, lang
def get_lang_ids(self):
return self.lang_ids
def feature_extractor(data, lang_ids, model_path='/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run0/'):
print('# Feature Extractor Mode...')
from transformers import BertConfig
config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True, num_labels=300)
model = BertForSequenceClassification.from_pretrained(model_path,
config=config).cuda()
"""
Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for
the output of each layer) of shape (batch_size, sequence_length, hidden_size)
"""
all_batch_embeddings = {}
id2lang = {v:k for k,v in lang_ids.items()}
with torch.no_grad():
for batch, target, lang_idx in data:
out = model(batch.cuda())
last_hidden_state = out[1][-1]
batch_embeddings = last_hidden_state[:, 0, :]
for i, l_idx in enumerate(lang_idx.numpy()):
if id2lang[l_idx] not in all_batch_embeddings.keys():
all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy()
else:
all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]],
batch_embeddings[i].detach().cpu().numpy()))
return all_batch_embeddings, id2lang
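For reference, a minimal standalone sketch of the slicing performed above: with output_hidden_states=True, the second element of the model output is the tuple of hidden states, and the [CLS] position of the last layer is taken as the document embedding. The base multilingual checkpoint is used here in place of a fine-tuned model path, and num_labels=300 mirrors the JRC setting above; both are illustrative assumptions.
import torch
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer
config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True, num_labels=300)
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', config=config)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
batch = tokenizer(['a toy document', 'another toy document'],
                  truncation=True, max_length=512, padding='max_length', return_tensors='pt')
with torch.no_grad():
    out = model(batch['input_ids'])
hidden_states = out[1]                         # tuple: embedding output + one tensor per layer
last_hidden_state = hidden_states[-1]          # (batch_size, sequence_length, hidden_size)
doc_embeddings = last_hidden_state[:, 0, :]    # [CLS] token -> (batch_size, 768)
print(doc_embeddings.shape)                    # torch.Size([2, 768])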
def main():
print('Running main ...')
print(f'Model path: {opt.modelpath}\nDataset path: {opt.dataset}')
DATAPATH = opt.dataset
MAX_LEN = 512
l_devel_raw, l_devel_target, l_test_raw, l_test_target = load_datasets(DATAPATH)
l_tokenized_tr = do_tokenization(l_devel_raw, max_len=MAX_LEN)
l_tokenized_te = do_tokenization(l_test_raw, max_len=MAX_LEN)
tr_dataset = TrainingDataset(l_tokenized_tr, l_devel_target)
tr_lang_ids = tr_dataset.lang_ids
te_dataset = TrainingDataset(l_tokenized_te, l_test_target)
te_lang_ids = te_dataset.lang_ids
tr_dataloader = DataLoader(tr_dataset, batch_size=64, shuffle=False) # Shuffle False to extract doc embeddings
te_dataloader = DataLoader(te_dataset, batch_size=64, shuffle=False)   # Shuffle False to extract doc embeddings in order
tr_all_batch_embeddings, id2lang_tr = feature_extractor(tr_dataloader, tr_lang_ids, opt.modelpath) # Extracting doc embed for devel
with open(f'{opt.modelpath}/TR_embed_{get_dataset_name(opt.dataset)}.pkl', 'wb') as outfile:
pickle.dump((tr_all_batch_embeddings, id2lang_tr), outfile)
te_all_batch_embeddings, id2lang_te = feature_extractor(te_dataloader, te_lang_ids, opt.modelpath) # Extracting doc embed for test
with open(f'{opt.modelpath}/TE_embed_{get_dataset_name(opt.dataset)}.pkl', 'wb') as outfile:
pickle.dump((te_all_batch_embeddings, id2lang_te), outfile)
exit('Extraction completed!')
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='mBert model document embedding extractor')
parser.add_argument('--dataset', type=str,
default='/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle',
metavar='datasetpath', help=f'path to the pickled dataset')
parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)')
parser.add_argument('--modelpath', type=str, default='/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run0',
metavar='modelpath', help=f'path to pre-trained mBert model')
opt = parser.parse_args()
main()

View File

@ -2,102 +2,41 @@ import os
from dataset_builder import MultilingualDataset from dataset_builder import MultilingualDataset
from learning.transformers import * from learning.transformers import *
from util.evaluation import * from util.evaluation import *
from optparse import OptionParser
from util.file import exists from util.file import exists
from util.results import PolylingualClassificationResults from util.results import PolylingualClassificationResults
from sklearn.svm import SVC from util.common import *
from util.parser_options import *
parser = OptionParser(usage="usage: %prog datapath [options]")
parser.add_option("-o", "--output", dest="output",
help="Result file", type=str, default='multiModal_log.csv')
parser.add_option("-P", "--posteriors", dest="posteriors", action='store_true',
help="Add posterior probabilities to the document embedding representation", default=False)
parser.add_option("-S", "--supervised", dest="supervised", action='store_true',
help="Add supervised (Word-Class Embeddings) to the document embedding representation", default=False)
parser.add_option("-U", "--pretrained", dest="pretrained", action='store_true',
help="Add pretrained MUSE embeddings to the document embedding representation", default=False)
parser.add_option("--l2", dest="l2", action='store_true',
help="Activates l2 normalization as a post-processing for the document embedding views", default=False)
parser.add_option("--allprob", dest="allprob", action='store_true',
help="All views are generated as posterior probabilities. This affects the supervised and pretrained "
"embeddings, for which a calibrated classifier is generated, which generates the posteriors", default=False)
parser.add_option("--feat-weight", dest="feat_weight",
help="Term weighting function to weight the averaged embeddings", type=str, default='tfidf')
parser.add_option("-w", "--we-path", dest="we_path",
help="Path to the MUSE polylingual word embeddings", default='../embeddings')
parser.add_option("-s", "--set_c", dest="set_c",type=float,
help="Set the C parameter", default=1)
parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
help="Optimize hyperparameters", default=False)
parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,
help="Number of parallel jobs (default is -1, all)", default=-1)
parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
default=300)
parser.add_option("-r", "--remove-pc", dest="sif", action='store_true',
help="Remove common component when computing dot product of word embedding matrices", default=False)
parser.add_option("-z", "--zscore", dest="zscore", action='store_true',
help="Z-score normalize matrices (WCE and MUSE)", default=False)
parser.add_option("-a", "--agg", dest="agg", action='store_true',
help="Set aggregation function of the common Z-space to average (Default: concatenation)", default=False)
def get_learner(calibrate=False, kernel='linear'):
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, gamma='auto')
def get_params():
if not op.optimc:
return None
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
kernel = 'rbf'
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
#######################################################################################################################
if __name__ == '__main__': if __name__ == '__main__':
(op, args) = parser.parse_args() (op, args) = parser.parse_args()
dataset = op.dataset
assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)'
dataset = args[0]
assert exists(dataset), 'Unable to find file '+str(dataset) assert exists(dataset), 'Unable to find file '+str(dataset)
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option' assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed' assert op.posteriors or op.supervised or op.pretrained or op.mbert or op.gruViewGenerator, \
l2=op.l2 'empty set of document embeddings is not allowed'
assert not (op.gruWCE or op.gruMUSE) or op.gruViewGenerator, 'Initializing the Gated Recurrent embedding layer (gruMUSE/gruWCE) requires the GRU View Generator to be enabled'
l2 = op.l2
dataset_file = os.path.basename(dataset) dataset_file = os.path.basename(dataset)
results = PolylingualClassificationResults('../log/' + op.output) results = PolylingualClassificationResults('../log/' + op.output)
allprob='Prob' if op.allprob else '' allprob = 'Prob' if op.allprob else ''
result_id = f'{dataset_file}_ProbPost={op.posteriors}_{allprob}WCE={op.supervised}(PCA={op.max_labels_S})_{allprob}' \
f'MUSE={op.pretrained}_weight={op.feat_weight}_l2={l2}_zscore={op.zscore}{"_optimC" if op.optimc else ""}'
print(f'{result_id}')
# set zscore range - is slice(0,0) mean will be equal to 0 and std to 1, thus normalization will have no effect # renaming arguments to be printed on log
standardize_range = slice(0,0) method_name, dataset_name = get_method_name(dataset, op.posteriors, op.supervised, op.pretrained, op.mbert,
op.gruViewGenerator, op.gruMUSE, op.gruWCE, op.agg, op.allprob)
print(f'Method: gFun{method_name}\nDataset: {dataset_name}')
print('-'*50)
# exit()  # debug leftover: this would abort the run right after printing the method name
# set z-score range: with slice(0, 0) the mean is set to 0 and the std to 1, so standardization has no effect
standardize_range = slice(0, 0)
if op.zscore: if op.zscore:
standardize_range = None standardize_range = None
# load dataset
data = MultilingualDataset.load(dataset) data = MultilingualDataset.load(dataset)
# data.set_view(languages=['fr', 'it']) # data.set_view(languages=['fr', 'it']) # TODO: DEBUG SETTING
data.show_dimensions() data.show_dimensions()
lXtr, lytr = data.training() lXtr, lytr = data.training()
lXte, lyte = data.test() lXte, lyte = data.test()
@ -108,68 +47,96 @@ if __name__ == '__main__':
# feature weighting (for word embeddings average) # feature weighting (for word embeddings average)
feat_weighting = FeatureWeight(op.feat_weight, agg='mean') feat_weighting = FeatureWeight(op.feat_weight, agg='mean')
# # document embedding modules # document embedding modules aka View Generators
doc_embedder = DocEmbedderList(aggregation='mean' if op.agg else 'concat') doc_embedder = DocEmbedderList(aggregation='mean' if op.agg else 'concat')
# init View Generators
if op.posteriors: if op.posteriors:
doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True, kernel='linear'), l2=l2)) """
View Generator (-X): casts TF-IDF document representations into vectors of posterior probabilities by means
of a set of calibrated SVMs.
"""
doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True,
kernel='linear',
C=op.set_c), l2=l2))
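For reference, an illustrative, self-contained sketch of this view on toy data (this is not the repo's PosteriorProbabilitiesEmbedder, only the underlying idea of mapping TF-IDF vectors to class posteriors with calibrated one-vs-rest SVMs):
import numpy as np
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
rng = np.random.RandomState(1)
X_tr = rng.rand(40, 100)                    # stand-in for a TF-IDF matrix (40 docs, 100 terms)
Y_tr = rng.randint(0, 2, size=(40, 3))      # 3 binary labels (multilabel setting)
X_te = rng.rand(10, 100)
clf = OneVsRestClassifier(SVC(kernel='linear', probability=True, C=1, random_state=1))
clf.fit(X_tr, Y_tr)
posteriors = clf.predict_proba(X_te)        # (10, 3): one posterior per class, used as the document view
print(posteriors.shape)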
if op.supervised: if op.supervised:
"""
View Generator (-W): generates document representations via Word-Class Embeddings (WCE).
Document embeddings are obtained as the weighted sum of the embeddings of their constituent words.
"""
wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, sif=op.sif) wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, sif=op.sif)
if op.allprob: if op.allprob:
wce = FeatureSet2Posteriors(wce, l2=l2) wce = FeatureSet2Posteriors(wce, requires_tfidf=True, l2=l2)
doc_embedder.append(wce) doc_embedder.append(wce)
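An illustrative sketch of the Word-Class Embedding idea on toy dense data (not the repo's WordClassEmbedder; the correlation measure and the normalization are simplified assumptions):
import numpy as np
rng = np.random.RandomState(1)
X = rng.rand(40, 100)                        # TF-IDF matrix: 40 docs x 100 terms
Y = rng.randint(0, 2, size=(40, 3))          # 3 binary labels
# term-class matrix: each term is represented by its (here, dot-product) correlation with the classes
WCE = X.T.dot(Y)                             # (100, 3)
WCE /= np.maximum(WCE.sum(axis=1, keepdims=True), 1e-9)
# document embeddings as the weighted average of the constituent terms' class vectors
doc_embed = X.dot(WCE) / np.maximum(X.sum(axis=1, keepdims=True), 1e-9)
print(doc_embed.shape)                       # (40, 3): one dimension per class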
if op.pretrained: if op.pretrained:
"""
View Generator (-M): generates document representations via MUSE embeddings (fastText multilingual word
embeddings aligned across languages). Document embeddings are obtained as the weighted sum of the
embeddings of their constituent words.
"""
muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif) muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif)
if op.allprob: if op.allprob:
muse = FeatureSet2Posteriors(muse, l2=l2) muse = FeatureSet2Posteriors(muse, requires_tfidf=True, l2=l2)
doc_embedder.append(muse) doc_embedder.append(muse)
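Similarly, a toy sketch of the embedding-averaging step behind this view (not the repo's MuseEmbedder; dimensions and the weighting scheme are illustrative). Because the word embeddings are aligned across languages, the averaged document vectors of different languages land in the same space.
import numpy as np
rng = np.random.RandomState(1)
X = rng.rand(40, 100)            # TF-IDF matrix for one language: 40 docs x 100 terms
E = rng.randn(100, 300)          # aligned word embeddings for the same 100 terms (MUSE-style, 300-d)
weights = X / np.maximum(X.sum(axis=1, keepdims=True), 1e-9)   # per-document term weights
doc_embed = weights.dot(E)       # (40, 300): weighted average of the terms' embeddings
print(doc_embed.shape)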
if op.gruViewGenerator:
"""
View Generator (-G): generates document embeddings by means of a Gated Recurrent Unit network. The embedding
layer can be initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE). Such
document embeddings are then cast into vectors of posterior probabilities via a set of SVMs.
NB: --allprob has no effect on this View Generator, since its output is already encoded as posterior probabilities.
"""
rnn_embedder = RecurrentEmbedder(pretrained=op.gruMUSE, supervised=op.gruWCE, multilingual_dataset=data,
options=op, model_path=op.gru_path)
doc_embedder.append(rnn_embedder)
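A compact sketch of the kind of recurrent encoder this view relies on (not the repo's RecurrentEmbedder; sizes and the single-layer GRU are assumptions): the last hidden state serves as the document embedding, and a linear head produces the class scores that are later calibrated into posteriors.
import torch
import torch.nn as nn
class TinyGRUEncoder(nn.Module):
    def __init__(self, vocab_size=1000, embed_dim=300, hidden_size=256, n_classes=3):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_size, batch_first=True)
        self.head = nn.Linear(hidden_size, n_classes)
    def forward(self, token_ids):
        embedded = self.embed(token_ids)      # (batch, seq_len, embed_dim)
        _, h_n = self.gru(embedded)           # h_n: (1, batch, hidden_size)
        doc_embedding = h_n[-1]               # last hidden state as document embedding
        return self.head(doc_embedding)       # class scores
toy_batch = torch.randint(0, 1000, (8, 50))   # 8 documents, 50 token ids each
print(TinyGRUEncoder()(toy_batch).shape)      # torch.Size([8, 3])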
if op.mbert:
"""
View Generator (-B): generates document embeddings via the mBERT model.
"""
mbert = MBertEmbedder(path_to_model=op.bert_path,
nC=data.num_categories())
if op.allprob:
mbert = FeatureSet2Posteriors(mbert, l2=l2)
doc_embedder.append(mbert)
# metaclassifier # metaclassifier
meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}] meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'), meta_parameters=get_params(), standardize_range=standardize_range) meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf', C=op.set_c),
meta_parameters=get_params(op.optimc), standardize_range=standardize_range)
# ensembling the modules # ensembling the modules
classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta) classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)
print('# Fitting ...') print('\n# Fitting Funnelling Architecture...')
tinit = time.time()
classifier.fit(lXtr, lytr) classifier.fit(lXtr, lytr)
train_time = time.time() - tinit  # renamed to avoid shadowing the time module
print('\n# Evaluating ...') print('\n# Evaluating ...')
l_eval = evaluate_method(classifier, lXte, lyte) l_eval = evaluate_method(classifier, lXte, lyte)
# renaming arguments to be printed on log
_id = ''
_id_conf = [op.posteriors, op.supervised, op.pretrained]
_id_name = ['+P', '+W', '+M']
for i, conf in enumerate(_id_conf):
if conf:
_id += _id_name[i]
_id = _id.lstrip('+')
_id = _id if not op.agg else _id + '_mean'
_id = _id if not op.allprob else _id + '_allprob'
_dataset_path = dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1]
metrics = [] metrics = []
for lang in lXte.keys(): for lang in lXte.keys():
macrof1, microf1, macrok, microk = l_eval[lang] macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk]) metrics.append([macrof1, microf1, macrok, microk])
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
results.add_row(method='MultiModal', results.add_row(method='MultiModal',
learner='svm', learner='SVM',
optimp=op.optimc, optimp=op.optimc,
sif= op.sif, sif=op.sif,
zscore=op.zscore, zscore=op.zscore,
l2= op.l2, l2=op.l2,
wescaler= op.feat_weight, wescaler=op.feat_weight,
pca=op.max_labels_S, pca=op.max_labels_S,
id=_id, id=method_name,
dataset=dataset_id, dataset=dataset_name,
time='todo', time=train_time,
lang=lang, lang=lang,
macrof1=macrof1, macrof1=macrof1,
microf1=microf1, microf1=microf1,
macrok=macrok, macrok=macrok,
microk=microk, microk=microk,
notes='') notes='')
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0)) print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3))

View File

@ -0,0 +1,49 @@
import os
from dataset_builder import MultilingualDataset
from optparse import OptionParser
from util.file import exists
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
parser = OptionParser(usage="usage: %prog datapath [options]")
(op, args) = parser.parse_args()
assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)'
dataset = args[0]
assert exists(dataset), 'Unable to find file '+str(dataset)
dataset_file = os.path.basename(dataset)
data = MultilingualDataset.load(dataset)
data.set_view(languages=['it'])
data.show_dimensions()
lXtr, lytr = data.training()
lXte, lyte = data.test()
vect_lXtr = dict()
vectorizer = CountVectorizer()
vect_lXtr['it'] = vectorizer.fit_transform(lXtr['it'])
# print(type(vect_lXtr['it']))
corr = vect_lXtr['it'].T.dot(lytr['it'])
# print(corr.shape)
sum_correlated_class = corr.sum(axis=0)
print(len(sum_correlated_class))
print(sum_correlated_class.max())
w2idx = vectorizer.vocabulary_
idx2w = {v:k for k,v in w2idx.items()}
word_tot_corr = corr.sum(axis=1)
print(word_tot_corr.shape)
dict_word_tot_corr = {v:k for k,v in enumerate(word_tot_corr)}
sorted_word_tot_corr = np.sort(word_tot_corr)
sorted_word_tot_corr = sorted_word_tot_corr[len(sorted_word_tot_corr)-200:]
top_idx = [dict_word_tot_corr[k] for k in sorted_word_tot_corr]
print([idx2w[idx] for idx in top_idx])
print([elem for elem in top_idx])
print(corr[8709])
print('Finished...')

View File

@ -8,7 +8,8 @@ from models.helpers import *
class RNNMultilingualClassifier(nn.Module): class RNNMultilingualClassifier(nn.Module):
def __init__(self, output_size, hidden_size, lvocab_size, learnable_length, lpretrained=None, def __init__(self, output_size, hidden_size, lvocab_size, learnable_length, lpretrained=None,
drop_embedding_range=None, drop_embedding_prop=0, post_probabilities=True, only_post=False): drop_embedding_range=None, drop_embedding_prop=0, post_probabilities=True, only_post=False,
bert_embeddings=False):
super(RNNMultilingualClassifier, self).__init__() super(RNNMultilingualClassifier, self).__init__()
self.output_size = output_size self.output_size = output_size
@ -16,6 +17,7 @@ class RNNMultilingualClassifier(nn.Module):
self.drop_embedding_range = drop_embedding_range self.drop_embedding_range = drop_embedding_range
self.drop_embedding_prop = drop_embedding_prop self.drop_embedding_prop = drop_embedding_prop
self.post_probabilities = post_probabilities self.post_probabilities = post_probabilities
self.bert_embeddings = bert_embeddings
assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: wrong range' assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: wrong range'
self.lpretrained_embeddings = nn.ModuleDict() self.lpretrained_embeddings = nn.ModuleDict()
@ -56,19 +58,24 @@ class RNNMultilingualClassifier(nn.Module):
if only_post: if only_post:
self.label = nn.Linear(output_size, output_size) self.label = nn.Linear(output_size, output_size)
elif post_probabilities: elif post_probabilities and not bert_embeddings:
self.label = nn.Linear(ff2+output_size, output_size) self.label = nn.Linear(ff2 + output_size, output_size)
elif bert_embeddings and not post_probabilities:
self.label = nn.Linear(ff2 + 768, output_size)
elif post_probabilities and bert_embeddings:
self.label = nn.Linear(ff2 + output_size + 768, output_size)
else: else:
self.label = nn.Linear(ff2, output_size) self.label = nn.Linear(ff2, output_size)
def forward(self, input, post, bert_embed, lang):
def forward(self, input, post, lang):
if self.only_post: if self.only_post:
doc_embedding = post doc_embedding = post
else: else:
doc_embedding = self.transform(input, lang) doc_embedding = self.transform(input, lang)
if self.post_probabilities: if self.post_probabilities:
doc_embedding = torch.cat([doc_embedding, post], dim=1) doc_embedding = torch.cat([doc_embedding, post], dim=1)
if self.bert_embeddings:
doc_embedding = torch.cat([doc_embedding, bert_embed], dim=1)
logits = self.label(doc_embedding) logits = self.label(doc_embedding)
return logits return logits
@ -83,7 +90,7 @@ class RNNMultilingualClassifier(nn.Module):
# c_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda()) # c_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda())
# output, (_, _) = self.lstm(input, (h_0, c_0)) # output, (_, _) = self.lstm(input, (h_0, c_0))
output, _ = self.rnn(input, h_0) output, _ = self.rnn(input, h_0)
output = output[-1,:,:] output = output[-1, :, :]
output = F.relu(self.linear0(output)) output = F.relu(self.linear0(output))
output = self.dropout(F.relu(self.linear1(output))) output = self.dropout(F.relu(self.linear1(output)))
output = self.dropout(F.relu(self.linear2(output))) output = self.dropout(F.relu(self.linear2(output)))
@ -94,3 +101,14 @@ class RNNMultilingualClassifier(nn.Module):
self.lpretrained_embeddings[l].requires_grad = True self.lpretrained_embeddings[l].requires_grad = True
self.lpretrained_embeddings[l].weight.requires_grad = True self.lpretrained_embeddings[l].weight.requires_grad = True
def get_embeddings(self, input, lang):
batch_size = input.shape[0]
input = embed(self, input, lang)
input = embedding_dropout(input, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop,
training=self.training)
input = input.permute(1, 0, 2)
h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size).cuda())
output, _ = self.rnn(input, h_0)
output = output[-1, :, :]
return output.cpu().detach().numpy()

249
src/models/mBert.py Normal file
View File

@ -0,0 +1,249 @@
from copy import deepcopy
import torch
from torch.utils.data import Dataset
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, BertConfig
from sklearn.model_selection import train_test_split
from util.evaluation import *
from time import time
def predict(logits, classification_type='multilabel'):
if classification_type == 'multilabel':
prediction = torch.sigmoid(logits) > 0.5
elif classification_type == 'singlelabel':
prediction = torch.argmax(logits, dim=1).view(-1, 1)
else:
raise ValueError(f'unknown classification type: {classification_type}')  # avoids returning an undefined prediction
return prediction.detach().cpu().numpy()
class TrainingDataset(Dataset):
"""
data: dict of lang specific tokenized data
labels: dict of lang specific targets
"""
def __init__(self, data, labels):
self.langs = data.keys()
self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)}
for i, lang in enumerate(self.langs):
_data = data[lang]['input_ids']
_data = np.array(_data)
_labels = labels[lang]
_lang_value = np.full(len(_data), self.lang_ids[lang])
if i == 0:
self.data = _data
self.labels = _labels
self.lang_index = _lang_value
else:
self.data = np.vstack((self.data, _data))
self.labels = np.vstack((self.labels, _labels))
self.lang_index = np.concatenate((self.lang_index, _lang_value))
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
x = self.data[idx]
y = self.labels[idx]
lang = self.lang_index[idx]
return x, torch.tensor(y, dtype=torch.float), lang
def get_lang_ids(self):
return self.lang_ids
def get_nclasses(self):
if hasattr(self, 'labels'):
return len(self.labels[0])
else:
print('Method called before init!')
class ExtractorDataset(Dataset):
"""
data: dict of language-specific tokenized data (no labels: this dataset is used only for feature extraction)
"""
def __init__(self, data):
self.langs = data.keys()
self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)}
for i, lang in enumerate(self.langs):
_data = data[lang]['input_ids']
_data = np.array(_data)
_lang_value = np.full(len(_data), self.lang_ids[lang])
if i == 0:
self.data = _data
self.lang_index = _lang_value
else:
self.data = np.vstack((self.data, _data))
self.lang_index = np.concatenate((self.lang_index, _lang_value))
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
x = self.data[idx]
lang = self.lang_index[idx]
return x, lang
def get_lang_ids(self):
return self.lang_ids
def get_model(n_out):
print('# Initializing model ...')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=n_out)
return model
def init_optimizer(model, lr, weight_decay=0):
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in model.named_parameters()
if not any(nd in n for nd in no_decay)],
'weight_decay': weight_decay},
{'params': [p for n, p in model.named_parameters()
if any(nd in n for nd in no_decay)],
'weight_decay': 0.0}  # bias and LayerNorm parameters are excluded from weight decay
]
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
return optimizer
def get_lr(optimizer):
for param_group in optimizer.param_groups:
return param_group['lr']
def get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop, max_val, seed):
l_split_va = deepcopy(l_tokenized_tr)
l_split_val_target = {l: [] for l in l_tokenized_tr.keys()}
l_split_tr = deepcopy(l_tokenized_tr)
l_split_tr_target = {l: [] for l in l_tokenized_tr.keys()}
for lang in l_tokenized_tr.keys():
val_size = int(min(len(l_tokenized_tr[lang]['input_ids']) * val_prop, max_val))
l_split_tr[lang]['input_ids'], l_split_va[lang]['input_ids'], l_split_tr_target[lang], l_split_val_target[
lang] = \
train_test_split(l_tokenized_tr[lang]['input_ids'], l_devel_target[lang], test_size=val_size,
random_state=seed, shuffle=True)
return l_split_tr, l_split_tr_target, l_split_va, l_split_val_target
def do_tokenization(l_dataset, max_len=512, verbose=True):
if verbose:
print('# Starting Tokenization ...')
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
langs = l_dataset.keys()
l_tokenized = {}
for lang in langs:
l_tokenized[lang] = tokenizer(l_dataset[lang],
truncation=True,
max_length=max_len,
padding='max_length')
return l_tokenized
def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile, log_interval=10):
# _dataset_path = opt.dataset.split('/')[-1].split('_')
# dataset_id = _dataset_path[0] + _dataset_path[-1]
dataset_id = 'TODO fix this!'
loss_history = []
model.train()
for idx, (batch, target, lang_idx) in enumerate(train_dataloader):
optim.zero_grad()
out = model(batch.cuda())
logits = out[0]
loss = criterion(logits, target.cuda())
loss.backward()
# clip_gradient(model)
optim.step()
loss_history.append(loss.item())
if idx % log_interval == 0:
interval_loss = np.mean(loss_history[-log_interval:])  # mean loss over the last log_interval steps
print(
f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
mean_loss = np.mean(loss_history)
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
return mean_loss
def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, measure_prefix):
print('# Validating model ...')
loss_history = []
model.eval()
langs = lang_ids.keys()
id_2_lang = {v: k for k, v in lang_ids.items()}
predictions = {l: [] for l in langs}
yte_stacked = {l: [] for l in langs}
for batch, target, lang_idx in test_dataloader:
out = model(batch.cuda())
logits = out[0]
loss = criterion(logits, target.cuda()).item()
prediction = predict(logits)
loss_history.append(loss)
# Assigning prediction to dict in predictions and yte_stacked according to lang_idx
for i, pred in enumerate(prediction):
lang_pred = id_2_lang[lang_idx.numpy()[i]]
predictions[lang_pred].append(pred)
yte_stacked[lang_pred].append(target[i].detach().cpu().numpy())
ly = {l: np.vstack(yte_stacked[l]) for l in langs}
ly_ = {l: np.vstack(predictions[l]) for l in langs}
l_eval = evaluate(ly, ly_)
metrics = []
for lang in langs:
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
if measure_prefix == 'te':
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
mean_loss = np.mean(loss_history)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit)
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit)
return Mf1
def feature_extractor(data, lang_ids, model):
print('# Feature Extractor Mode...')
"""
Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for
the output of each layer) of shape (batch_size, sequence_length, hidden_size)
"""
all_batch_embeddings = {}
id2lang = {v: k for k, v in lang_ids.items()}
with torch.no_grad():
for batch, lang_idx in data:
# for batch, target, lang_idx in data:
out = model(batch.cuda())
last_hidden_state = out[1][-1]
batch_embeddings = last_hidden_state[:, 0, :]
for i, l_idx in enumerate(lang_idx.numpy()):
if id2lang[l_idx] not in all_batch_embeddings.keys():
all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy()
else:
all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]],
batch_embeddings[i].detach().cpu().numpy()))
return all_batch_embeddings, id2lang

16
src/run_fun_bert_jrc.sh Normal file
View File

@ -0,0 +1,16 @@
#!/usr/bin/env bash
#dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
#logfile=../log/log_FunBert_jrc.csv
#
#runs='0 1 2 3 4'
#for run in $runs
#do
# dataset=$dataset_path$run.pickle
# python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile #--tunable
#done
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle
logfile=../log/log_FunBert_fulljrc_static.csv
python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile

16
src/run_fun_bert_rcv.sh Normal file
View File

@ -0,0 +1,16 @@
#!/usr/bin/env bash
#dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
#logfile=../log/log_FunBert_rcv_static.csv
#
#runs='0 1 2 3 4'
#for run in $runs
#do
# dataset=$dataset_path$run.pickle
# python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile
#done
dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
logfile=../log/log_FunBert_fullrcv_static.csv
python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile

15
src/run_mbert_jrc.sh Normal file
View File

@ -0,0 +1,15 @@
#!/usr/bin/env bash
#dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
#logfile=../log/log_mBert_jrc_NEW.csv
#
#runs='0 1 2 3 4'
#for run in $runs
#do
# dataset=$dataset_path$run.pickle
# python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50
#done
logfile=../log/log_mBert_fulljrc.csv
dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle
python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50

View File

@ -1,11 +1,15 @@
#!/usr/bin/env bash #!/usr/bin/env bash
dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run #dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
logfile=../log/log_Mbert_rcv.csv #logfile=../log/log_mBert_rcv_NEW.csv
#
#runs='0 1 2 3 4'
#for run in $runs
#do
# dataset=$dataset_path$run.pickle
# python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50
#done
runs='1 2 3 4 5 6 7 8 9' logfile=../log/log_mBert_fullrcv.csv
for run in $runs dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
do python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=30 --patience 3
dataset=$dataset_path$run.pickle
python new_mbert.py --dataset $dataset --log-file $logfile --nepochs=5 --weight_decay=0.01 --lr=1e-5
done

View File

@ -1,15 +1,14 @@
import subprocess
import warnings import warnings
import time
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from embeddings.supervised import get_supervised_embeddings from embeddings.supervised import get_supervised_embeddings
from learning.transformers import PosteriorProbabilitiesEmbedder, TfidfVectorizerMultilingual # from learning.transformers import PosteriorProbabilitiesEmbedder, TfidfVectorizerMultilingual
warnings.filterwarnings("ignore", category=DeprecationWarning)
import numpy as np import numpy as np
from tqdm import tqdm from tqdm import tqdm
import torch import torch
from scipy.sparse import vstack, issparse warnings.filterwarnings("ignore", category=DeprecationWarning)
def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary): def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
@ -161,12 +160,13 @@ class Index:
def none_dict(langs): def none_dict(langs):
return {l:None for l in langs} return {l:None for l in langs}
class MultilingualIndex: class MultilingualIndex:
def __init__(self): #, add_language_trace=False): def __init__(self): #, add_language_trace=False):
self.l_index = {} self.l_index = {}
self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
# self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True, max_features=25000) # self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True, max_features=25000)
# self.add_language_trace=add_language_trace # self.add_language_trace=add_language_trace
def index(self, l_devel_raw, l_devel_target, l_test_raw, l_pretrained_vocabulary): def index(self, l_devel_raw, l_devel_target, l_test_raw, l_pretrained_vocabulary):
self.langs = sorted(l_devel_raw.keys()) self.langs = sorted(l_devel_raw.keys())
@ -184,6 +184,8 @@ class MultilingualIndex:
for l,index in self.l_index.items(): for l,index in self.l_index.items():
index.train_val_split(val_prop, max_val, seed=seed) index.train_val_split(val_prop, max_val, seed=seed)
def embedding_matrices(self, lpretrained, supervised): def embedding_matrices(self, lpretrained, supervised):
lXtr = self.get_lXtr() if supervised else none_dict(self.langs) lXtr = self.get_lXtr() if supervised else none_dict(self.langs)
lYtr = self.l_train_target() if supervised else none_dict(self.langs) lYtr = self.l_train_target() if supervised else none_dict(self.langs)
@ -191,52 +193,133 @@ class MultilingualIndex:
index.compose_embedding_matrix(lpretrained[l], supervised, lXtr[l], lYtr[l]) index.compose_embedding_matrix(lpretrained[l], supervised, lXtr[l], lYtr[l])
self.sup_range = index.wce_range self.sup_range = index.wce_range
# experimental... does it make sense to keep track of the language? i.e., to inform the network from which # TODO circular import with transformers --> when generating posterior prob, we import PosteriorProbabilitiesEmbedder which is defined in transformers
# language does the data came from... # def posterior_probabilities(self, max_training_docs_by_lang=5000, store_posteriors=False, stored_post=False):
# if self.add_language_trace and pretrained_embeddings is not None: # # choose a maximum of "max_training_docs_by_lang" for training the calibrated SVMs
# print('adding language trace') # timeit = time.time()
# lang_trace = torch.zeros(size=(vocabsize, len(self.langs))) # lXtr = {l:Xtr for l,Xtr in self.get_lXtr().items()}
# lang_trace[:,i]=1 # lYtr = {l:Ytr for l,Ytr in self.l_train_target().items()}
# pretrained_embeddings = torch.cat([pretrained_embeddings, lang_trace], dim=1) # if not stored_post:
# for l in self.langs:
# n_elements = lXtr[l].shape[0]
# if n_elements > max_training_docs_by_lang:
# choice = np.random.permutation(n_elements)[:max_training_docs_by_lang]
# lXtr[l] = lXtr[l][choice]
# lYtr[l] = lYtr[l][choice]
#
# # train the posterior probabilities embedder
# print('[posteriors] training a calibrated SVM')
# learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto')
# prob_embedder = PosteriorProbabilitiesEmbedder(learner, l2=False)
# prob_embedder.fit(lXtr, lYtr)
#
# # transforms the training, validation, and test sets into posterior probabilities
# print('[posteriors] generating posterior probabilities')
# lPtr = prob_embedder.transform(self.get_lXtr())
# lPva = prob_embedder.transform(self.get_lXva())
# lPte = prob_embedder.transform(self.get_lXte())
# # NB: Check splits indices !
# if store_posteriors:
# import pickle
# with open('../dumps/posteriors_fulljrc.pkl', 'wb') as outfile:
# pickle.dump([lPtr, lPva, lPte], outfile)
# print(f'Successfully dumped posteriors!')
# else:
# import pickle
# with open('../dumps/posteriors_fulljrc.pkl', 'rb') as infile:
# lPtr, lPva, lPte = pickle.load(infile)
# print(f'Successfully loaded stored posteriors!')
# print(f'[posteriors] done in {time.time() - timeit}')
# return lPtr, lPva, lPte
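Regarding the circular-import TODO above, a common way out (shown only as a sketch; the module and class names follow the comment, the body is elided) is to defer the import to the method that needs it:
def posterior_probabilities(self, max_training_docs_by_lang=5000, store_posteriors=False, stored_post=False):
    # local import: learning.transformers itself imports from this module, so a module-level
    # import would create a cycle; importing inside the method delays resolution until both
    # modules have been fully initialized
    from learning.transformers import PosteriorProbabilitiesEmbedder
    ...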
def bert_embeddings(self, bert_path, max_len=512, batch_size=64, stored_embeddings=False):
show_gpu('GPU memory before initializing mBert model:')
# TODO: load dumped embeddings?
from main_mbert_extractor import do_tokenization, ExtractorDataset, DataLoader
from transformers import BertConfig, BertForSequenceClassification
print('[mBERT] generating mBERT doc embeddings')
lXtr_raw = self.get_raw_lXtr()
lXva_raw = self.get_raw_lXva()
lXte_raw = self.get_raw_lXte()
print('# Tokenizing datasets')
l_tokenized_tr = do_tokenization(lXtr_raw, max_len=max_len, verbose=False)
tr_dataset = ExtractorDataset(l_tokenized_tr)
tr_lang_ids = tr_dataset.lang_ids
tr_dataloader = DataLoader(tr_dataset, batch_size=batch_size, shuffle=False)
l_tokenized_va = do_tokenization(lXva_raw, max_len=max_len, verbose=False)
va_dataset = ExtractorDataset(l_tokenized_va)
va_lang_ids = va_dataset.lang_ids
va_dataloader = DataLoader(va_dataset, batch_size=batch_size, shuffle=False)
l_tokenized_te = do_tokenization(lXte_raw, max_len=max_len, verbose=False)
te_dataset = ExtractorDataset(l_tokenized_te)
te_lang_ids = te_dataset.lang_ids
te_dataloader = DataLoader(te_dataset, batch_size=batch_size, shuffle=False)
num_labels = self.l_index[self.langs[0]].val_target.shape[1]
config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True,
num_labels=num_labels)
model = BertForSequenceClassification.from_pretrained(bert_path,
config=config).cuda()
print('# Extracting document embeddings')
tr_bert_embeddings, id2lang_tr = self.do_bert_embeddings(model, tr_dataloader, tr_lang_ids, verbose=False)
va_bert_embeddings, id2lang_va = self.do_bert_embeddings(model, va_dataloader, va_lang_ids, verbose=False)
te_bert_embeddings, id2lang_te = self.do_bert_embeddings(model, te_dataloader, te_lang_ids, verbose=False)
show_gpu('GPU memory after running the mBert model:')
# Freeing GPU's memory
import gc
del model, tr_dataloader, va_dataloader, te_dataloader
gc.collect()
torch.cuda.empty_cache()
show_gpu('GPU memory after clearing cache:')
return tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings
def posterior_probabilities(self, max_training_docs_by_lang=5000, store_posteriors=False, stored_post=False): @staticmethod
# choose a maximum of "max_training_docs_by_lang" for training the calibrated SVMs def do_bert_embeddings(model, data, lang_ids, verbose=True):
timeit = time.time() if verbose:
lXtr = {l:Xtr for l,Xtr in self.get_lXtr().items()} print('# Feature Extractor Mode...')
lYtr = {l:Ytr for l,Ytr in self.l_train_target().items()} all_batch_embeddings = {}
if not stored_post: id2lang = {v: k for k, v in lang_ids.items()}
for l in self.langs: with torch.no_grad():
n_elements = lXtr[l].shape[0] for batch, lang_idx in data:
if n_elements > max_training_docs_by_lang: out = model(batch.cuda())
choice = np.random.permutation(n_elements)[:max_training_docs_by_lang] last_hidden_state = out[1][-1]
lXtr[l] = lXtr[l][choice] batch_embeddings = last_hidden_state[:, 0, :]
lYtr[l] = lYtr[l][choice] for i, l_idx in enumerate(lang_idx.numpy()):
if id2lang[l_idx] not in all_batch_embeddings.keys():
all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy()
else:
all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]],
batch_embeddings[i].detach().cpu().numpy()))
# train the posterior probabilities embedder return all_batch_embeddings, id2lang
print('[posteriors] training a calibrated SVM')
learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto')
prob_embedder = PosteriorProbabilitiesEmbedder(learner, l2=False)
prob_embedder.fit(lXtr, lYtr)
# transforms the training, validation, and test sets into posterior probabilities def get_raw_lXtr(self):
print('[posteriors] generating posterior probabilities') lXtr_raw = {k:[] for k in self.langs}
lPtr = prob_embedder.transform(self.get_lXtr()) lYtr_raw = {k: [] for k in self.langs}
lPva = prob_embedder.transform(self.get_lXva()) for lang in self.langs:
lPte = prob_embedder.transform(self.get_lXte()) lXtr_raw[lang] = self.l_index[lang].train_raw
# NB: Check splits indices ! lYtr_raw[lang] = self.l_index[lang].train_raw
if store_posteriors: return lXtr_raw
import pickle
with open('../dumps/posteriors_fulljrc.pkl', 'wb') as outfile: def get_raw_lXva(self):
pickle.dump([lPtr, lPva, lPte], outfile) lXva_raw = {k: [] for k in self.langs}
print(f'Successfully dumped posteriors!') for lang in self.langs:
else: lXva_raw[lang] = self.l_index[lang].val_raw
import pickle
with open('../dumps/posteriors_fulljrc.pkl', 'rb') as infile: return lXva_raw
lPtr, lPva, lPte = pickle.load(infile)
print(f'Successfully loaded stored posteriors!') def get_raw_lXte(self):
print(f'[posteriors] done in {time.time() - timeit}') lXte_raw = {k: [] for k in self.langs}
return lPtr, lPva, lPte for lang in self.langs:
lXte_raw[lang] = self.l_index[lang].test_raw
return lXte_raw
def get_lXtr(self): def get_lXtr(self):
if not hasattr(self, 'lXtr'): if not hasattr(self, 'lXtr'):
@ -277,6 +360,12 @@ class MultilingualIndex:
def l_test_index(self): def l_test_index(self):
return {l: index.test_index for l, index in self.l_index.items()} return {l: index.test_index for l, index in self.l_index.items()}
def l_devel_index(self):
return {l: index.devel_index for l, index in self.l_index.items()}
def l_devel_target(self):
return {l: index.devel_target for l, index in self.l_index.items()}
def l_train(self): def l_train(self):
return self.l_train_index(), self.l_train_target() return self.l_train_index(), self.l_train_target()
@ -284,7 +373,6 @@ class MultilingualIndex:
return self.l_val_index(), self.l_val_target() return self.l_val_index(), self.l_val_target()
class Batch: class Batch:
def __init__(self, batchsize, batches_per_epoch, languages, lpad, max_pad_length=500): def __init__(self, batchsize, batches_per_epoch, languages, lpad, max_pad_length=500):
self.batchsize = batchsize self.batchsize = batchsize
@ -297,7 +385,7 @@ class Batch:
def init_offset(self): def init_offset(self):
self.offset = {lang: 0 for lang in self.languages} self.offset = {lang: 0 for lang in self.languages}
def batchify(self, l_index, l_post, llabels): def batchify(self, l_index, l_post, l_bert, llabels): # TODO: add bert embedding here...
langs = self.languages langs = self.languages
l_num_samples = {l:len(l_index[l]) for l in langs} l_num_samples = {l:len(l_index[l]) for l in langs}
@ -322,6 +410,10 @@ class Batch:
if l_post is not None: if l_post is not None:
post = torch.FloatTensor(l_post[lang][batch_slice]).cuda() post = torch.FloatTensor(l_post[lang][batch_slice]).cuda()
bert_emb = None
if l_bert is not None:
bert_emb = torch.FloatTensor(l_bert[lang][batch_slice]).cuda()
batch = pad(batch, pad_index=self.lpad[lang], max_pad_length=self.max_pad_length) batch = pad(batch, pad_index=self.lpad[lang], max_pad_length=self.max_pad_length)
batch = torch.LongTensor(batch).cuda() batch = torch.LongTensor(batch).cuda()
@ -329,7 +421,7 @@ class Batch:
self.offset[lang] = limit self.offset[lang] = limit
yield batch, post, target, lang yield batch, post, bert_emb, target, lang
def batchify(l_index, l_post, llabels, batchsize, lpad, max_pad_length=500): def batchify(l_index, l_post, llabels, batchsize, lpad, max_pad_length=500):
@ -384,7 +476,81 @@ def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad) return sum(p.numel() for p in model.parameters() if p.requires_grad)
def show_gpu(msg):
"""
ref: https://discuss.pytorch.org/t/access-gpu-memory-usage-in-pytorch/3192/4
"""
def query(field):
return (subprocess.check_output(
['nvidia-smi', f'--query-gpu={field}',
'--format=csv,nounits,noheader'],
encoding='utf-8'))
def to_int(result):
return int(result.strip().split('\n')[0])
used = to_int(query('memory.used'))
total = to_int(query('memory.total'))
pct = used / total
print('\n' + msg, f'{100 * pct:2.1f}% ({used} out of {total})')
class TfidfVectorizerMultilingual:
def __init__(self, **kwargs):
self.kwargs = kwargs
def fit(self, lX, ly=None):
self.langs = sorted(lX.keys())
self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs}
return self
def transform(self, lX):
return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs}
def fit_transform(self, lX, ly=None):
return self.fit(lX, ly).transform(lX)
def vocabulary(self, l=None):
if l is None:
return {l: self.vectorizer[l].vocabulary_ for l in self.langs}
else:
return self.vectorizer[l].vocabulary_
def get_analyzer(self, l=None):
if l is None:
return {l: self.vectorizer[l].build_analyzer() for l in self.langs}
else:
return self.vectorizer[l].build_analyzer()
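A quick usage sketch of the class above on toy data (it assumes this runs in the same module, with scikit-learn installed): input and output keep the {language: documents} dictionary layout used throughout the codebase.
lX_toy = {'en': ['a toy english document', 'another english document'],
          'it': ['un documento di prova', 'un altro documento']}
vect_toy = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
lX_tfidf = vect_toy.fit_transform(lX_toy)
print({l: X.shape for l, X in lX_tfidf.items()})   # one (n_docs, n_terms) TF-IDF matrix per language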
def get_learner(calibrate=False, kernel='linear', C=1):
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=C, random_state=1, gamma='auto', verbose=False)
def get_params(optimc=False):
if not optimc:
return None
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
kernel = 'rbf'
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
def get_method_name(dataset, posteriors, supervised, pretrained, mbert, gru,
gruMUSE, gruWCE, agg, allprob):
_id = '-'
_id_conf = [posteriors, supervised, pretrained, mbert, gru]
_id_name = ['X', 'W', 'M', 'B', 'G']
for i, conf in enumerate(_id_conf):
if conf:
_id += _id_name[i]
_id = _id if not gruMUSE else _id + '_muse'
_id = _id if not gruWCE else _id + '_wce'
_id = _id if not agg else _id + '_mean'
_id = _id if not allprob else _id + '_allprob'
_dataset_path = dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1]
return _id, dataset_id
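A usage sketch of get_method_name (the dataset path is hypothetical; the flag values correspond to running with the -X, -W and -B view generators plus --allprob):
_method, _dataset = get_method_name(
    '/path/to/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle',
    posteriors=True, supervised=True, pretrained=False, mbert=True, gru=False,
    gruMUSE=False, gruWCE=False, agg=False, allprob=True)
print(_method, _dataset)   # -> -XWB_allprob jrcrun0.pickle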

View File

@ -1,12 +1,13 @@
#adapted from https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py #adapted from https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py
import torch import torch
from transformers import BertForSequenceClassification
from time import time from time import time
from util.file import create_if_not_exist from util.file import create_if_not_exist
import warnings import warnings
class EarlyStopping: class EarlyStopping:
def __init__(self, model, optimizer, patience=20, verbose=True, checkpoint='./checkpoint.pt'): def __init__(self, model, optimizer, patience=20, verbose=True, checkpoint='./checkpoint.pt', is_bert=False):
# set patience to 0 or -1 to avoid stopping, but still keeping track of the best value and model parameters # set patience to 0 or -1 to avoid stopping, but still keeping track of the best value and model parameters
self.patience_limit = patience self.patience_limit = patience
self.patience = patience self.patience = patience
@ -18,6 +19,7 @@ class EarlyStopping:
self.model = model self.model = model
self.optimizer = optimizer self.optimizer = optimizer
self.STOP = False self.STOP = False
self.is_bert = is_bert
def __call__(self, watch_score, epoch): def __call__(self, watch_score, epoch):
@ -30,12 +32,17 @@ class EarlyStopping:
self.stop_time = time() self.stop_time = time()
if self.checkpoint: if self.checkpoint:
self.print(f'[early-stop] improved, saving model in {self.checkpoint}') self.print(f'[early-stop] improved, saving model in {self.checkpoint}')
with warnings.catch_warnings(): if self.is_bert:
warnings.simplefilter("ignore") print(f'Serializing Huggingface model...')
torch.save(self.model, self.checkpoint) create_if_not_exist(self.checkpoint)
# with open(self.checkpoint) self.model.save_pretrained(self.checkpoint)
# torch.save({'state_dict': self.model.state_dict(), else:
# 'optimizer_state_dict': self.optimizer.state_dict()}, self.checkpoint) with warnings.catch_warnings():
warnings.simplefilter("ignore")
torch.save(self.model, self.checkpoint)
# with open(self.checkpoint)
# torch.save({'state_dict': self.model.state_dict(),
# 'optimizer_state_dict': self.optimizer.state_dict()}, self.checkpoint)
else: else:
self.print(f'[early-stop] improved') self.print(f'[early-stop] improved')
self.patience = self.patience_limit self.patience = self.patience_limit
@ -54,7 +61,10 @@ class EarlyStopping:
def restore_checkpoint(self): def restore_checkpoint(self):
print(f'restoring best model from epoch {self.best_epoch}...') print(f'restoring best model from epoch {self.best_epoch}...')
return torch.load(self.checkpoint) if self.is_bert:
return BertForSequenceClassification.from_pretrained(self.checkpoint)
else:
return torch.load(self.checkpoint)
def print(self, msg): def print(self, msg):
if self.verbose: if self.verbose:

View File

@ -5,18 +5,21 @@ from sklearn.metrics import f1_score
import numpy as np
import time

def evaluation_metrics(y, y_):
    if len(y.shape)==len(y_.shape)==1 and len(np.unique(y))>2: #single-label
        raise NotImplementedError()#return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro')
    else: #the metrics I implemented assume multiclass multilabel classification as binary classifiers
        return macroF1(y, y_), microF1(y, y_), macroK(y, y_), microK(y, y_)

def soft_evaluation_metrics(y, y_):
    if len(y.shape)==len(y_.shape)==1 and len(np.unique(y))>2: #single-label
        raise NotImplementedError()#return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro')
    else: #the metrics I implemented assume multiclass multilabel classification as binary classifiers
        return smoothmacroF1(y, y_), smoothmicroF1(y, y_), smoothmacroK(y, y_), smoothmicroK(y, y_)

def evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1):
    print('evaluation (n_jobs={})'.format(n_jobs))
    if n_jobs == 1:
@ -26,6 +29,7 @@ def evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1):
    evals = Parallel(n_jobs=n_jobs)(delayed(metrics)(ly_true[lang], ly_pred[lang]) for lang in langs)
    return {lang: evals[i] for i, lang in enumerate(langs)}

def average_results(l_eval, show=True):
    metrics = []
    for lang in l_eval.keys():
@ -60,6 +64,7 @@ def evaluate_method(polylingual_method, lX, ly, predictor=None, soft=False, retu
    else:
        return eval_

def evaluate_single_lang(polylingual_method, X, y, lang, predictor=None, soft=False):
    print('prediction for test in a single language')
    if predictor is None:
@ -72,6 +77,7 @@ def evaluate_single_lang(polylingual_method, X, y, lang, predictor=None, soft=Fa
    ly_ = predictor({lang:X})
    return metrics(y, ly_[lang])

def get_binary_counters(polylingual_method, lX, ly, predictor=None):
    print('prediction for test')
    assert set(lX.keys()) == set(ly.keys()), 'inconsistent dictionaries in evaluate'
@ -87,6 +93,7 @@ def get_binary_counters(polylingual_method, lX, ly, predictor=None):
    evals = Parallel(n_jobs=n_jobs)(delayed(binary_counters)(ly[lang], ly_[lang]) for lang in langs)
    return {lang: evals[i] for i, lang in enumerate(langs)}

def binary_counters(y, y_):
    y = np.reshape(y, (-1))
    assert y.shape==y_.shape and len(y.shape)==1, 'error, binary vector expected'
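
For orientation, a sketch of the expected calling convention: ly_true and ly_pred are assumed to be {language: label-matrix} dictionaries of shape (n_docs, n_classes), as produced elsewhere in the project; evaluate() returns one (macro-F1, micro-F1, macro-K, micro-K) tuple per language:

# Illustrative only; ly_true and ly_pred are assumed to exist.
l_eval = evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1)
for lang, (MF1, mF1, MK, mK) in l_eval.items():
    print(f'{lang}: macro-F1={MF1:.3f} micro-F1={mF1:.3f} macro-K={MK:.3f} micro-K={mK:.3f}')
# average_results(l_eval) would then aggregate the per-language scores.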

View File

@ -0,0 +1,91 @@
from optparse import OptionParser

parser = OptionParser(usage="usage: %prog datapath [options]")

parser.add_option("-d", dest='dataset', type=str, metavar='datasetpath', help=f'path to the pickled dataset')

parser.add_option("-o", "--output", dest="output",
                  help="Result file", type=str, default='../log/multiModal_log.csv')

parser.add_option("-X", "--posteriors", dest="posteriors", action='store_true',
                  help="Add posterior probabilities to the document embedding representation", default=False)

parser.add_option("-W", "--supervised", dest="supervised", action='store_true',
                  help="Add supervised embeddings (Word-Class Embeddings) to the document embedding representation",
                  default=False)

parser.add_option("-M", "--pretrained", dest="pretrained", action='store_true',
                  help="Add pretrained MUSE embeddings to the document embedding representation", default=False)

parser.add_option("-B", "--mbert", dest="mbert", action='store_true',
                  help="Add the multilingual BERT (mBERT) document embedding representation", default=False)

parser.add_option('-G', dest='gruViewGenerator', action='store_true',
                  help="Add document embeddings generated via a recurrent net (GRU)", default=False)

parser.add_option("--l2", dest="l2", action='store_true',
                  help="Activate L2 normalization as a post-processing step for the document embedding views",
                  default=False)

parser.add_option("--allprob", dest="allprob", action='store_true',
                  help="All views are generated as posterior probabilities. This affects the supervised and "
                       "pretrained embeddings, for which a calibrated classifier is fit in order to compute "
                       "the posteriors",
                  default=False)

parser.add_option("--feat-weight", dest="feat_weight",
                  help="Term weighting function to weight the averaged embeddings", type=str, default='tfidf')

parser.add_option("-w", "--we-path", dest="we_path",
                  help="Path to the MUSE polylingual word embeddings", default='../embeddings')

parser.add_option("-s", "--set_c", dest="set_c", type=float,
                  help="Set the C parameter", default=1)

parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
                  help="Optimize hyperparameters", default=False)

parser.add_option("-j", "--n_jobs", dest="n_jobs", type=int,
                  help="Number of parallel jobs (default is -1, i.e., all)", default=-1)

parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
                  help="If smaller than the number of target classes, PCA will be applied to the supervised matrix",
                  default=300)

parser.add_option("-r", "--remove-pc", dest="sif", action='store_true',
                  help="Remove the common component when computing the dot product of word embedding matrices",
                  default=False)

parser.add_option("-z", "--zscore", dest="zscore", action='store_true',
                  help="Z-score normalize matrices (WCE and MUSE)", default=False)

parser.add_option("-a", "--agg", dest="agg", action='store_true',
                  help="Set the aggregation function of the common Z-space to average (default: concatenation)",
                  default=False)

# ------------------------------------------------------------------------------------

parser.add_option('--hidden', type=int, default=512, metavar='int',
                  help='hidden size of the recurrent layer (default: 512)')

parser.add_option('--sup-drop', type=float, default=0.5, metavar='[0.0, 1.0]',
                  help='dropout probability for the supervised matrix (default: 0.5)')

parser.add_option('--tunable', action='store_true', default=False,
                  help='pretrained embeddings are tunable from the beginning (default: False, i.e., static)')

parser.add_option('--logfile_gru', dest='logfile_gru', default='../log/log_gru_viewgenerator.csv')

parser.add_option('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)')

parser.add_option('--force', action='store_true', default=False,
                  help='do not check if this experiment has already been run')

parser.add_option('--gruMuse', dest='gruMUSE', action='store_true', default=False,
                  help='Deploy MUSE embeddings as the embedding layer of the GRU view generator')

parser.add_option('--gruWce', dest='gruWCE', action='store_true', default=False,
                  help='Deploy WCE embeddings as the embedding layer of the GRU view generator')

parser.add_option('--gru-path', dest='gru_path', default=None,
                  help='Set the path to a pretrained GRU model (aka, -G view generator)')

parser.add_option('--bert-path', dest='bert_path', default=None,
                  help='Set the path to a pretrained mBERT model (aka, -B view generator)')
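
As an illustration of how these options would be consumed (the dataset path below is a placeholder, not a real file):

# Illustrative only; not part of the commit.
(op, args) = parser.parse_args([
    '-d', '../datasets/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle',
    '-X', '-W', '-B', '--allprob', '-c'])
print(op.posteriors, op.supervised, op.mbert, op.allprob, op.optimc)  # True True True True True
print(op.pretrained, op.gruViewGenerator)                             # False False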

View File

@ -9,7 +9,7 @@ class StandardizeTransformer:
         self.range = range

     def fit(self, X):
-        print('fitting Standardizer')
+        print('fitting Standardizer...')
         std=np.std(X, axis=self.axis, ddof=1)
         self.std = np.clip(std, 1e-5, None)
         self.mean = np.mean(X, axis=self.axis)
@ -21,7 +21,6 @@ class StandardizeTransformer:
             self.std = ones
             self.mean = zeros
         self.yetfit=True
-        print('done\n')
         return self

     def transform(self, X):
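
Finally, a sketch of the fit/transform protocol this class implements; constructor arguments are assumed to take their defaults (e.g., axis=0, range=None), and the matrix shapes are arbitrary:

# Illustrative only; not part of the commit.
import numpy as np

Ztr = np.random.rand(1000, 512)              # e.g., training documents in the common Z-space
Zte = np.random.rand(200, 512)
standardizer = StandardizeTransformer()      # assuming default constructor arguments
Ztr = standardizer.fit(Ztr).transform(Ztr)   # column-wise zero mean, (clipped) unit std
Zte = standardizer.transform(Zte)            # test data reuses the training statistics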