from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader
from data.tsr_function__ import get_tsr_matrix, get_supervised_matrix, pointwise_mutual_information, information_gain
from embeddings.embeddings import FastTextMUSE
from embeddings.supervised import supervised_embeddings_tfidf, zscores
from learning.learners import NaivePolylingualClassifier, MonolingualClassifier, _joblib_transform_multiling
from sklearn.decomposition import PCA
from scipy.sparse import hstack, csr_matrix
from util_transformers.StandardizeTransformer import StandardizeTransformer
from util.SIF_embed import remove_pc
from sklearn.preprocessing import normalize
from models.mBert import *
from models.lstm_class import *
from util.csv_log import CSVLog
from util.file import get_file_name
from util.early_stop import EarlyStopping
from util.common import *
import time


# ------------------------------------------------------------------
# Data Processing
# ------------------------------------------------------------------

class FeatureWeight:

    def __init__(self, weight='tfidf', agg='mean'):
        assert weight in ['tfidf', 'pmi', 'ig'] or callable(weight), \
            'weight should be one of "tfidf", "pmi", "ig", or a callable function'
        assert agg in ['mean', 'max'], 'aggregation function should either be "mean" or "max"'
        self.weight = weight
        self.agg = agg
        self.fitted = False
        if weight == 'pmi':
            self.weight = pointwise_mutual_information
        elif weight == 'ig':
            self.weight = information_gain

    def fit(self, lX, ly):
        if not self.fitted:
            if self.weight == 'tfidf':
                # tf-idf weighting: leave the matrices untouched (unit weights)
                self.lF = {l: np.ones(X.shape[1]) for l, X in lX.items()}
            else:
                self.lF = {}
                for l in lX.keys():
                    X, y = lX[l], ly[l]

                    print(f'getting supervised cell-matrix lang {l}')
                    # note: 'tsr_score_funtion' is the keyword expected by get_tsr_matrix
                    tsr_matrix = get_tsr_matrix(get_supervised_matrix(X, y), tsr_score_funtion=self.weight)
                    if self.agg == 'max':
                        F = tsr_matrix.max(axis=0)
                    elif self.agg == 'mean':
                        F = tsr_matrix.mean(axis=0)
                    self.lF[l] = F

            self.fitted = True
        return self

    def transform(self, lX):
        return {lang: csr_matrix.multiply(lX[lang], self.lF[lang]) for lang in lX.keys()}

    def fit_transform(self, lX, ly):
        return self.fit(lX, ly).transform(lX)

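# Illustrative sketch (toy data, not executed on import): FeatureWeight rescales each
# column of a (sparse) document-term matrix by a per-term weight; with weight='tfidf'
# the weights are all ones and the matrix is returned unchanged. The language code and
# matrices below are made up for demonstration purposes only.
def _demo_feature_weight():
    import numpy as np
    from scipy.sparse import csr_matrix

    lX = {'en': csr_matrix(np.array([[1., 0., 2.], [0., 3., 0.]]))}   # 2 docs, 3 terms
    ly = {'en': np.array([[1, 0], [0, 1]])}                           # 2 docs, 2 classes
    fw = FeatureWeight(weight='tfidf', agg='mean')
    lXw = fw.fit_transform(lX, ly)
    print(lXw['en'].shape)  # (2, 3): same shape, columns rescaled by unit weights

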

# ------------------------------------------------------------------
# View Generators (aka first-tier learners)
# ------------------------------------------------------------------

class PosteriorProbabilitiesEmbedder:

    def __init__(self, first_tier_learner, first_tier_parameters=None, l2=True, n_jobs=-1):
        self.first_tier_learner = first_tier_learner
        self.first_tier_parameters = first_tier_parameters
        self.l2 = l2
        self.n_jobs = n_jobs
        self.doc_projector = NaivePolylingualClassifier(
            self.first_tier_learner, self.first_tier_parameters, n_jobs=n_jobs
        )
        self.requires_tfidf = True

    def fit(self, lX, lY, lV=None, called_by_viewgen=False):
        if not called_by_viewgen:
            # Avoid printing if the method is called by another View Gen (e.g., the GRU ViewGen)
            print('### Posterior Probabilities View Generator (X)')
            print('fitting the projectors... {}'.format(lX.keys()))
        self.doc_projector.fit(lX, lY)
        return self

    def transform(self, lX):
        lZ = self.predict_proba(lX)
        lZ = _normalize(lZ, self.l2)
        return lZ

    def fit_transform(self, lX, ly=None, lV=None):
        return self.fit(lX, ly).transform(lX)

    def best_params(self):
        return self.doc_projector.best_params()

    def predict(self, lX, ly=None):
        return self.doc_projector.predict(lX)

    def predict_proba(self, lX, ly=None):
        print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} documents')
        return self.doc_projector.predict_proba(lX)

    def _get_output_dim(self):
        # number of classes of (any) one of the language-specific projectors
        lang = next(iter(self.doc_projector.model))
        return len(self.doc_projector.model[lang].model.classes_)

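# Illustrative sketch (hypothetical toy data, plain scikit-learn stand-ins, not executed
# on import): the posterior-probability view maps every document, in every language,
# onto the shared space of class posteriors -- one independently trained classifier per
# language, one probability column per class. This mirrors what
# PosteriorProbabilitiesEmbedder does through NaivePolylingualClassifier.
def _sketch_posterior_probability_view():
    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.multiclass import OneVsRestClassifier

    rng = np.random.RandomState(0)
    lX = {'en': rng.rand(40, 30), 'it': rng.rand(40, 30)}        # toy tf-idf-like features
    ly = {l: (rng.rand(40, 3) > 0.5).astype(int) for l in lX}    # toy multi-label targets
    lZ = {}
    for lang in lX:
        clf = OneVsRestClassifier(LogisticRegression(max_iter=1000))
        clf.fit(lX[lang], ly[lang])
        lZ[lang] = clf.predict_proba(lX[lang])                   # documents -> class posteriors
    print({lang: Z.shape for lang, Z in lZ.items()})             # both languages: (40, 3)

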
class MuseEmbedder:

    def __init__(self, path, lV=None, l2=True, n_jobs=-1, featureweight=FeatureWeight(), sif=False):
        self.path = path
        self.lV = lV
        self.l2 = l2
        self.n_jobs = n_jobs
        self.featureweight = featureweight
        self.sif = sif
        self.requires_tfidf = True

    def fit(self, lX, ly, lV=None):
        assert lV is not None or self.lV is not None, 'lV not specified'
        if lV is None:
            lV = self.lV
        print('### MUSE View Generator (M)')
        print(f'Loading fastText pretrained vectors for languages {list(lX.keys())}...')
        self.langs = sorted(lX.keys())
        self.MUSE = load_muse_embeddings(self.path, self.langs, self.n_jobs)
        lWordList = {l: self._get_wordlist_from_word2index(lV[l]) for l in self.langs}
        self.MUSE = {l: Muse.extract(lWordList[l]).numpy() for l, Muse in self.MUSE.items()}
        self.featureweight.fit(lX, ly)
        return self

    def transform(self, lX):
        MUSE = self.MUSE
        lX = self.featureweight.transform(lX)
        XdotMUSE = Parallel(n_jobs=self.n_jobs)(
            delayed(XdotM)(lX[lang], MUSE[lang], self.sif) for lang in self.langs
        )
        lMuse = {l: XdotMUSE[i] for i, l in enumerate(self.langs)}
        lMuse = _normalize(lMuse, self.l2)
        return lMuse

    def fit_transform(self, lX, ly, lV):
        return self.fit(lX, ly, lV).transform(lX)

    def _get_wordlist_from_word2index(self, word2index):
        return list(zip(*sorted(word2index.items(), key=lambda x: x[1])))[0]

    def _get_output_dim(self):
        # embedding dimensionality of (any) one of the loaded MUSE spaces
        return self.MUSE[self.langs[0]].shape[1]

class WordClassEmbedder:

    def __init__(self, l2=True, n_jobs=-1, max_label_space=300, featureweight=FeatureWeight(), sif=False):
        self.n_jobs = n_jobs
        self.l2 = l2
        self.max_label_space = max_label_space
        self.featureweight = featureweight
        self.sif = sif
        self.requires_tfidf = True

    def fit(self, lX, ly, lV=None):
        print('### WCE View Generator (W)')
        print('Computing supervised embeddings...')
        self.langs = sorted(lX.keys())
        WCE = Parallel(n_jobs=self.n_jobs)(
            delayed(word_class_embedding_matrix)(lX[lang], ly[lang], self.max_label_space) for lang in self.langs
        )
        self.lWCE = {l: WCE[i] for i, l in enumerate(self.langs)}
        self.featureweight.fit(lX, ly)
        return self

    def transform(self, lX):
        lWCE = self.lWCE
        lX = self.featureweight.transform(lX)
        XdotWCE = Parallel(n_jobs=self.n_jobs)(
            delayed(XdotM)(lX[lang], lWCE[lang], self.sif) for lang in self.langs
        )
        lwce = {l: XdotWCE[i] for i, l in enumerate(self.langs)}
        lwce = _normalize(lwce, self.l2)
        return lwce

    def fit_transform(self, lX, ly, lV=None):
        return self.fit(lX, ly).transform(lX)

    def _get_output_dim(self):
        # dimensionality of the word-class embeddings, i.e., min(#classes, max_label_space)
        return self.lWCE[self.langs[0]].shape[1]

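# Illustrative sketch (toy numpy data, not executed on import): a word-class embedding
# (WCE) assigns each term one weight per class; projecting the (weighted) document-term
# matrix onto it, X . WCE, yields one row per document whose dimensions are classes.
# This is the composition performed by WordClassEmbedder via XdotM. The simple
# term-class correlation below is a stand-in, not the project's supervised_embeddings_tfidf.
def _sketch_word_class_embedding():
    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.rand(5, 12)                        # 5 documents over a 12-term vocabulary (tf-idf-like)
    Y = (rng.rand(5, 3) > 0.5).astype(float)   # 5 documents, 3 classes
    WCE = X.T.dot(Y)                           # term-class correlations, shape (12, 3)
    doc_embeddings = X.dot(WCE)                # document embeddings, shape (5, 3)
    print(doc_embeddings.shape)

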
class MBertEmbedder:

    def __init__(self, doc_embed_path=None, patience=10, checkpoint_dir='../hug_checkpoint/', path_to_model=None,
                 nC=None):
        self.doc_embed_path = doc_embed_path
        self.patience = patience
        self.checkpoint_dir = checkpoint_dir
        self.fitted = False
        self.requires_tfidf = False
        if path_to_model is not None and nC is not None:
            # load an already fine-tuned model from disk
            config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True,
                                                num_labels=nC)
            self.model = BertForSequenceClassification.from_pretrained(path_to_model, config=config).cuda()
            self.fitted = True
        else:
            self.model = None

    def fit(self, lX, ly, lV=None, seed=0, nepochs=200, lr=1e-5, val_epochs=1):
        print('### mBERT View Generator (B)')
        if self.fitted is True:
            print('Bert model already fitted!')
            return self

        print('Fine-tuning mBERT on the given dataset.')
        l_tokenized_tr = do_tokenization(lX, max_len=512)
        l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = get_tr_val_split(l_tokenized_tr, ly,
                                                                                         val_prop=0.2, max_val=2000,
                                                                                         seed=seed)

        tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target)
        va_dataset = TrainingDataset(l_split_va, l_split_val_target)
        tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True)
        va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=True)

        nC = tr_dataset.get_nclasses()
        model = get_model(nC)
        model = model.cuda()
        criterion = torch.nn.BCEWithLogitsLoss().cuda()
        optim = init_optimizer(model, lr=lr, weight_decay=0.01)
        lr_scheduler = StepLR(optim, step_size=25, gamma=0.1)
        early_stop = EarlyStopping(model, optimizer=optim, patience=self.patience,
                                   checkpoint=self.checkpoint_dir,
                                   is_bert=True)

        # Training loop
        logfile = '../log/log_mBert_extractor.csv'
        method_name = 'mBert_feature_extractor'

        tinit = time.time()
        lang_ids = va_dataset.lang_ids
        for epoch in range(1, nepochs + 1):
            print('# Start Training ...')
            train(model, tr_dataloader, epoch, criterion, optim, method_name, tinit, logfile)
            lr_scheduler.step()  # reduces the learning rate

            # Validation
            macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va')
            early_stop(macrof1, epoch)

            if early_stop.STOP:
                print('[early-stop] STOP')
                break

        model = early_stop.restore_checkpoint()
        self.model = model.cuda()

        if val_epochs > 0:
            print(f'running last {val_epochs} training epochs on the validation set')
            for val_epoch in range(1, val_epochs + 1):
                train(self.model, va_dataloader, epoch + val_epoch, criterion, optim, method_name, tinit, logfile)

        self.fitted = True
        return self

    def transform(self, lX):
        assert self.fitted is True, 'Calling transform without any initialized model! Call fit first, or pass ' \
                                    'the "path_to_model" arg at construction time.'
        print('Obtaining document embeddings from pretrained mBERT')
        l_tokenized_X = do_tokenization(lX, max_len=512, verbose=True)
        feat_dataset = ExtractorDataset(l_tokenized_X)
        feat_lang_ids = feat_dataset.lang_ids
        dataloader = DataLoader(feat_dataset, batch_size=64)
        all_batch_embeddings, id2lang = feature_extractor(dataloader, feat_lang_ids, self.model)
        return all_batch_embeddings

    def fit_transform(self, lX, ly, lV=None):
        return self.fit(lX, ly).transform(lX)

class RecurrentEmbedder:

    def __init__(self, pretrained, supervised, multilingual_dataset, options, concat=False, lr=1e-3,
                 we_path='../embeddings', hidden_size=512, sup_drop=0.5, posteriors=False, patience=10,
                 test_each=0, checkpoint_dir='../checkpoint', model_path=None):
        self.pretrained = pretrained
        self.supervised = supervised
        self.concat = concat
        self.requires_tfidf = False
        self.multilingual_dataset = multilingual_dataset
        self.model = None
        self.we_path = we_path
        self.langs = multilingual_dataset.langs()
        self.hidden_size = hidden_size
        self.sup_drop = sup_drop
        self.posteriors = posteriors
        self.patience = patience
        self.checkpoint_dir = checkpoint_dir
        self.test_each = test_each
        self.options = options
        self.seed = options.seed
        self.is_trained = False

        # init model for training
        self.lXtr, self.lytr = self.multilingual_dataset.training(target_as_csr=True)
        self.lXte, self.lyte = self.multilingual_dataset.test(target_as_csr=True)
        self.nC = self.lyte[self.langs[0]].shape[1]
        lpretrained, lpretrained_vocabulary = self._load_pretrained_embeddings(self.we_path, self.langs)
        self.multilingual_index = MultilingualIndex()
        self.multilingual_index.index(self.lXtr, self.lytr, self.lXte, lpretrained_vocabulary)
        self.multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=self.seed)
        self.multilingual_index.embedding_matrices(lpretrained, self.supervised)

        if model_path is not None:
            self.is_trained = True
            self.model = torch.load(model_path)
        else:
            self.model = self._init_Net()

        self.optim = init_optimizer(self.model, lr=lr)
        self.criterion = torch.nn.BCEWithLogitsLoss().cuda()
        self.lr_scheduler = StepLR(self.optim, step_size=25, gamma=0.5)
        self.early_stop = EarlyStopping(self.model, optimizer=self.optim, patience=self.patience,
                                        checkpoint=f'{self.checkpoint_dir}/gru_viewgen_-{get_file_name(self.options.dataset)}')
        # Init an SVM in order to recast the (vstacked) document embeddings as vectors of posterior probabilities
        self.posteriorEmbedder = MetaClassifier(
            SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=options.n_jobs)

    def fit(self, lX, ly, lV=None, batch_size=64, nepochs=2, val_epochs=1):
        print('### Gated Recurrent Unit View Generator (G)')
        # could be better to init the model here, at the first .fit() call!
        if self.model is None:
            print('TODO: Init model!')
        if not self.is_trained:
            # Batchify input
            self.multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=self.seed)
            l_train_index, l_train_target = self.multilingual_index.l_train()
            l_val_index, l_val_target = self.multilingual_index.l_val()
            l_test_index = self.multilingual_index.l_test_index()
            batcher_train = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs,
                                     lpad=self.multilingual_index.l_pad())
            batcher_eval = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs,
                                    lpad=self.multilingual_index.l_pad())

            # Train loop
            print('Start training')
            method_name = 'gru_view_generator'
            logfile = init_logfile_nn(method_name, self.options)
            tinit = time.time()
            for epoch in range(1, nepochs + 1):
                train_gru(model=self.model, batcher=batcher_train, ltrain_index=l_train_index, lytr=l_train_target,
                          tinit=tinit, logfile=logfile, criterion=self.criterion, optim=self.optim,
                          epoch=epoch, method_name=method_name, opt=self.options, ltrain_posteriors=None,
                          ltrain_bert=None)
                self.lr_scheduler.step()  # reduces the learning rate

                # validation step
                macrof1 = test_gru(self.model, batcher_eval, l_val_index, None, None, l_val_target, tinit, epoch,
                                   logfile, self.criterion, 'va')

                self.early_stop(macrof1, epoch)
                if self.test_each > 0:
                    test_gru(self.model, batcher_eval, l_test_index, None, None, self.lyte, tinit, epoch,
                             logfile, self.criterion, 'te')

                if self.early_stop.STOP:
                    print('[early-stop] STOP')
                    print('Restoring best model...')
                    break

            self.model = self.early_stop.restore_checkpoint()
            print(f'running last {val_epochs} training epochs on the validation set')
            for val_epoch in range(1, val_epochs + 1):
                batcher_train.init_offset()
                train_gru(model=self.model, batcher=batcher_train, ltrain_index=l_train_index, lytr=l_train_target,
                          tinit=tinit, logfile=logfile, criterion=self.criterion, optim=self.optim,
                          epoch=epoch, method_name=method_name, opt=self.options, ltrain_posteriors=None,
                          ltrain_bert=None)
            self.is_trained = True

        # Generate document embeddings in order to fit an SVM that recasts them as vectors of posterior probabilities
        lX = self._get_doc_embeddings(lX)
        # Fit a ''multi-lingual'' SVM on the generated doc embeddings
        self.posteriorEmbedder.fit(lX, ly)
        return self

    def transform(self, lX, batch_size=64):
        lX = self._get_doc_embeddings(lX)
        return self.posteriorEmbedder.predict_proba(lX)

    def fit_transform(self, lX, ly, lV=None):
        return self.fit(lX, ly, lV).transform(lX)

    def _get_doc_embeddings(self, lX, batch_size=64):
        assert self.is_trained, 'Model is not trained, cannot call transform before fitting the model!'
        print('Generating document embeddings via GRU')
        data = {}
        for lang in lX.keys():
            indexed = index(data=lX[lang],
                            vocab=self.multilingual_index.l_index[lang].word2index,
                            known_words=set(self.multilingual_index.l_index[lang].word2index.keys()),
                            analyzer=self.multilingual_index.l_vectorizer.get_analyzer(lang),
                            unk_index=self.multilingual_index.l_index[lang].unk_index,
                            out_of_vocabulary=self.multilingual_index.l_index[lang].out_of_vocabulary)
            data[lang] = indexed

        lX = {}
        ly = {}
        batcher_transform = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs,
                                     lpad=self.multilingual_index.l_pad())

        l_devel_target = self.multilingual_index.l_devel_target()

        for idx, (batch, post, bert_emb, target, lang) in enumerate(
                batcher_transform.batchify(data, None, None, l_devel_target)):
            if lang not in lX.keys():
                lX[lang] = self.model.get_embeddings(batch, lang)
                ly[lang] = target.cpu().detach().numpy()
            else:
                lX[lang] = np.concatenate((lX[lang], self.model.get_embeddings(batch, lang)), axis=0)
                ly[lang] = np.concatenate((ly[lang], target.cpu().detach().numpy()), axis=0)

        return lX

    # loads the MUSE embeddings and their vocabularies for the requested languages
    def _load_pretrained_embeddings(self, we_path, langs):
        lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1)
        lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs}
        return lpretrained, lpretrained_vocabulary

    def _none_dict(self, langs):
        return {l: None for l in langs}

    # instantiates the net, initializes the model parameters, and sets embeddings trainable if requested
    def _init_Net(self, xavier_uniform=True):
        model = RNNMultilingualClassifier(
            output_size=self.nC,
            hidden_size=self.hidden_size,
            lvocab_size=self.multilingual_index.l_vocabsize(),
            learnable_length=0,
            lpretrained=self.multilingual_index.l_embeddings(),
            drop_embedding_range=self.multilingual_index.sup_range,
            drop_embedding_prop=self.sup_drop,
            post_probabilities=self.posteriors
        )
        return model.cuda()

class DocEmbedderList:

    def __init__(self, *embedder_list, aggregation='concat'):
        assert aggregation in {'concat', 'mean'}, 'unknown aggregation mode, valid are "concat" and "mean"'
        self.embedders = list(embedder_list)
        self.aggregation = aggregation
        print(f'Aggregation mode: {self.aggregation}')

    def fit(self, lX, ly, lV=None, tfidf=None):
        for transformer in self.embedders:
            _lX = lX
            if transformer.requires_tfidf:
                _lX = tfidf
            transformer.fit(_lX, ly, lV)
        return self

    def transform(self, lX, tfidf=None):
        if self.aggregation == 'concat':
            return self.transform_concat(lX, tfidf)
        elif self.aggregation == 'mean':
            return self.transform_mean(lX, tfidf)

    def transform_concat(self, lX, tfidf):
        if len(self.embedders) == 1:
            if self.embedders[0].requires_tfidf:
                lX = tfidf
            return self.embedders[0].transform(lX)

        some_sparse = False
        langs = sorted(lX.keys())

        lZparts = {l: [] for l in langs}
        for transformer in self.embedders:
            _lX = lX
            if transformer.requires_tfidf:
                _lX = tfidf
            lZ = transformer.transform(_lX)
            for l in langs:
                Z = lZ[l]
                some_sparse = some_sparse or issparse(Z)
                lZparts[l].append(Z)

        hstacker = hstack if some_sparse else np.hstack
        return {l: hstacker(lZparts[l]) for l in langs}

    def transform_mean(self, lX, tfidf):
        if len(self.embedders) == 1:
            return self.embedders[0].transform(lX)

        langs = sorted(lX.keys())

        lZparts = {l: None for l in langs}

        # min_dim = min([transformer._get_output_dim() for transformer in self.embedders])
        min_dim = 73  # TODO <---- this should be the number of target classes

        for transformer in self.embedders:
            _lX = lX
            if transformer.requires_tfidf:
                _lX = tfidf
            lZ = transformer.transform(_lX)
            nC = min([lZ[lang].shape[1] for lang in langs])
            for l in langs:
                Z = lZ[l]
                if Z.shape[1] > min_dim:
                    print(
                        f'Space Z matrix has more dimensions ({Z.shape[1]}) than the smallest representation {min_dim}. '
                        f'Applying PCA(n_components={min_dim})')
                    pca = PCA(n_components=min_dim)
                    Z = pca.fit(Z).transform(Z)
                if lZparts[l] is None:
                    lZparts[l] = Z
                else:
                    lZparts[l] += Z

        n_transformers = len(self.embedders)

        return {l: lZparts[l] / n_transformers for l in langs}

    def fit_transform(self, lX, ly, lV=None, tfidf=None):
        return self.fit(lX, ly, lV, tfidf).transform(lX, tfidf)

    def best_params(self):
        return {'todo'}

    def append(self, embedder):
        self.embedders.append(embedder)

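# Illustrative sketch (toy numpy data, not executed on import): with aggregation='concat'
# the per-view document matrices are horizontally stacked, so the joint representation
# has the sum of the view dimensionalities; with 'mean' they are first brought to a
# common dimensionality and then averaged. The matrices below are made up.
def _sketch_view_aggregation():
    import numpy as np

    Z_posteriors = np.random.rand(10, 3)     # e.g., posterior probabilities (3 classes)
    Z_muse = np.random.rand(10, 300)         # e.g., MUSE-based document embeddings
    Z_concat = np.hstack([Z_posteriors, Z_muse])
    print(Z_concat.shape)                    # (10, 303)

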
class FeatureSet2Posteriors:

    def __init__(self, transformer, requires_tfidf=False, l2=True, n_jobs=-1):
        self.transformer = transformer
        self.l2 = l2
        self.n_jobs = n_jobs
        self.prob_classifier = MetaClassifier(
            SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
        self.requires_tfidf = requires_tfidf

    def fit(self, lX, ly, lV=None):
        if lV is None and hasattr(self.transformer, 'lV'):
            lV = self.transformer.lV
        lZ = self.transformer.fit_transform(lX, ly, lV)
        self.prob_classifier.fit(lZ, ly)
        return self

    def transform(self, lX):
        lP = self.predict_proba(lX)
        lP = _normalize(lP, self.l2)
        return lP

    def fit_transform(self, lX, ly, lV):
        return self.fit(lX, ly, lV).transform(lX)

    def predict(self, lX, ly=None):
        lZ = self.transformer.transform(lX)
        return self.prob_classifier.predict(lZ)

    def predict_proba(self, lX, ly=None):
        lZ = self.transformer.transform(lX)
        return self.prob_classifier.predict_proba(lZ)


# ------------------------------------------------------------------
# Meta-Classifier (aka second-tier learner)
# ------------------------------------------------------------------

class MetaClassifier:

    def __init__(self, meta_learner, meta_parameters=None, n_jobs=-1, standardize_range=None):
        self.n_jobs = n_jobs
        self.model = MonolingualClassifier(base_learner=meta_learner, parameters=meta_parameters, n_jobs=n_jobs)
        self.standardize_range = standardize_range

    def fit(self, lZ, ly):
        tinit = time.time()
        Z, y = self.stack(lZ, ly)

        self.standardizer = StandardizeTransformer(range=self.standardize_range)
        Z = self.standardizer.fit_transform(Z)

        print('fitting the Z-space of shape={}'.format(Z.shape))
        self.model.fit(Z, y)
        self.time = time.time() - tinit

    def stack(self, lZ, ly=None):
        langs = list(lZ.keys())
        Z = np.vstack([lZ[lang] for lang in langs])  # Z is the language-independent space
        if ly is not None:
            y = np.vstack([ly[lang] for lang in langs])
            return Z, y
        else:
            return Z

    def predict(self, lZ, ly=None):
        lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs)
        return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)

    def predict_proba(self, lZ, ly=None):
        lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs)
        return _joblib_transform_multiling(self.model.predict_proba, lZ, n_jobs=self.n_jobs)

    def best_params(self):
        return self.model.best_params()


# ------------------------------------------------------------------
# Ensembling (aka Funnelling)
# ------------------------------------------------------------------

class Funnelling:

    def __init__(self,
                 vectorizer: TfidfVectorizerMultilingual,
                 first_tier: DocEmbedderList,
                 meta: MetaClassifier):
        self.vectorizer = vectorizer
        self.first_tier = first_tier
        self.meta = meta
        self.n_jobs = meta.n_jobs

    def fit(self, lX, ly):
        tfidf_lX = self.vectorizer.fit_transform(lX, ly)
        lV = self.vectorizer.vocabulary()
        print('## Fitting first-tier learners!')
        lZ = self.first_tier.fit_transform(lX, ly, lV, tfidf=tfidf_lX)
        print('## Fitting meta-learner!')
        self.meta.fit(lZ, ly)

    def predict(self, lX, ly=None):
        tfidf_lX = self.vectorizer.transform(lX)
        lZ = self.first_tier.transform(lX, tfidf=tfidf_lX)
        ly_ = self.meta.predict(lZ)
        return ly_

    def best_params(self):
        return {'1st-tier': self.first_tier.best_params(),
                'meta': self.meta.best_params()}

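# Illustrative end-to-end usage sketch (not executed on import): a funnelled classifier
# is assembled from the multilingual tf-idf vectorizer, a list of first-tier view
# generators, and an SVM-based meta-classifier. 'lX_tr', 'ly_tr' and 'lX_te' are
# hypothetical {language: documents/labels} dictionaries; the no-argument construction of
# TfidfVectorizerMultilingual and the choice of a single posterior-probability view are
# assumptions made for the sake of the example, not the only valid configuration.
def _sketch_funnelling_pipeline(lX_tr, ly_tr, lX_te):
    from sklearn.svm import SVC

    vectorizer = TfidfVectorizerMultilingual()
    first_tier = DocEmbedderList(
        PosteriorProbabilitiesEmbedder(first_tier_learner=SVC(kernel='linear', probability=True)),
        aggregation='concat'
    )
    meta = MetaClassifier(
        meta_learner=SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1))
    classifier = Funnelling(vectorizer=vectorizer, first_tier=first_tier, meta=meta)
    classifier.fit(lX_tr, ly_tr)
    return classifier.predict(lX_te)

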
class Voting:

    def __init__(self, *prob_classifiers):
        assert all([hasattr(p, 'predict_proba') for p in prob_classifiers]), 'not all classifiers are probabilistic'
        self.prob_classifiers = prob_classifiers

    def fit(self, lX, ly, lV=None):
        for classifier in self.prob_classifiers:
            classifier.fit(lX, ly, lV)

    def predict(self, lX, ly=None):
        lP = {l: [] for l in lX.keys()}
        for classifier in self.prob_classifiers:
            lPi = classifier.predict_proba(lX)
            for l in lX.keys():
                lP[l].append(lPi[l])

        lP = {l: np.stack(Plist).mean(axis=0) for l, Plist in lP.items()}
        ly = {l: P > 0.5 for l, P in lP.items()}

        return ly


# ------------------------------------------------------------------------------
# HELPERS
# ------------------------------------------------------------------------------

def load_muse_embeddings(we_path, langs, n_jobs=-1):
    MUSE = Parallel(n_jobs=n_jobs)(
        delayed(FastTextMUSE)(we_path, lang) for lang in langs
    )
    return {l: MUSE[i] for i, l in enumerate(langs)}

def word_class_embedding_matrix(X, Y, max_label_space=300):
    WCE = supervised_embeddings_tfidf(X, Y)
    WCE = zscores(WCE, axis=0)

    nC = Y.shape[1]
    if nC > max_label_space:
        print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
              f'Applying PCA(n_components={max_label_space})')
        pca = PCA(n_components=max_label_space)
        WCE = pca.fit(WCE).transform(WCE)

    return WCE

def XdotM(X, M, sif):
    E = X.dot(M)
    if sif:
        print("removing pc...")
        E = remove_pc(E, npc=1)
    return E


def _normalize(lX, l2=True):
    return {l: normalize(X) for l, X in lX.items()} if l2 else lX

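# Illustrative sketch (toy numpy data, not executed on import): XdotM composes a
# document-term matrix with a term-embedding matrix and, when sif=True, removes the
# first principal component of the resulting document embeddings (SIF-style);
# _normalize then L2-normalizes each language's rows. The shapes below are arbitrary.
def _sketch_xdotm_and_normalize():
    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.rand(8, 20)         # 8 documents over a 20-term vocabulary
    M = rng.rand(20, 16)        # 20 term embeddings of dimension 16
    E = XdotM(X, M, sif=True)   # (8, 16) document embeddings, first pc removed
    lE = _normalize({'en': E}, l2=True)
    print(np.linalg.norm(lE['en'], axis=1))  # ~1.0 for every row

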
class BatchGRU:

    def __init__(self, batchsize, batches_per_epoch, languages, lpad, max_pad_length=500):
        self.batchsize = batchsize
        self.batches_per_epoch = batches_per_epoch
        self.languages = languages
        self.lpad = lpad
        self.max_pad_length = max_pad_length
        self.init_offset()

    def init_offset(self):
        self.offset = {lang: 0 for lang in self.languages}

    def batchify(self, l_index, l_post, l_bert, llabels):
        langs = self.languages
        l_num_samples = {l: len(l_index[l]) for l in langs}

        max_samples = max(l_num_samples.values())
        n_batches = max_samples // self.batchsize + 1 * (max_samples % self.batchsize > 0)
        if self.batches_per_epoch != -1 and self.batches_per_epoch < n_batches:
            n_batches = self.batches_per_epoch

        for b in range(n_batches):
            for lang in langs:
                index, labels = l_index[lang], llabels[lang]
                offset = self.offset[lang]
                if offset >= l_num_samples[lang]:
                    offset = 0
                limit = offset + self.batchsize

                batch_slice = slice(offset, limit)
                batch = index[batch_slice]
                batch_labels = labels[batch_slice].toarray()

                post = None
                bert_emb = None

                batch = pad(batch, pad_index=self.lpad[lang], max_pad_length=self.max_pad_length)
                batch = torch.LongTensor(batch).cuda()
                target = torch.FloatTensor(batch_labels).cuda()

                self.offset[lang] = limit

                yield batch, post, bert_emb, target, lang

def pad(index_list, pad_index, max_pad_length=None):
    pad_length = np.max([len(index) for index in index_list])
    if max_pad_length is not None:
        pad_length = min(pad_length, max_pad_length)
    for i, indexes in enumerate(index_list):
        index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length]
    return index_list

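# Illustrative example (toy index lists, not executed on import): pad() left-pads every
# sequence in the batch with the padding index up to the longest (or max_pad_length)
# sequence, truncating longer ones, so the batch can be packed into a single tensor.
def _demo_pad():
    batch = [[5, 8], [3, 4, 7, 9], [1]]
    padded = pad(batch, pad_index=0, max_pad_length=3)
    print(padded)  # [[0, 5, 8], [3, 4, 7], [0, 0, 1]]

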
def train_gru(model, batcher, ltrain_index, lytr, tinit, logfile, criterion, optim, epoch, method_name, opt,
              ltrain_posteriors=None, ltrain_bert=None, log_interval=10):
    _dataset_path = opt.dataset.split('/')[-1].split('_')
    dataset_id = _dataset_path[0] + _dataset_path[-1]

    loss_history = []
    model.train()
    for idx, (batch, post, bert_emb, target, lang) in enumerate(
            batcher.batchify(ltrain_index, ltrain_posteriors, ltrain_bert, lytr)):
        optim.zero_grad()
        loss = criterion(model(batch, post, bert_emb, lang), target)
        loss.backward()
        clip_gradient(model)
        optim.step()
        loss_history.append(loss.item())

        if idx % log_interval == 0:
            interval_loss = np.mean(loss_history[-log_interval:])
            print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, '
                  f'Training Loss: {interval_loss:.6f}')

    mean_loss = np.mean(loss_history)
    logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time.time() - tinit)
    return mean_loss

def test_gru(model, batcher, ltest_index, ltest_posteriors, lte_bert, lyte, tinit, epoch, logfile, criterion,
             measure_prefix):
    loss_history = []
    model.eval()
    langs = sorted(ltest_index.keys())
    predictions = {l: [] for l in langs}
    yte_stacked = {l: [] for l in langs}
    batcher.init_offset()
    for batch, post, bert_emb, target, lang in tqdm(batcher.batchify(ltest_index, ltest_posteriors, lte_bert, lyte),
                                                    desc='evaluation: '):
        logits = model(batch, post, bert_emb, lang)
        loss = criterion(logits, target).item()
        prediction = predict(logits)
        predictions[lang].append(prediction)
        yte_stacked[lang].append(target.detach().cpu().numpy())
        loss_history.append(loss)

    ly = {l: np.vstack(yte_stacked[l]) for l in langs}
    ly_ = {l: np.vstack(predictions[l]) for l in langs}
    l_eval = evaluate(ly, ly_)
    metrics = []
    for lang in langs:
        macrof1, microf1, macrok, microk = l_eval[lang]
        metrics.append([macrof1, microf1, macrok, microk])
        if measure_prefix == 'te':
            print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
    Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
    print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')

    mean_loss = np.mean(loss_history)
    logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time.time() - tinit)
    logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time.time() - tinit)
    logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time.time() - tinit)
    logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time.time() - tinit)
    logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time.time() - tinit)

    return Mf1

def clip_gradient(model, clip_value=1e-1):
    params = list(filter(lambda p: p.grad is not None, model.parameters()))
    for p in params:
        p.grad.data.clamp_(-clip_value, clip_value)

def init_logfile_nn(method_name, opt):
    logfile = CSVLog(opt.logfile_gru, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse'])
    logfile.set_default('dataset', opt.dataset)
    logfile.set_default('run', opt.seed)
    logfile.set_default('method', method_name)
    assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} ' \
                                                          f'and run {opt.seed} already calculated'
    return logfile