diff --git a/src/dataset_builder.py b/src/dataset_builder.py index 454fea6..b9650c7 100644 --- a/src/dataset_builder.py +++ b/src/dataset_builder.py @@ -121,11 +121,10 @@ class MultilingualDataset: print("Lang {}, Xtr={}, ytr={}, Xte={}, yte={}".format(lang, shape(Xtr), self.cat_view(Ytr).shape, shape(Xte), self.cat_view(Yte).shape)) def show_category_prevalences(self): - #pass nC = self.num_categories() accum_tr = np.zeros(nC, dtype=np.int) accum_te = np.zeros(nC, dtype=np.int) - in_langs = np.zeros(nC, dtype=np.int) #count languages with at least one positive example (per category) + in_langs = np.zeros(nC, dtype=np.int) # count languages with at least one positive example (per category) for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items(): if lang not in self.langs(): continue prev_train = np.sum(self.cat_view(Ytr), axis=0) diff --git a/src/embeddings/embeddings.py b/src/embeddings/embeddings.py index 59a87a1..27367e9 100644 --- a/src/embeddings/embeddings.py +++ b/src/embeddings/embeddings.py @@ -47,7 +47,6 @@ class FastTextWikiNews(Vectors): class FastTextMUSE(PretrainedEmbeddings): def __init__(self, path, lang, limit=None): super().__init__() - print(f'Loading fastText pretrained vectors for language {lang} from {path}') assert os.path.exists(path), print(f'pre-trained vectors not found in {path}') self.embed = FastTextWikiNews(path, lang, max_vectors=limit) diff --git a/src/extract_features.sh b/src/extract_features.sh new file mode 100644 index 0000000..d0bd3ac --- /dev/null +++ b/src/extract_features.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run# + +runs='1 2 3 4 5 6 7 8 9' +for run in $runs +do + dataset=$dataset_path$run.pickle + modelpath=/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run$run + python main_mbert_extractor.py --dataset $dataset --modelpath $modelpath +done + +dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle +python main_mbert_extractor.py --dataset $dataset --modelpath $modelpath \ No newline at end of file diff --git a/src/learning/learners.py b/src/learning/learners.py index 8d82b48..89e3830 100644 --- a/src/learning/learners.py +++ b/src/learning/learners.py @@ -133,7 +133,8 @@ class MonolingualClassifier: self.model = OneVsRestClassifier(self.learner, n_jobs=self.n_jobs) else: self.model = self.learner - raise NotImplementedError('not working as a base-classifier for funneling if there are gaps in the labels across languages') + raise NotImplementedError('not working as a base-classifier for funneling if there are gaps in ' + 'the labels across languages') # parameter optimization?
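        # If a parameter grid is given, the model (or its one-vs-rest wrapper) is tuned with a 5-fold
        # GridSearchCV: refit=True retrains the best configuration on the whole training set, and
        # error_score=0 scores failing configurations as 0 instead of raising.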
if self.parameters: @@ -141,7 +142,8 @@ class MonolingualClassifier: self.model = GridSearchCV(self.model, param_grid=self.parameters, refit=True, cv=5, n_jobs=self.n_jobs, error_score=0, verbose=10) - print(f'fitting: {self.model} on matrices of shape X={X.shape} Y={y.shape}') + # print(f'fitting: {self.model} on matrices of shape X={X.shape} Y={y.shape}') + print(f'fitting: Mono-lingual Classifier on matrices of shape X={X.shape} Y={y.shape}') self.model.fit(X, y) if isinstance(self.model, GridSearchCV): self.best_params_ = self.model.best_params_ diff --git a/src/learning/transformers.py b/src/learning/transformers.py index 29d35c8..c669389 100644 --- a/src/learning/transformers.py +++ b/src/learning/transformers.py @@ -1,65 +1,39 @@ -import numpy as np -from sklearn.feature_extraction.text import TfidfVectorizer -#from data.text_preprocessor import NLTKStemTokenizer -from data.tsr_function__ import get_tsr_matrix, get_supervised_matrix, pointwise_mutual_information, information_gain, \ - gain_ratio, gss +from torch.optim.lr_scheduler import StepLR +from torch.utils.data import DataLoader +from data.tsr_function__ import get_tsr_matrix, get_supervised_matrix, pointwise_mutual_information, information_gain from embeddings.embeddings import FastTextMUSE from embeddings.supervised import supervised_embeddings_tfidf, zscores from learning.learners import NaivePolylingualClassifier, MonolingualClassifier, _joblib_transform_multiling -import time from sklearn.decomposition import PCA -from joblib import Parallel, delayed -from scipy.sparse import issparse, vstack, hstack +from scipy.sparse import hstack from util_transformers.StandardizeTransformer import StandardizeTransformer from util.SIF_embed import remove_pc from sklearn.preprocessing import normalize -from sklearn.svm import SVC from scipy.sparse import csr_matrix +from models.mBert import * +from models.lstm_class import * +from util.csv_log import CSVLog +from util.file import get_file_name +from util.early_stop import EarlyStopping +from util.common import * +import time + # ------------------------------------------------------------------ # Data Processing # ------------------------------------------------------------------ -class TfidfVectorizerMultilingual: - - def __init__(self, **kwargs): - self.kwargs=kwargs - - def fit(self, lX, ly=None): - self.langs = sorted(lX.keys()) - self.vectorizer={l:TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs} - # tokenizer=NLTKStemTokenizer(l, verbose=True), - return self - - def transform(self, lX): - return {l:self.vectorizer[l].transform(lX[l]) for l in self.langs} - - def fit_transform(self, lX, ly=None): - return self.fit(lX,ly).transform(lX) - - def vocabulary(self, l=None): - if l is None: - return {l:self.vectorizer[l].vocabulary_ for l in self.langs} - else: - return self.vectorizer[l].vocabulary_ - - def get_analyzer(self, l=None): - if l is None: - return {l:self.vectorizer[l].build_analyzer() for l in self.langs} - else: - return self.vectorizer[l].build_analyzer() - - class FeatureWeight: def __init__(self, weight='tfidf', agg='mean'): - assert weight in ['tfidf', 'pmi', 'ig'] or callable(weight), 'weight should either be "tfidf" or a callable function' + assert weight in ['tfidf', 'pmi', 'ig'] or callable( + weight), 'weight should either be "tfidf" or a callable function' assert agg in ['mean', 'max'], 'aggregation function should either be "mean" or "max"' self.weight = weight self.agg = agg self.fitted = False - if weight=='pmi': + if weight == 'pmi': self.weight 
= pointwise_mutual_information elif weight == 'ig': self.weight = information_gain @@ -91,8 +65,10 @@ class FeatureWeight: return self.fit(lX, ly).transform(lX) # ------------------------------------------------------------------ -# Document Embeddings +# View Generators (aka first-tier learners) # ------------------------------------------------------------------ + + class PosteriorProbabilitiesEmbedder: def __init__(self, first_tier_learner, first_tier_parameters=None, l2=True, n_jobs=-1): @@ -103,9 +79,13 @@ class PosteriorProbabilitiesEmbedder: self.doc_projector = NaivePolylingualClassifier( self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs ) + self.requires_tfidf = True - def fit(self, lX, lY, lV=None): - print('fitting the projectors... {}'.format(lX.keys())) + def fit(self, lX, lY, lV=None, called_by_viewgen=False): + if not called_by_viewgen: + # Avoid printing if method is called by another View Gen (e.g., GRU ViewGen) + print('### Posterior Probabilities View Generator (X)') + print('fitting the projectors... {}'.format(lX.keys())) self.doc_projector.fit(lX, lY) return self @@ -124,7 +104,7 @@ class PosteriorProbabilitiesEmbedder: return self.doc_projector.predict(lX) def predict_proba(self, lX, ly=None): - print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} the documents') + print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} documents') return self.doc_projector.predict_proba(lX) def _get_output_dim(self): @@ -134,19 +114,22 @@ class PosteriorProbabilitiesEmbedder: class MuseEmbedder: def __init__(self, path, lV=None, l2=True, n_jobs=-1, featureweight=FeatureWeight(), sif=False): - self.path=path + self.path = path self.lV = lV self.l2 = l2 self.n_jobs = n_jobs self.featureweight = featureweight self.sif = sif + self.requires_tfidf = True def fit(self, lX, ly, lV=None): assert lV is not None or self.lV is not None, 'lV not specified' + print('### MUSE View Generator (M)') + print(f'Loading fastText pretrained vectors for languages {list(lX.keys())}...') self.langs = sorted(lX.keys()) self.MUSE = load_muse_embeddings(self.path, self.langs, self.n_jobs) - lWordList = {l:self._get_wordlist_from_word2index(lV[l]) for l in self.langs} - self.MUSE = {l:Muse.extract(lWordList[l]).numpy() for l,Muse in self.MUSE.items()} + lWordList = {l: self._get_wordlist_from_word2index(lV[l]) for l in self.langs} + self.MUSE = {l: Muse.extract(lWordList[l]).numpy() for l, Muse in self.MUSE.items()} self.featureweight.fit(lX, ly) return self @@ -175,16 +158,19 @@ class WordClassEmbedder: def __init__(self, l2=True, n_jobs=-1, max_label_space=300, featureweight=FeatureWeight(), sif=False): self.n_jobs = n_jobs self.l2 = l2 - self.max_label_space=max_label_space + self.max_label_space = max_label_space self.featureweight = featureweight self.sif = sif + self.requires_tfidf = True def fit(self, lX, ly, lV=None): + print('### WCE View Generator (M)') + print('Computing supervised embeddings...') self.langs = sorted(lX.keys()) WCE = Parallel(n_jobs=self.n_jobs)( delayed(word_class_embedding_matrix)(lX[lang], ly[lang], self.max_label_space) for lang in self.langs ) - self.lWCE = {l:WCE[i] for i,l in enumerate(self.langs)} + self.lWCE = {l: WCE[i] for i, l in enumerate(self.langs)} self.featureweight.fit(lX, ly) return self @@ -192,7 +178,7 @@ class WordClassEmbedder: lWCE = self.lWCE lX = self.featureweight.transform(lX) XdotWCE = Parallel(n_jobs=self.n_jobs)( - delayed(XdotM)(lX[lang], lWCE[lang], self.sif)for lang in 
self.langs + delayed(XdotM)(lX[lang], lWCE[lang], self.sif) for lang in self.langs ) lwce = {l: XdotWCE[i] for i, l in enumerate(self.langs)} lwce = _normalize(lwce, self.l2) @@ -202,31 +188,284 @@ class WordClassEmbedder: return self.fit(lX, ly).transform(lX) def _get_output_dim(self): - return 73 + return 73 # TODO ! + + +class MBertEmbedder: + + def __init__(self, doc_embed_path=None, patience=10, checkpoint_dir='../hug_checkpoint/', path_to_model=None, + nC=None): + self.doc_embed_path = doc_embed_path + self.patience = patience + self.checkpoint_dir = checkpoint_dir + self.fitted = False + self.requires_tfidf = False + if path_to_model is None and nC is not None: + self.model = None + else: + config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True, + num_labels=nC) + self.model = BertForSequenceClassification.from_pretrained(path_to_model, config=config).cuda() + self.fitted = True + + def fit(self, lX, ly, lV=None, seed=0, nepochs=200, lr=1e-5, val_epochs=1): + print('### mBERT View Generator (B)') + if self.fitted is True: + print('Bert model already fitted!') + return self + + print('Fine-tune mBert on the given dataset.') + l_tokenized_tr = do_tokenization(lX, max_len=512) + l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = get_tr_val_split(l_tokenized_tr, ly, + val_prop=0.2, max_val=2000, + seed=seed) # TODO: seed + + tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target) + va_dataset = TrainingDataset(l_split_va, l_split_val_target) + tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True) + va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=True) + + nC = tr_dataset.get_nclasses() + model = get_model(nC) + model = model.cuda() + criterion = torch.nn.BCEWithLogitsLoss().cuda() + optim = init_optimizer(model, lr=lr, weight_decay=0.01) + lr_scheduler = StepLR(optim, step_size=25, gamma=0.1) + early_stop = EarlyStopping(model, optimizer=optim, patience=self.patience, + checkpoint=self.checkpoint_dir, + is_bert=True) + + # Training loop + logfile = '../log/log_mBert_extractor.csv' + method_name = 'mBert_feature_extractor' + + tinit = time() + lang_ids = va_dataset.lang_ids + for epoch in range(1, nepochs + 1): + print('# Start Training ...') + train(model, tr_dataloader, epoch, criterion, optim, method_name, tinit, logfile) + lr_scheduler.step() # reduces the learning rate # TODO arg epoch? + + # Validation + macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va') + early_stop(macrof1, epoch) + + if early_stop.STOP: + print('[early-stop] STOP') + break + + model = early_stop.restore_checkpoint() + self.model = model.cuda() + + if val_epochs > 0: + print(f'running last {val_epochs} training epochs on the validation set') + for val_epoch in range(1, val_epochs + 1): + train(self.model, va_dataloader, epoch + val_epoch, criterion, optim, method_name, tinit, logfile) + + self.fitted = True + return self + + def transform(self, lX): + assert self.fitted is True, 'Calling transform without any initialized model! - call init first or on init' \ + 'pass the "path_to_model" arg.' 
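        # The documents of every language are tokenized to a fixed length of 512 tokens, batched, and fed
        # to the fine-tuned model; each document embedding is the last-layer [CLS] vector (as in
        # feature_extractor), so the returned dict maps each language to an (n_docs x 768) matrix.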
+ print('Obtaining document embeddings from pretrained mBert ') + l_tokenized_X = do_tokenization(lX, max_len=512, verbose=True) + feat_dataset = ExtractorDataset(l_tokenized_X) + feat_lang_ids = feat_dataset.lang_ids + dataloader = DataLoader(feat_dataset, batch_size=64) + all_batch_embeddings, id2lang = feature_extractor(dataloader, feat_lang_ids, self.model) + return all_batch_embeddings + + def fit_transform(self, lX, ly, lV=None): + return self.fit(lX, ly).transform(lX) + + +class RecurrentEmbedder: + + def __init__(self, pretrained, supervised, multilingual_dataset, options, concat=False, lr=1e-3, + we_path='../embeddings', hidden_size=512, sup_drop=0.5, posteriors=False, patience=10, + test_each=0, checkpoint_dir='../checkpoint', model_path=None): + self.pretrained = pretrained + self.supervised = supervised + self.concat = concat + self.requires_tfidf = False + self.multilingual_dataset = multilingual_dataset + self.model = None + self.we_path = we_path + self.langs = multilingual_dataset.langs() + self.hidden_size = hidden_size + self.sup_drop = sup_drop + self.posteriors = posteriors + self.patience = patience + self.checkpoint_dir = checkpoint_dir + self.test_each = test_each + self.options = options + self.seed = options.seed + self.is_trained = False + + ## INIT MODEL for training + self.lXtr, self.lytr = self.multilingual_dataset.training(target_as_csr=True) + self.lXte, self.lyte = self.multilingual_dataset.test(target_as_csr=True) + self.nC = self.lyte[self.langs[0]].shape[1] + lpretrained, lpretrained_vocabulary = self._load_pretrained_embeddings(self.we_path, self.langs) + self.multilingual_index = MultilingualIndex() + self.multilingual_index.index(self.lXtr, self.lytr, self.lXte, lpretrained_vocabulary) + self.multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=self.seed) + self.multilingual_index.embedding_matrices(lpretrained, self.supervised) + + if model_path is not None: + self.is_trained = True + self.model = torch.load(model_path) + else: + self.model = self._init_Net() + + self.optim = init_optimizer(self.model, lr=lr) + self.criterion = torch.nn.BCEWithLogitsLoss().cuda() + self.lr_scheduler = StepLR(self.optim, step_size=25, gamma=0.5) + self.early_stop = EarlyStopping(self.model, optimizer=self.optim, patience=self.patience, + checkpoint=f'{self.checkpoint_dir}/gru_viewgen_-{get_file_name(self.options.dataset)}') + # Init SVM in order to recast (vstacked) document embeddings to vectors of Posterior Probabilities + self.posteriorEmbedder = MetaClassifier( + SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=options.n_jobs) + + def fit(self, lX, ly, lV=None, batch_size=64, nepochs=200, val_epochs=1): + print('### Gated Recurrent Unit View Generator (G)') + if not self.is_trained: + # Batchify input + self.multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=self.seed) + l_train_index, l_train_target = self.multilingual_index.l_train() + l_val_index, l_val_target = self.multilingual_index.l_val() + l_test_index = self.multilingual_index.l_test_index() + batcher_train = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs, + lpad=self.multilingual_index.l_pad()) + batcher_eval = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs, + lpad=self.multilingual_index.l_pad()) + + # Train loop + print('Start training') + method_name = 'gru_view_generator' + logfile = init_logfile_nn(method_name, self.options) + tinit = time.time() + for epoch in range(1, nepochs 
+ 1): + train_gru(model=self.model, batcher=batcher_train, ltrain_index=l_train_index, lytr=l_train_target, + tinit=tinit, logfile=logfile, criterion=self.criterion, optim=self.optim, + epoch=epoch, method_name=method_name, opt=self.options, ltrain_posteriors=None, + ltrain_bert=None) + self.lr_scheduler.step() # reduces the learning rate # TODO arg epoch? + + # validation step + macrof1 = test_gru(self.model, batcher_eval, l_val_index, None, None, l_val_target, tinit, epoch, + logfile, self.criterion, 'va') + + self.early_stop(macrof1, epoch) + if self.test_each > 0: + test_gru(self.model, batcher_eval, l_test_index, None, None, self.lyte, tinit, epoch, + logfile, self.criterion, 'te') + + if self.early_stop.STOP: + print('[early-stop] STOP') + print('Restoring best model...') + break + + self.model = self.early_stop.restore_checkpoint() + print(f'running last {val_epochs} training epochs on the validation set') + for val_epoch in range(1, val_epochs+1): + batcher_train.init_offset() + train_gru(model=self.model, batcher=batcher_train, ltrain_index=l_train_index, lytr=l_train_target, + tinit=tinit, logfile=logfile, criterion=self.criterion, optim=self.optim, + epoch=epoch, method_name=method_name, opt=self.options, ltrain_posteriors=None, + ltrain_bert=None) + self.is_trained = True + + # Generate document embeddings in order to fit an SVM to recast them as vector for Posterior Probabilities + lX = self._get_doc_embeddings(lX) + # Fit a ''multi-lingual'' SVM on the generated doc embeddings + self.posteriorEmbedder.fit(lX, ly) + return self + + def transform(self, lX, batch_size=64): + lX = self._get_doc_embeddings(lX) + return self.posteriorEmbedder.predict_proba(lX) + + def fit_transform(self, lX, ly, lV=None): + # TODO + return 0 + + def _get_doc_embeddings(self, lX, batch_size=64): + assert self.is_trained, 'Model is not trained, cannot call transform before fitting the model!' + print('Generating document embeddings via GRU') + lX = {} + ly = {} + batcher_transform = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs, + lpad=self.multilingual_index.l_pad()) + + l_devel_index = self.multilingual_index.l_devel_index() + l_devel_target = self.multilingual_index.l_devel_target() + + for idx, (batch, post, bert_emb, target, lang) in enumerate( + batcher_transform.batchify(l_devel_index, None, None, l_devel_target)): + if lang not in lX.keys(): + lX[lang] = self.model.get_embeddings(batch, lang) + ly[lang] = target.cpu().detach().numpy() + else: + lX[lang] = np.concatenate((lX[lang], self.model.get_embeddings(batch, lang)), axis=0) + ly[lang] = np.concatenate((ly[lang], target.cpu().detach().numpy()), axis=0) + + return lX + + # loads the MUSE embeddings if requested, or returns empty dictionaries otherwise + def _load_pretrained_embeddings(self, we_path, langs): + lpretrained = lpretrained_vocabulary = self._none_dict(langs) # TODO ? 
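        # MUSE embeddings are loaded here unconditionally; the per-language embedding objects and their
        # vocabularies returned below are what __init__ passes on to MultilingualIndex.index() and
        # embedding_matrices() to build the index and the embedding layers.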
+ lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1) + lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs} + return lpretrained, lpretrained_vocabulary + + def _none_dict(self, langs): + return {l:None for l in langs} + + # instantiates the net, initializes the model parameters, and sets embeddings trainable if requested + def _init_Net(self, xavier_uniform=True): + model = RNNMultilingualClassifier( + output_size=self.nC, + hidden_size=self.hidden_size, + lvocab_size=self.multilingual_index.l_vocabsize(), + learnable_length=0, + lpretrained=self.multilingual_index.l_embeddings(), + drop_embedding_range=self.multilingual_index.sup_range, + drop_embedding_prop=self.sup_drop, + post_probabilities=self.posteriors + ) + return model.cuda() class DocEmbedderList: def __init__(self, *embedder_list, aggregation='concat'): assert aggregation in {'concat', 'mean'}, 'unknown aggregation mode, valid are "concat" and "mean"' - if len(embedder_list)==0: embedder_list=[] + if len(embedder_list) == 0: + embedder_list = [] self.embedders = embedder_list self.aggregation = aggregation print(f'Aggregation mode: {self.aggregation}') - def fit(self, lX, ly, lV=None): + def fit(self, lX, ly, lV=None, tfidf=None): for transformer in self.embedders: - transformer.fit(lX,ly,lV) + _lX = lX + if transformer.requires_tfidf: + _lX = tfidf + transformer.fit(_lX, ly, lV) return self - def transform(self, lX): + def transform(self, lX, tfidf=None): if self.aggregation == 'concat': - return self.transform_concat(lX) + return self.transform_concat(lX, tfidf) elif self.aggregation == 'mean': - return self.transform_mean(lX) + return self.transform_mean(lX, tfidf) - def transform_concat(self, lX): - if len(self.embedders)==1: + def transform_concat(self, lX, tfidf): + if len(self.embedders) == 1: + if self.embedders[0].requires_tfidf: + lX = tfidf return self.embedders[0].transform(lX) some_sparse = False @@ -234,32 +473,41 @@ class DocEmbedderList: lZparts = {l: [] for l in langs} for transformer in self.embedders: - lZ = transformer.transform(lX) + _lX = lX + if transformer.requires_tfidf: + _lX = tfidf + lZ = transformer.transform(_lX) for l in langs: Z = lZ[l] some_sparse = some_sparse or issparse(Z) lZparts[l].append(Z) hstacker = hstack if some_sparse else np.hstack - return {l:hstacker(lZparts[l]) for l in langs} + return {l: hstacker(lZparts[l]) for l in langs} - def transform_mean(self, lX): - if len(self.embedders)==1: + def transform_mean(self, lX, tfidf): + if len(self.embedders) == 1: return self.embedders[0].transform(lX) langs = sorted(lX.keys()) lZparts = {l: None for l in langs} + # min_dim = min([transformer._get_output_dim() for transformer in self.embedders]) - min_dim = 300 + min_dim = 73 # TODO <---- this should be the number of target classes + for transformer in self.embedders: - lZ = transformer.transform(lX) + _lX = lX + if transformer.requires_tfidf: + _lX = tfidf + lZ = transformer.transform(_lX) nC = min([lZ[lang].shape[1] for lang in langs]) for l in langs: Z = lZ[l] if Z.shape[1] > min_dim: - print(f'Space Z matrix has more dimensions ({Z.shape[1]}) than the smallest representation {min_dim}.' - f'Applying PCA(n_components={min_dim})') + print( + f'Space Z matrix has more dimensions ({Z.shape[1]}) than the smallest representation {min_dim}.' 
+ f'Applying PCA(n_components={min_dim})') pca = PCA(n_components=min_dim) Z = pca.fit(Z).transform(Z) if lZparts[l] is None: @@ -268,12 +516,11 @@ class DocEmbedderList: lZparts[l] += Z n_transformers = len(self.embedders) - nC = min([lZparts[lang].shape[1] for lang in langs]) - return {l:lZparts[l] / n_transformers for l in langs} + return {l: lZparts[l] / n_transformers for l in langs} - def fit_transform(self, lX, ly, lV=None): - return self.fit(lX, ly, lV).transform(lX) + def fit_transform(self, lX, ly, lV=None, tfidf=None): + return self.fit(lX, ly, lV, tfidf).transform(lX, tfidf) def best_params(self): return {'todo'} @@ -283,11 +530,13 @@ class DocEmbedderList: class FeatureSet2Posteriors: - def __init__(self, transformer, l2=True, n_jobs=-1): + def __init__(self, transformer, requires_tfidf=False, l2=True, n_jobs=-1): self.transformer = transformer - self.l2=l2 + self.l2 = l2 self.n_jobs = n_jobs - self.prob_classifier = MetaClassifier(SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs) + self.prob_classifier = MetaClassifier( + SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs) + self.requires_tfidf = requires_tfidf def fit(self, lX, ly, lV=None): if lV is None and hasattr(self.transformer, 'lV'): @@ -314,12 +563,12 @@ class FeatureSet2Posteriors: # ------------------------------------------------------------------ -# Meta-Classifier +# Meta-Classifier (aka second-tier learner) # ------------------------------------------------------------------ class MetaClassifier: def __init__(self, meta_learner, meta_parameters=None, n_jobs=-1, standardize_range=None): - self.n_jobs=n_jobs + self.n_jobs = n_jobs self.model = MonolingualClassifier(base_learner=meta_learner, parameters=meta_parameters, n_jobs=n_jobs) self.standardize_range = standardize_range @@ -354,34 +603,37 @@ class MetaClassifier: def best_params(self): return self.model.best_params() + # ------------------------------------------------------------------ -# Ensembling +# Ensembling (aka Funnelling) # ------------------------------------------------------------------ class Funnelling: def __init__(self, - vectorizer:TfidfVectorizerMultilingual, - first_tier:DocEmbedderList, - meta:MetaClassifier): + vectorizer: TfidfVectorizerMultilingual, + first_tier: DocEmbedderList, + meta: MetaClassifier): self.vectorizer = vectorizer self.first_tier = first_tier self.meta = meta self.n_jobs = meta.n_jobs def fit(self, lX, ly): - lX = self.vectorizer.fit_transform(lX, ly) + tfidf_lX = self.vectorizer.fit_transform(lX, ly) lV = self.vectorizer.vocabulary() - lZ = self.first_tier.fit_transform(lX, ly, lV) + print('## Fitting first-tier learners!') + lZ = self.first_tier.fit_transform(lX, ly, lV, tfidf=tfidf_lX) + print('## Fitting meta-learner!') self.meta.fit(lZ, ly) def predict(self, lX, ly=None): - lX = self.vectorizer.transform(lX) - lZ = self.first_tier.transform(lX) + tfidf_lX = self.vectorizer.transform(lX) + lZ = self.first_tier.transform(lX, tfidf=tfidf_lX) ly_ = self.meta.predict(lZ) return ly_ def best_params(self): - return {'1st-tier':self.first_tier.best_params(), - 'meta':self.meta.best_params()} + return {'1st-tier': self.first_tier.best_params(), + 'meta': self.meta.best_params()} class Voting: @@ -394,15 +646,14 @@ class Voting: classifier.fit(lX, ly, lV) def predict(self, lX, ly=None): - - lP = {l:[] for l in lX.keys()} + lP = {l: [] for l in lX.keys()} for classifier in self.prob_classifiers: lPi = 
classifier.predict_proba(lX) for l in lX.keys(): lP[l].append(lPi[l]) - lP = {l:np.stack(Plist).mean(axis=0) for l,Plist in lP.items()} - ly = {l:P>0.5 for l,P in lP.items()} + lP = {l: np.stack(Plist).mean(axis=0) for l, Plist in lP.items()} + ly = {l: P > 0.5 for l, P in lP.items()} return ly @@ -419,7 +670,6 @@ def load_muse_embeddings(we_path, langs, n_jobs=-1): def word_class_embedding_matrix(X, Y, max_label_space=300): - print('computing supervised embeddings...') WCE = supervised_embeddings_tfidf(X, Y) WCE = zscores(WCE, axis=0) @@ -433,9 +683,7 @@ def word_class_embedding_matrix(X, Y, max_label_space=300): return WCE -def XdotM(X,M, sif): - # return X.dot(M) - print(f'X={X.shape}, M={M.shape}') +def XdotM(X, M, sif): E = X.dot(M) if sif: print("removing pc...") @@ -444,6 +692,137 @@ def XdotM(X,M, sif): def _normalize(lX, l2=True): - return {l: normalize(X) for l, X in lX.items()} if l2 else lX + return {l: normalize(X) for l, X in lX.items()} if l2 else lX +class BatchGRU: + def __init__(self, batchsize, batches_per_epoch, languages, lpad, max_pad_length=500): + self.batchsize = batchsize + self.batches_per_epoch = batches_per_epoch + self.languages = languages + self.lpad=lpad + self.max_pad_length=max_pad_length + self.init_offset() + + def init_offset(self): + self.offset = {lang: 0 for lang in self.languages} + + def batchify(self, l_index, l_post, l_bert, llabels): + langs = self.languages + l_num_samples = {l:len(l_index[l]) for l in langs} + + max_samples = max(l_num_samples.values()) + n_batches = max_samples // self.batchsize + 1 * (max_samples % self.batchsize > 0) + if self.batches_per_epoch != -1 and self.batches_per_epoch < n_batches: + n_batches = self.batches_per_epoch + + for b in range(n_batches): + for lang in langs: + index, labels = l_index[lang], llabels[lang] + offset = self.offset[lang] + if offset >= l_num_samples[lang]: + offset = 0 + limit = offset+self.batchsize + + batch_slice = slice(offset, limit) + batch = index[batch_slice] + batch_labels = labels[batch_slice].toarray() + + post = None + bert_emb = None + + batch = pad(batch, pad_index=self.lpad[lang], max_pad_length=self.max_pad_length) + batch = torch.LongTensor(batch).cuda() + target = torch.FloatTensor(batch_labels).cuda() + + self.offset[lang] = limit + + yield batch, post, bert_emb, target, lang + + +def pad(index_list, pad_index, max_pad_length=None): + pad_length = np.max([len(index) for index in index_list]) + if max_pad_length is not None: + pad_length = min(pad_length, max_pad_length) + for i,indexes in enumerate(index_list): + index_list[i] = [pad_index]*(pad_length-len(indexes)) + indexes[:pad_length] + return index_list + + +def train_gru(model, batcher, ltrain_index, lytr, tinit, logfile, criterion, optim, epoch, method_name, opt, + ltrain_posteriors=None, ltrain_bert=None, log_interval=10): + _dataset_path = opt.dataset.split('/')[-1].split('_') + dataset_id = _dataset_path[0] + _dataset_path[-1] + + loss_history = [] + model.train() + for idx, (batch, post, bert_emb, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, ltrain_bert, lytr)): + optim.zero_grad() + loss = criterion(model(batch, post, bert_emb, lang), target) + loss.backward() + clip_gradient(model) + optim.step() + loss_history.append(loss.item()) + + if idx % log_interval == 0: + interval_loss = np.mean(loss_history[-log_interval:]) + print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, ' + f'Training Loss: {interval_loss:.6f}') + + mean_loss = 
np.mean(interval_loss) + logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time.time() - tinit) + return mean_loss + + +def test_gru(model, batcher, ltest_index, ltest_posteriors, lte_bert, lyte, tinit, epoch, logfile, criterion, measure_prefix): + loss_history = [] + model.eval() + langs = sorted(ltest_index.keys()) + predictions = {l: [] for l in langs} + yte_stacked = {l: [] for l in langs} + batcher.init_offset() + for batch, post, bert_emb, target, lang in tqdm(batcher.batchify(ltest_index, ltest_posteriors, lte_bert, lyte), + desc='evaluation: '): + logits = model(batch, post, bert_emb, lang) + loss = criterion(logits, target).item() + prediction = predict(logits) + predictions[lang].append(prediction) + yte_stacked[lang].append(target.detach().cpu().numpy()) + loss_history.append(loss) + + ly = {l:np.vstack(yte_stacked[l]) for l in langs} + ly_ = {l:np.vstack(predictions[l]) for l in langs} + l_eval = evaluate(ly, ly_) + metrics = [] + for lang in langs: + macrof1, microf1, macrok, microk = l_eval[lang] + metrics.append([macrof1, microf1, macrok, microk]) + if measure_prefix == 'te': + print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') + Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0) + print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]') + + mean_loss = np.mean(loss_history) + logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time.time() - tinit) + logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time.time() - tinit) + logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time.time() - tinit) + logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time.time() - tinit) + logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time.time() - tinit) + + return Mf1 + + +def clip_gradient(model, clip_value=1e-1): + params = list(filter(lambda p: p.grad is not None, model.parameters())) + for p in params: + p.grad.data.clamp_(-clip_value, clip_value) + + +def init_logfile_nn(method_name, opt): + logfile = CSVLog(opt.logfile_gru, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse']) + logfile.set_default('dataset', opt.dataset) + logfile.set_default('run', opt.seed) + logfile.set_default('method', method_name) + assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} ' \ + f'and run {opt.seed} already calculated' + return logfile diff --git a/src/main_deep_learning.py b/src/main_deep_learning.py index d330b04..a911b14 100755 --- a/src/main_deep_learning.py +++ b/src/main_deep_learning.py @@ -12,6 +12,7 @@ from time import time from tqdm import tqdm from util.evaluation import evaluate from util.file import get_file_name +# import pickle allowed_nets = {'rnn'} @@ -34,7 +35,8 @@ def init_Net(nC, multilingual_index, xavier_uniform=True): drop_embedding_range=multilingual_index.sup_range, drop_embedding_prop=opt.sup_drop, post_probabilities=opt.posteriors, - only_post=only_post + only_post=only_post, + bert_embeddings=opt.mbert ) # weight initialization @@ -59,8 +61,10 @@ def set_method_name(): method_name += f'-WCE' if opt.posteriors: method_name += f'-Posteriors' + if opt.mbert: + method_name += f'-mBert' if (opt.pretrained or opt.supervised) and opt.tunable: - method_name+='-(trainable)' + method_name += '-(trainable)' else: method_name += '-(static)' if 
opt.learnable > 0: @@ -77,7 +81,8 @@ def init_logfile(method_name, opt): logfile.set_default('dataset', opt.dataset) logfile.set_default('run', opt.seed) logfile.set_default('method', method_name) - assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} and run {opt.seed} already calculated' + assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} ' \ + f'and run {opt.seed} already calculated' return logfile @@ -90,15 +95,83 @@ def load_pretrained_embeddings(we_path, langs): return lpretrained, lpretrained_vocabulary +def get_lr(optimizer): + for param_group in optimizer.param_groups: + return param_group['lr'] + + +def train(model, batcher, ltrain_index, ltrain_posteriors, ltrain_bert, lytr, tinit, logfile, criterion, optim, epoch, method_name): + _dataset_path = opt.dataset.split('/')[-1].split('_') + dataset_id = _dataset_path[0] + _dataset_path[-1] + + loss_history = [] + model.train() + for idx, (batch, post, bert_emb, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, ltrain_bert, lytr)): + optim.zero_grad() + # _out = model(batch, post, bert_emb, lang) + loss = criterion(model(batch, post, bert_emb, lang), target) + loss.backward() + clip_gradient(model) + optim.step() + loss_history.append(loss.item()) + + if idx % opt.log_interval == 0: + interval_loss = np.mean(loss_history[-opt.log_interval:]) + print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}') + + mean_loss = np.mean(interval_loss) + logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit) + return mean_loss + + +def test(model, batcher, ltest_index, ltest_posteriors, lte_bert, lyte, tinit, epoch, logfile, criterion, measure_prefix): + + loss_history = [] + model.eval() + langs = sorted(ltest_index.keys()) + predictions = {l:[] for l in langs} + yte_stacked = {l:[] for l in langs} + batcher.init_offset() + for batch, post, bert_emb, target, lang in tqdm(batcher.batchify(ltest_index, ltest_posteriors, lte_bert, lyte), desc='evaluation: '): + logits = model(batch, post, bert_emb, lang) + loss = criterion(logits, target).item() + prediction = predict(logits) + predictions[lang].append(prediction) + yte_stacked[lang].append(target.detach().cpu().numpy()) + loss_history.append(loss) + + ly = {l:np.vstack(yte_stacked[l]) for l in langs} + ly_ = {l:np.vstack(predictions[l]) for l in langs} + l_eval = evaluate(ly, ly_) + metrics = [] + for lang in langs: + macrof1, microf1, macrok, microk = l_eval[lang] + metrics.append([macrof1, microf1, macrok, microk]) + if measure_prefix == 'te': + print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') + Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0) + print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]') + + mean_loss = np.mean(loss_history) + logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit) + logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit) + logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit) + logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit) + logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit) + + return Mf1 + + # 
---------------------------------------------------------------------------------------------------------------------- def main(): + DEBUGGING = False method_name = set_method_name() logfile = init_logfile(method_name, opt) # Loading the dataset data = MultilingualDataset.load(opt.dataset) - data.set_view(languages=['de', 'fr']) #, 'it', 'en']) # 'sv', 'da', 'es', 'it']) + # data.set_view(languages=['it', 'fr']) # Testing with less langs data.show_dimensions() langs = data.langs() l_devel_raw, l_devel_target = data.training(target_as_csr=True) @@ -114,25 +187,36 @@ def main(): multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=opt.seed) multilingual_index.embedding_matrices(lpretrained, opt.supervised) if opt.posteriors: - lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=5000, store_posteriors=True) #stored_post=True) #opt.svm_max_docs) + if DEBUGGING: + import pickle + with open('/home/andreapdr/funneling_pdr/dumps/posteriors_jrc_run0.pickle', 'rb') as infile: + data_post = pickle.load(infile) + lPtr = data_post[0] + lPva = data_post[1] + lPte = data_post[2] + print('## DEBUGGING MODE: loaded dumped posteriors for jrc run0') + else: + lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=5000) else: lPtr, lPva, lPte = None, None, None - # just_test = False - # if just_test: - # - # model = torch.load( - # '../checkpoint/rnn(H512)-Muse-WCE-Posteriors-(trainable)-jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle') - # criterion = torch.nn.BCEWithLogitsLoss().cuda() - # - # # batcher_train = Batch(opt.batch_size, batches_per_epoch=10, languages=langs, lpad=multilingual_index.l_pad()) - # - # batcher_eval = Batch(opt.batch_size, batches_per_epoch=-1, languages=langs, lpad=multilingual_index.l_pad()) - # l_test_index = multilingual_index.l_test_index() - # epoch = 1 - # tinit = time() - # test(model, batcher_eval, l_test_index, lPte, l_test_target, tinit, epoch, logfile, criterion, 'te') - # exit('Loaded') + if opt.mbert: + _dataset_path = opt.dataset.split('/')[-1].split('_') + _model_folder = _dataset_path[0] + '_' + _dataset_path[-1].replace('.pickle', '') + # print(f'Model Folder: {_model_folder}') + + if DEBUGGING: + with open('/home/andreapdr/funneling_pdr/dumps/mBert_jrc_run0.pickle', 'rb') as infile: + data_embed = pickle.load(infile) + tr_bert_embeddings = data_embed[0] + va_bert_embeddings = data_embed[1] + te_bert_embeddings = data_embed[2] + print('## DEBUGGING MODE: loaded dumped mBert embeddings for jrc run0') + else: + tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings \ + = multilingual_index.bert_embeddings(f'/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-{_model_folder}/') + else: + tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings = None, None, None # Model initialization model = init_Net(data.num_categories(), multilingual_index) @@ -141,11 +225,12 @@ def main(): criterion = torch.nn.BCEWithLogitsLoss().cuda() lr_scheduler = StepLR(optim, step_size=25, gamma=0.5) batcher_train = Batch(opt.batch_size, batches_per_epoch=10, languages=langs, lpad=multilingual_index.l_pad()) - batcher_eval = Batch(opt.batch_size, batches_per_epoch=-1, languages=langs, lpad=multilingual_index.l_pad()) + batcher_eval = Batch(opt.batch_size, batches_per_epoch=-1, languages=langs, lpad=multilingual_index.l_pad()) tinit = time() create_if_not_exist(opt.checkpoint_dir) - early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience, 
checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}') + early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience, + checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}') l_train_index, l_train_target = multilingual_index.l_train() l_val_index, l_val_target = multilingual_index.l_val() @@ -154,11 +239,11 @@ def main(): print('-'*80) print('Start training') for epoch in range(1, opt.nepochs + 1): - train(model, batcher_train, l_train_index, lPtr, l_train_target, tinit, logfile, criterion, optim, epoch, method_name) + train(model, batcher_train, l_train_index, lPtr, tr_bert_embeddings, l_train_target, tinit, logfile, criterion, optim, epoch, method_name) lr_scheduler.step() # reduces the learning rate # validation - macrof1 = test(model, batcher_eval, l_val_index, lPva, l_val_target, tinit, epoch, logfile, criterion, 'va') + macrof1 = test(model, batcher_eval, l_val_index, lPva, va_bert_embeddings, l_val_target, tinit, epoch, logfile, criterion, 'va') early_stop(macrof1, epoch) if opt.test_each>0: if (opt.plotmode and (epoch==1 or epoch%opt.test_each==0)) or (not opt.plotmode and epoch%opt.test_each==0 and epoch 0: if (opt.plotmode and (epoch == 1 or epoch % opt.test_each == 0)) or ( not opt.plotmode and epoch % opt.test_each == 0 and epoch < opt.nepochs): - test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te') + test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te', writer=None) if early_stop.STOP: print('[early-stop] STOP') @@ -323,16 +332,19 @@ def main(): print('Training over. Performing final evaluation') model = early_stop.restore_checkpoint() + model = model.cuda() if opt.val_epochs > 0: print(f'running last {opt.val_epochs} training epochs on the validation set') for val_epoch in range(1, opt.val_epochs + 1): - train(model, va_dataloader, epoch + val_epoch, criterion, optim, method_name, tinit, logfile) + train(model, va_dataloader, epoch + val_epoch, criterion, optim, method_name, tinit, logfile, writer=None) # final test print('Training complete: testing') - test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te') + test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te', writer=None) + # writer.flush() + # writer.close() exit('Code Executed!') @@ -372,6 +384,7 @@ if __name__ == '__main__': # Testing different parameters ... 
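    # Hard-coded overrides for the current fine-tuning runs: AdamW weight decay of 0.01, the small
    # learning rate (1e-5) commonly used when fine-tuning BERT-like models, and a shorter
    # early-stopping patience (5).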
opt.weight_decay = 0.01 opt.lr = 1e-5 + opt.patience = 5 main() # TODO: refactor .cuda() -> .to(device) in order to check if the process is faster on CPU given the bigger batch size diff --git a/src/main_mbert_extractor.py b/src/main_mbert_extractor.py new file mode 100644 index 0000000..f294fc5 --- /dev/null +++ b/src/main_mbert_extractor.py @@ -0,0 +1,110 @@ +from main_mbert import * +import pickle + + +class ExtractorDataset(Dataset): + """ + data: dict of lang specific tokenized data + labels: dict of lang specific targets + """ + + def __init__(self, data): + self.langs = data.keys() + self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)} + + for i, lang in enumerate(self.langs): + _data = data[lang]['input_ids'] + _data = np.array(_data) + _lang_value = np.full(len(_data), self.lang_ids[lang]) + + if i == 0: + self.data = _data + self.lang_index = _lang_value + else: + self.data = np.vstack((self.data, _data)) + self.lang_index = np.concatenate((self.lang_index, _lang_value)) + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + x = self.data[idx] + lang = self.lang_index[idx] + + return x, lang + + def get_lang_ids(self): + return self.lang_ids + + +def feature_extractor(data, lang_ids, model_path='/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run0/'): + print('# Feature Extractor Mode...') + from transformers import BertConfig + config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True, num_labels=300) + model = BertForSequenceClassification.from_pretrained(model_path, + config=config).cuda() + + """ + Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for + the output of each layer) of shape (batch_size, sequence_length, hidden_size) + """ + all_batch_embeddings = {} + id2lang = {v:k for k,v in lang_ids.items()} + with torch.no_grad(): + for batch, target, lang_idx in data: + out = model(batch.cuda()) + last_hidden_state = out[1][-1] + batch_embeddings = last_hidden_state[:, 0, :] + for i, l_idx in enumerate(lang_idx.numpy()): + if id2lang[l_idx] not in all_batch_embeddings.keys(): + all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy() + else: + all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]], + batch_embeddings[i].detach().cpu().numpy())) + + return all_batch_embeddings, id2lang + + +def main(): + print('Running main ...') + print(f'Model path: {opt.modelpath}\nDataset path: {opt.dataset}') + DATAPATH = opt.dataset + MAX_LEN = 512 + + l_devel_raw, l_devel_target, l_test_raw, l_test_target = load_datasets(DATAPATH) + l_tokenized_tr = do_tokenization(l_devel_raw, max_len=MAX_LEN) + l_tokenized_te = do_tokenization(l_test_raw, max_len=MAX_LEN) + + tr_dataset = TrainingDataset(l_tokenized_tr, l_devel_target) + tr_lang_ids = tr_dataset.lang_ids + + te_dataset = TrainingDataset(l_tokenized_te, l_test_target) + te_lang_ids = te_dataset.lang_ids + + tr_dataloader = DataLoader(tr_dataset, batch_size=64, shuffle=False) # Shuffle False to extract doc embeddings + te_dataloader = DataLoader(te_dataset, batch_size=64, shuffle=False) # Shuffle False to extract doc + + tr_all_batch_embeddings, id2lang_tr = feature_extractor(tr_dataloader, tr_lang_ids, opt.modelpath) # Extracting doc embed for devel + with open(f'{opt.modelpath}/TR_embed_{get_dataset_name(opt.dataset)}.pkl', 'wb') as outfile: + pickle.dump((tr_all_batch_embeddings, id2lang_tr), outfile) + + te_all_batch_embeddings, id2lang_te = 
feature_extractor(te_dataloader, te_lang_ids, opt.modelpath) # Extracting doc embed for test + with open(f'{opt.modelpath}/TE_embed_{get_dataset_name(opt.dataset)}.pkl', 'wb') as outfile: + pickle.dump((te_all_batch_embeddings, id2lang_te), outfile) + + exit('Extraction completed!') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='mBert model document embedding extractor') + + parser.add_argument('--dataset', type=str, + default='/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle', + metavar='datasetpath', help=f'path to the pickled dataset') + parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)') + parser.add_argument('--modelpath', type=str, default='/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run0', + metavar='modelpath', help=f'path to pre-trained mBert model') + opt = parser.parse_args() + + main() + diff --git a/src/main_multimodal_cls.py b/src/main_multimodal_cls.py index e5859b7..04ae86a 100644 --- a/src/main_multimodal_cls.py +++ b/src/main_multimodal_cls.py @@ -2,102 +2,41 @@ import os from dataset_builder import MultilingualDataset from learning.transformers import * from util.evaluation import * -from optparse import OptionParser from util.file import exists from util.results import PolylingualClassificationResults -from sklearn.svm import SVC - - -parser = OptionParser(usage="usage: %prog datapath [options]") - -parser.add_option("-o", "--output", dest="output", - help="Result file", type=str, default='multiModal_log.csv') - -parser.add_option("-P", "--posteriors", dest="posteriors", action='store_true', - help="Add posterior probabilities to the document embedding representation", default=False) - -parser.add_option("-S", "--supervised", dest="supervised", action='store_true', - help="Add supervised (Word-Class Embeddings) to the document embedding representation", default=False) - -parser.add_option("-U", "--pretrained", dest="pretrained", action='store_true', - help="Add pretrained MUSE embeddings to the document embedding representation", default=False) - -parser.add_option("--l2", dest="l2", action='store_true', - help="Activates l2 normalization as a post-processing for the document embedding views", default=False) - -parser.add_option("--allprob", dest="allprob", action='store_true', - help="All views are generated as posterior probabilities. This affects the supervised and pretrained " - "embeddings, for which a calibrated classifier is generated, which generates the posteriors", default=False) - -parser.add_option("--feat-weight", dest="feat_weight", - help="Term weighting function to weight the averaged embeddings", type=str, default='tfidf') - -parser.add_option("-w", "--we-path", dest="we_path", - help="Path to the MUSE polylingual word embeddings", default='../embeddings') - -parser.add_option("-s", "--set_c", dest="set_c",type=float, - help="Set the C parameter", default=1) - -parser.add_option("-c", "--optimc", dest="optimc", action='store_true', - help="Optimize hyperparameters", default=False) - -parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int, - help="Number of parallel jobs (default is -1, all)", default=-1) - -parser.add_option("-p", "--pca", dest="max_labels_S", type=int, - help="If smaller than number of target classes, PCA will be applied to supervised matrix. 
", - default=300) - -parser.add_option("-r", "--remove-pc", dest="sif", action='store_true', - help="Remove common component when computing dot product of word embedding matrices", default=False) - -parser.add_option("-z", "--zscore", dest="zscore", action='store_true', - help="Z-score normalize matrices (WCE and MUSE)", default=False) - -parser.add_option("-a", "--agg", dest="agg", action='store_true', - help="Set aggregation function of the common Z-space to average (Default: concatenation)", default=False) - - - -def get_learner(calibrate=False, kernel='linear'): - return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, gamma='auto') - -def get_params(): - if not op.optimc: - return None - c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1] - kernel = 'rbf' - return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}] - - -####################################################################################################################### - +from util.common import * +from util.parser_options import * if __name__ == '__main__': (op, args) = parser.parse_args() - - assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)' - dataset = args[0] + dataset = op.dataset assert exists(dataset), 'Unable to find file '+str(dataset) assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option' - assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed' - l2=op.l2 + assert op.posteriors or op.supervised or op.pretrained or op.mbert or op.gruViewGenerator, \ + 'empty set of document embeddings is not allowed' + assert (op.gruWCE or op.gruMUSE) and op.gruViewGenerator, 'Initializing Gated Recurrent embedding layer without ' \ + 'explicit initialization of GRU View Generator' + l2 = op.l2 dataset_file = os.path.basename(dataset) - results = PolylingualClassificationResults('../log/' + op.output) - allprob='Prob' if op.allprob else '' - result_id = f'{dataset_file}_ProbPost={op.posteriors}_{allprob}WCE={op.supervised}(PCA={op.max_labels_S})_{allprob}' \ - f'MUSE={op.pretrained}_weight={op.feat_weight}_l2={l2}_zscore={op.zscore}{"_optimC" if op.optimc else ""}' - print(f'{result_id}') + allprob = 'Prob' if op.allprob else '' - # set zscore range - is slice(0,0) mean will be equal to 0 and std to 1, thus normalization will have no effect - standardize_range = slice(0,0) + # renaming arguments to be printed on log + method_name, dataset_name = get_method_name(dataset, op.posteriors, op.supervised, op.pretrained, op.mbert, + op.gruViewGenerator, op.gruMUSE, op.gruWCE, op.agg, op.allprob) + print(f'Method: gFun{method_name}\nDataset: {dataset_name}') + print('-'*50) + exit() + + # set zscore range - is slice(0, 0) mean will be equal to 0 and std to 1, thus normalization will have no effect + standardize_range = slice(0, 0) if op.zscore: standardize_range = None + # load dataset data = MultilingualDataset.load(dataset) - # data.set_view(languages=['fr', 'it']) + # data.set_view(languages=['fr', 'it']) # TODO: DEBUG SETTING data.show_dimensions() lXtr, lytr = data.training() lXte, lyte = data.test() @@ -108,68 +47,96 @@ if __name__ == '__main__': # feature weighting (for word embeddings average) feat_weighting = FeatureWeight(op.feat_weight, agg='mean') - # # document embedding modules + # document embedding modules aka View Generators doc_embedder = DocEmbedderList(aggregation='mean' if op.agg else 'concat') + + # init View Generators if op.posteriors: - 
doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True, kernel='linear'), l2=l2)) + """ + View Generator (-X): cast document representations encoded via TFIDF into posterior probabilities by means + of a set of SVM. + """ + doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True, + kernel='linear', + C=op.set_c), l2=l2)) + if op.supervised: + """ + View Generator (-W): generates document representation via Word-Class-Embeddings. + Document embeddings are obtained via weighted sum of document's constituent embeddings. + """ wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, sif=op.sif) if op.allprob: - wce = FeatureSet2Posteriors(wce, l2=l2) + wce = FeatureSet2Posteriors(wce, requires_tfidf=True, l2=l2) doc_embedder.append(wce) + if op.pretrained: + """ + View Generator (-M): generates document representation via MUSE embeddings (Fasttext multilingual word + embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings. + """ muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif) if op.allprob: - muse = FeatureSet2Posteriors(muse, l2=l2) + muse = FeatureSet2Posteriors(muse, requires_tfidf=True, l2=l2) doc_embedder.append(muse) + if op.gruViewGenerator: + """ + View Generator (-G): generates document embedding by means of a Gated Recurrent Units. The model can be + initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,). Such + document embeddings are then casted into vectors of posterior probabilities via a set of SVM. + NB: --allprob won't have any effect on this View Gen since output is already encoded as post prob + """ + rnn_embedder = RecurrentEmbedder(pretrained=op.gruMUSE, supervised=op.gruWCE, multilingual_dataset=data, + options=op, model_path=op.gru_path) + doc_embedder.append(rnn_embedder) + + if op.mbert: + """ + View generator (-B): generates document embedding via mBERT model. 
+ """ + mbert = MBertEmbedder(path_to_model=op.bert_path, + nC=data.num_categories()) + if op.allprob: + mbert = FeatureSet2Posteriors(mbert, l2=l2) + doc_embedder.append(mbert) + # metaclassifier meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}] - meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'), meta_parameters=get_params(), standardize_range=standardize_range) + meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf', C=op.set_c), + meta_parameters=get_params(op.optimc), standardize_range=standardize_range) # ensembling the modules classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta) - print('# Fitting ...') + print('\n# Fitting Funnelling Architecture...') + tinit = time.time() classifier.fit(lXtr, lytr) + time = time.time()-tinit print('\n# Evaluating ...') l_eval = evaluate_method(classifier, lXte, lyte) - # renaming arguments to be printed on log - _id = '' - _id_conf = [op.posteriors, op.supervised, op.pretrained] - _id_name = ['+P', '+W', '+M'] - for i, conf in enumerate(_id_conf): - if conf: - _id += _id_name[i] - _id = _id.lstrip('+') - _id = _id if not op.agg else _id + '_mean' - _id = _id if not op.allprob else _id + '_allprob' - - _dataset_path = dataset.split('/')[-1].split('_') - dataset_id = _dataset_path[0] + _dataset_path[-1] - metrics = [] for lang in lXte.keys(): macrof1, microf1, macrok, microk = l_eval[lang] metrics.append([macrof1, microf1, macrok, microk]) print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') results.add_row(method='MultiModal', - learner='svm', + learner='SVM', optimp=op.optimc, - sif= op.sif, + sif=op.sif, zscore=op.zscore, - l2= op.l2, - wescaler= op.feat_weight, + l2=op.l2, + wescaler=op.feat_weight, pca=op.max_labels_S, - id=_id, - dataset=dataset_id, - time='todo', + id=method_name, + dataset=dataset_name, + time=time, lang=lang, macrof1=macrof1, microf1=microf1, macrok=macrok, microk=microk, notes='') - print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0)) + print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3)) diff --git a/src/main_qualitative_analysis.py b/src/main_qualitative_analysis.py new file mode 100644 index 0000000..aead994 --- /dev/null +++ b/src/main_qualitative_analysis.py @@ -0,0 +1,49 @@ +import os +from dataset_builder import MultilingualDataset +from optparse import OptionParser +from util.file import exists +import numpy as np +from sklearn.feature_extraction.text import CountVectorizer + +parser = OptionParser(usage="usage: %prog datapath [options]") + +(op, args) = parser.parse_args() +assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)' +dataset = args[0] +assert exists(dataset), 'Unable to find file '+str(dataset) + +dataset_file = os.path.basename(dataset) + +data = MultilingualDataset.load(dataset) +data.set_view(languages=['it']) +data.show_dimensions() +lXtr, lytr = data.training() +lXte, lyte = data.test() + +vect_lXtr = dict() +vectorizer = CountVectorizer() +vect_lXtr['it'] = vectorizer.fit_transform(lXtr['it']) +# print(type(vect_lXtr['it'])) + +corr = vect_lXtr['it'].T.dot(lytr['it']) +# print(corr.shape) +sum_correlated_class = corr.sum(axis=0) +print(len(sum_correlated_class)) +print(sum_correlated_class.max()) + + +w2idx = vectorizer.vocabulary_ +idx2w = {v:k for k,v in w2idx.items()} + +word_tot_corr = corr.sum(axis=1) +print(word_tot_corr.shape) +dict_word_tot_corr = {v:k for k,v in 
enumerate(word_tot_corr)} + +sorted_word_tot_corr = np.sort(word_tot_corr) +sorted_word_tot_corr = sorted_word_tot_corr[len(sorted_word_tot_corr)-200:] + +top_idx = [dict_word_tot_corr[k] for k in sorted_word_tot_corr] +print([idx2w[idx] for idx in top_idx]) +print([elem for elem in top_idx]) +print(corr[8709]) +print('Finished...') \ No newline at end of file diff --git a/src/models/lstm_class.py b/src/models/lstm_class.py index 727f3ce..98424f1 100755 --- a/src/models/lstm_class.py +++ b/src/models/lstm_class.py @@ -8,7 +8,8 @@ from models.helpers import * class RNNMultilingualClassifier(nn.Module): def __init__(self, output_size, hidden_size, lvocab_size, learnable_length, lpretrained=None, - drop_embedding_range=None, drop_embedding_prop=0, post_probabilities=True, only_post=False): + drop_embedding_range=None, drop_embedding_prop=0, post_probabilities=True, only_post=False, + bert_embeddings=False): super(RNNMultilingualClassifier, self).__init__() self.output_size = output_size @@ -16,6 +17,7 @@ class RNNMultilingualClassifier(nn.Module): self.drop_embedding_range = drop_embedding_range self.drop_embedding_prop = drop_embedding_prop self.post_probabilities = post_probabilities + self.bert_embeddings = bert_embeddings assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: wrong range' self.lpretrained_embeddings = nn.ModuleDict() @@ -56,19 +58,24 @@ class RNNMultilingualClassifier(nn.Module): if only_post: self.label = nn.Linear(output_size, output_size) - elif post_probabilities: - self.label = nn.Linear(ff2+output_size, output_size) + elif post_probabilities and not bert_embeddings: + self.label = nn.Linear(ff2 + output_size, output_size) + elif bert_embeddings and not post_probabilities: + self.label = nn.Linear(ff2 + 768, output_size) + elif post_probabilities and bert_embeddings: + self.label = nn.Linear(ff2 + output_size + 768, output_size) else: self.label = nn.Linear(ff2, output_size) - - def forward(self, input, post, lang): + def forward(self, input, post, bert_embed, lang): if self.only_post: doc_embedding = post else: doc_embedding = self.transform(input, lang) if self.post_probabilities: doc_embedding = torch.cat([doc_embedding, post], dim=1) + if self.bert_embeddings: + doc_embedding = torch.cat([doc_embedding, bert_embed], dim=1) logits = self.label(doc_embedding) return logits @@ -83,7 +90,7 @@ class RNNMultilingualClassifier(nn.Module): # c_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda()) # output, (_, _) = self.lstm(input, (h_0, c_0)) output, _ = self.rnn(input, h_0) - output = output[-1,:,:] + output = output[-1, :, :] output = F.relu(self.linear0(output)) output = self.dropout(F.relu(self.linear1(output))) output = self.dropout(F.relu(self.linear2(output))) @@ -94,3 +101,14 @@ class RNNMultilingualClassifier(nn.Module): self.lpretrained_embeddings[l].requires_grad = True self.lpretrained_embeddings[l].weight.requires_grad = True + def get_embeddings(self, input, lang): + batch_size = input.shape[0] + input = embed(self, input, lang) + input = embedding_dropout(input, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, + training=self.training) + input = input.permute(1, 0, 2) + h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size).cuda()) + output, _ = self.rnn(input, h_0) + output = output[-1, :, :] + return output.cpu().detach().numpy() + diff --git a/src/models/mBert.py b/src/models/mBert.py new file mode 100644 index 0000000..e06746c --- /dev/null +++ 
b/src/models/mBert.py @@ -0,0 +1,249 @@ +from copy import deepcopy +import torch +from torch.utils.data import Dataset +from transformers import BertForSequenceClassification, BertTokenizer, AdamW, BertConfig +from sklearn.model_selection import train_test_split +from util.evaluation import * +from time import time + + +def predict(logits, classification_type='multilabel'): + if classification_type == 'multilabel': + prediction = torch.sigmoid(logits) > 0.5 + elif classification_type == 'singlelabel': + prediction = torch.argmax(logits, dim=1).view(-1, 1) + else: + print('unknown classification type') + + return prediction.detach().cpu().numpy() + + +class TrainingDataset(Dataset): + """ + data: dict of lang specific tokenized data + labels: dict of lang specific targets + """ + + def __init__(self, data, labels): + self.langs = data.keys() + self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)} + + for i, lang in enumerate(self.langs): + _data = data[lang]['input_ids'] + _data = np.array(_data) + _labels = labels[lang] + _lang_value = np.full(len(_data), self.lang_ids[lang]) + + if i == 0: + self.data = _data + self.labels = _labels + self.lang_index = _lang_value + else: + self.data = np.vstack((self.data, _data)) + self.labels = np.vstack((self.labels, _labels)) + self.lang_index = np.concatenate((self.lang_index, _lang_value)) + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + x = self.data[idx] + y = self.labels[idx] + lang = self.lang_index[idx] + + return x, torch.tensor(y, dtype=torch.float), lang + + def get_lang_ids(self): + return self.lang_ids + + def get_nclasses(self): + if hasattr(self, 'labels'): + return len(self.labels[0]) + else: + print('Method called before init!') + + +class ExtractorDataset(Dataset): + """ + data: dict of lang specific tokenized data + labels: dict of lang specific targets + """ + + def __init__(self, data): + self.langs = data.keys() + self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)} + + for i, lang in enumerate(self.langs): + _data = data[lang]['input_ids'] + _data = np.array(_data) + _lang_value = np.full(len(_data), self.lang_ids[lang]) + + if i == 0: + self.data = _data + self.lang_index = _lang_value + else: + self.data = np.vstack((self.data, _data)) + self.lang_index = np.concatenate((self.lang_index, _lang_value)) + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + x = self.data[idx] + lang = self.lang_index[idx] + + return x, lang + + def get_lang_ids(self): + return self.lang_ids + + +def get_model(n_out): + print('# Initializing model ...') + model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=n_out) + return model + + +def init_optimizer(model, lr, weight_decay=0): + no_decay = ['bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in model.named_parameters() + if not any(nd in n for nd in no_decay)], + 'weight_decay': weight_decay}, + {'params': [p for n, p in model.named_parameters() + if any(nd in n for nd in no_decay)], + 'weight_decay': weight_decay} + ] + optimizer = AdamW(optimizer_grouped_parameters, lr=lr) + return optimizer + + +def get_lr(optimizer): + for param_group in optimizer.param_groups: + return param_group['lr'] + + +def get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop, max_val, seed): + l_split_va = deepcopy(l_tokenized_tr) + l_split_val_target = {l: [] for l in l_tokenized_tr.keys()} + l_split_tr = 
deepcopy(l_tokenized_tr) + l_split_tr_target = {l: [] for l in l_tokenized_tr.keys()} + + for lang in l_tokenized_tr.keys(): + val_size = int(min(len(l_tokenized_tr[lang]['input_ids']) * val_prop, max_val)) + l_split_tr[lang]['input_ids'], l_split_va[lang]['input_ids'], l_split_tr_target[lang], l_split_val_target[ + lang] = \ + train_test_split(l_tokenized_tr[lang]['input_ids'], l_devel_target[lang], test_size=val_size, + random_state=seed, shuffle=True) + + return l_split_tr, l_split_tr_target, l_split_va, l_split_val_target + + +def do_tokenization(l_dataset, max_len=512, verbose=True): + if verbose: + print('# Starting Tokenization ...') + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') + langs = l_dataset.keys() + l_tokenized = {} + for lang in langs: + l_tokenized[lang] = tokenizer(l_dataset[lang], + truncation=True, + max_length=max_len, + padding='max_length') + return l_tokenized + + +def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile, log_interval=10): + # _dataset_path = opt.dataset.split('/')[-1].split('_') + # dataset_id = _dataset_path[0] + _dataset_path[-1] + dataset_id = 'TODO fix this!' + + loss_history = [] + model.train() + + for idx, (batch, target, lang_idx) in enumerate(train_dataloader): + optim.zero_grad() + out = model(batch.cuda()) + logits = out[0] + loss = criterion(logits, target.cuda()) + loss.backward() + # clip_gradient(model) + optim.step() + loss_history.append(loss.item()) + + if idx % log_interval == 0: + interval_loss = np.mean(loss_history[log_interval:]) + print( + f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}') + + mean_loss = np.mean(interval_loss) + logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit) + return mean_loss + + +def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, measure_prefix): + print('# Validating model ...') + loss_history = [] + model.eval() + langs = lang_ids.keys() + id_2_lang = {v: k for k, v in lang_ids.items()} + predictions = {l: [] for l in langs} + yte_stacked = {l: [] for l in langs} + + for batch, target, lang_idx in test_dataloader: + out = model(batch.cuda()) + logits = out[0] + loss = criterion(logits, target.cuda()).item() + prediction = predict(logits) + loss_history.append(loss) + + # Assigning prediction to dict in predictions and yte_stacked according to lang_idx + for i, pred in enumerate(prediction): + lang_pred = id_2_lang[lang_idx.numpy()[i]] + predictions[lang_pred].append(pred) + yte_stacked[lang_pred].append(target[i].detach().cpu().numpy()) + + ly = {l: np.vstack(yte_stacked[l]) for l in langs} + ly_ = {l: np.vstack(predictions[l]) for l in langs} + l_eval = evaluate(ly, ly_) + metrics = [] + for lang in langs: + macrof1, microf1, macrok, microk = l_eval[lang] + metrics.append([macrof1, microf1, macrok, microk]) + if measure_prefix == 'te': + print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') + Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0) + print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]') + + mean_loss = np.mean(loss_history) + logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit) + logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit) + logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit) 
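# --- Editor's note: a minimal usage sketch for the helpers defined in this new file
# (do_tokenization, TrainingDataset, get_model, init_optimizer). The toy texts and
# targets below are hypothetical; the real pipeline feeds the language-keyed splits
# produced by MultilingualDataset and get_tr_val_split.
#
#   import numpy as np
#   import torch
#   from torch.utils.data import DataLoader
#
#   l_raw = {'en': ['first document', 'second document'],
#            'it': ['primo documento', 'secondo documento']}
#   l_y = {'en': np.array([[1, 0], [0, 1]]), 'it': np.array([[1, 1], [0, 1]])}  # multilabel targets
#
#   l_tokenized = do_tokenization(l_raw, max_len=512)       # lang -> {'input_ids': ..., ...}
#   tr_dataset = TrainingDataset(l_tokenized, l_y)          # stacks languages, keeps a per-row lang id
#   tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True)
#
#   model = get_model(n_out=tr_dataset.get_nclasses()).cuda()
#   optim = init_optimizer(model, lr=1e-5, weight_decay=0.01)
#   criterion = torch.nn.BCEWithLogitsLoss().cuda()         # assumption: multilabel setting
#   for batch, target, lang_idx in tr_dataloader:
#       optim.zero_grad()
#       logits = model(batch.cuda())[0]                     # (logits, ...) tuple, as used in train()/test()
#       loss = criterion(logits, target.cuda())
#       loss.backward()
#       optim.step()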
+ logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit) + logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit) + + return Mf1 + + +def feature_extractor(data, lang_ids, model): + print('# Feature Extractor Mode...') + """ + Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for + the output of each layer) of shape (batch_size, sequence_length, hidden_size) + """ + all_batch_embeddings = {} + id2lang = {v: k for k, v in lang_ids.items()} + with torch.no_grad(): + for batch, lang_idx in data: + # for batch, target, lang_idx in data: + out = model(batch.cuda()) + last_hidden_state = out[1][-1] + batch_embeddings = last_hidden_state[:, 0, :] + for i, l_idx in enumerate(lang_idx.numpy()): + if id2lang[l_idx] not in all_batch_embeddings.keys(): + all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy() + else: + all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]], + batch_embeddings[i].detach().cpu().numpy())) + + return all_batch_embeddings, id2lang diff --git a/src/run_fun_bert_jrc.sh b/src/run_fun_bert_jrc.sh new file mode 100644 index 0000000..fc2e2c3 --- /dev/null +++ b/src/run_fun_bert_jrc.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +#dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run +#logfile=../log/log_FunBert_jrc.csv +# +#runs='0 1 2 3 4' +#for run in $runs +#do +# dataset=$dataset_path$run.pickle +# python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile #--tunable +#done + +dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle +logfile=../log/log_FunBert_fulljrc_static.csv + +python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile \ No newline at end of file diff --git a/src/run_fun_bert_rcv.sh b/src/run_fun_bert_rcv.sh new file mode 100644 index 0000000..e27fe54 --- /dev/null +++ b/src/run_fun_bert_rcv.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +#dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run +#logfile=../log/log_FunBert_rcv_static.csv +# +#runs='0 1 2 3 4' +#for run in $runs +#do +# dataset=$dataset_path$run.pickle +# python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile +#done + +dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle +logfile=../log/log_FunBert_fullrcv_static.csv + +python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile \ No newline at end of file diff --git a/src/run_mbert_jrc.sh b/src/run_mbert_jrc.sh new file mode 100644 index 0000000..08733a4 --- /dev/null +++ b/src/run_mbert_jrc.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +#dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run +#logfile=../log/log_mBert_jrc_NEW.csv +# +#runs='0 1 2 3 4' +#for run in $runs +#do +# dataset=$dataset_path$run.pickle +# python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50 +#done + +logfile=../log/log_mBert_fulljrc.csv +dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle +python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50 \ No newline at end of file diff --git a/src/run_mbert_rcv.sh b/src/run_mbert_rcv.sh 
index 8da53e0..66ffba1 100644 --- a/src/run_mbert_rcv.sh +++ b/src/run_mbert_rcv.sh @@ -1,11 +1,15 @@ #!/usr/bin/env bash -dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run -logfile=../log/log_Mbert_rcv.csv +#dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run +#logfile=../log/log_mBert_rcv_NEW.csv +# +#runs='0 1 2 3 4' +#for run in $runs +#do +# dataset=$dataset_path$run.pickle +# python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50 +#done -runs='1 2 3 4 5 6 7 8 9' -for run in $runs -do - dataset=$dataset_path$run.pickle - python new_mbert.py --dataset $dataset --log-file $logfile --nepochs=5 --weight_decay=0.01 --lr=1e-5 -done +logfile=../log/log_mBert_fullrcv.csv +dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle +python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=30 --patience 3 \ No newline at end of file diff --git a/src/util/common.py b/src/util/common.py index 8a9a880..b6b93dc 100755 --- a/src/util/common.py +++ b/src/util/common.py @@ -1,15 +1,14 @@ +import subprocess import warnings -import time -from sklearn.svm import SVC from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.svm import SVC from sklearn.model_selection import train_test_split from embeddings.supervised import get_supervised_embeddings -from learning.transformers import PosteriorProbabilitiesEmbedder, TfidfVectorizerMultilingual -warnings.filterwarnings("ignore", category=DeprecationWarning) +# from learning.transformers import PosteriorProbabilitiesEmbedder, TfidfVectorizerMultilingual import numpy as np from tqdm import tqdm import torch -from scipy.sparse import vstack, issparse +warnings.filterwarnings("ignore", category=DeprecationWarning) def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary): @@ -161,12 +160,13 @@ class Index: def none_dict(langs): return {l:None for l in langs} + class MultilingualIndex: def __init__(self): #, add_language_trace=False): self.l_index = {} self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) # self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True, max_features=25000) - # self.add_language_trace=add_language_trace + # self.add_language_trace=add_language_trace} def index(self, l_devel_raw, l_devel_target, l_test_raw, l_pretrained_vocabulary): self.langs = sorted(l_devel_raw.keys()) @@ -184,6 +184,8 @@ class MultilingualIndex: for l,index in self.l_index.items(): index.train_val_split(val_prop, max_val, seed=seed) + + def embedding_matrices(self, lpretrained, supervised): lXtr = self.get_lXtr() if supervised else none_dict(self.langs) lYtr = self.l_train_target() if supervised else none_dict(self.langs) @@ -191,52 +193,133 @@ class MultilingualIndex: index.compose_embedding_matrix(lpretrained[l], supervised, lXtr[l], lYtr[l]) self.sup_range = index.wce_range - # experimental... does it make sense to keep track of the language? i.e., to inform the network from which - # language does the data came from... 
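# --- Editor's note: the removed "experimental" comment above describes appending a
# one-hot language indicator to each language's pretrained embedding matrix. A minimal
# sketch of that idea, kept here for reference only (the helper below is hypothetical
# and not part of the current code path):
#
#   import torch
#
#   def add_language_trace(pretrained_embeddings, lang_position, n_langs):
#       # concatenate a (vocabsize x n_langs) one-hot block to a (vocabsize x dim) matrix
#       vocabsize = pretrained_embeddings.shape[0]
#       lang_trace = torch.zeros(size=(vocabsize, n_langs))
#       lang_trace[:, lang_position] = 1
#       return torch.cat([pretrained_embeddings, lang_trace], dim=1)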
- # if self.add_language_trace and pretrained_embeddings is not None: - # print('adding language trace') - # lang_trace = torch.zeros(size=(vocabsize, len(self.langs))) - # lang_trace[:,i]=1 - # pretrained_embeddings = torch.cat([pretrained_embeddings, lang_trace], dim=1) + # TODO circular import with transformers --> when generating posterior prob, we import PosteriorProbabilitiesEmbedder which is defined in transformers + # def posterior_probabilities(self, max_training_docs_by_lang=5000, store_posteriors=False, stored_post=False): + # # choose a maximum of "max_training_docs_by_lang" for training the calibrated SVMs + # timeit = time.time() + # lXtr = {l:Xtr for l,Xtr in self.get_lXtr().items()} + # lYtr = {l:Ytr for l,Ytr in self.l_train_target().items()} + # if not stored_post: + # for l in self.langs: + # n_elements = lXtr[l].shape[0] + # if n_elements > max_training_docs_by_lang: + # choice = np.random.permutation(n_elements)[:max_training_docs_by_lang] + # lXtr[l] = lXtr[l][choice] + # lYtr[l] = lYtr[l][choice] + # + # # train the posterior probabilities embedder + # print('[posteriors] training a calibrated SVM') + # learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto') + # prob_embedder = PosteriorProbabilitiesEmbedder(learner, l2=False) + # prob_embedder.fit(lXtr, lYtr) + # + # # transforms the training, validation, and test sets into posterior probabilities + # print('[posteriors] generating posterior probabilities') + # lPtr = prob_embedder.transform(self.get_lXtr()) + # lPva = prob_embedder.transform(self.get_lXva()) + # lPte = prob_embedder.transform(self.get_lXte()) + # # NB: Check splits indices ! + # if store_posteriors: + # import pickle + # with open('../dumps/posteriors_fulljrc.pkl', 'wb') as outfile: + # pickle.dump([lPtr, lPva, lPte], outfile) + # print(f'Successfully dumped posteriors!') + # else: + # import pickle + # with open('../dumps/posteriors_fulljrc.pkl', 'rb') as infile: + # lPtr, lPva, lPte = pickle.load(infile) + # print(f'Successfully loaded stored posteriors!') + # print(f'[posteriors] done in {time.time() - timeit}') + # return lPtr, lPva, lPte + + def bert_embeddings(self, bert_path, max_len=512, batch_size=64, stored_embeddings=False): + show_gpu('GPU memory before initializing mBert model:') + # TODO: load dumped embeddings? 
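# --- Editor's note: one possible shape for the "load dumped embeddings?" TODO above,
# mirroring the pickle-based caching previously used for the stored posteriors. The
# cache path is hypothetical, and the `stored_embeddings` argument of bert_embeddings()
# is currently unused, so this is only a sketch:
#
#   import os
#   import pickle
#
#   cache_path = '../dumps/bert_doc_embeddings.pkl'
#   if stored_embeddings and os.path.exists(cache_path):
#       with open(cache_path, 'rb') as infile:
#           tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings = pickle.load(infile)
#       return tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings
#   # ...otherwise run the extraction below and, optionally, dump the three dicts:
#   # with open(cache_path, 'wb') as outfile:
#   #     pickle.dump([tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings], outfile)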
+ from main_mbert_extractor import do_tokenization, ExtractorDataset, DataLoader + from transformers import BertConfig, BertForSequenceClassification + + print('[mBERT] generating mBERT doc embeddings') + lXtr_raw = self.get_raw_lXtr() + lXva_raw = self.get_raw_lXva() + lXte_raw = self.get_raw_lXte() + + print('# Tokenizing datasets') + l_tokenized_tr = do_tokenization(lXtr_raw, max_len=max_len, verbose=False) + tr_dataset = ExtractorDataset(l_tokenized_tr) + tr_lang_ids = tr_dataset.lang_ids + tr_dataloader = DataLoader(tr_dataset, batch_size=batch_size, shuffle=False) + + l_tokenized_va = do_tokenization(lXva_raw, max_len=max_len, verbose=False) + va_dataset = ExtractorDataset(l_tokenized_va) + va_lang_ids = va_dataset.lang_ids + va_dataloader = DataLoader(va_dataset, batch_size=batch_size, shuffle=False) + + l_tokenized_te = do_tokenization(lXte_raw, max_len=max_len, verbose=False) + te_dataset = ExtractorDataset(l_tokenized_te) + te_lang_ids = te_dataset.lang_ids + te_dataloader = DataLoader(te_dataset, batch_size=batch_size, shuffle=False) + + num_labels = self.l_index[self.langs[0]].val_target.shape[1] + config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True, + num_labels=num_labels) + model = BertForSequenceClassification.from_pretrained(bert_path, + config=config).cuda() + print('# Extracting document embeddings') + tr_bert_embeddings, id2lang_tr = self.do_bert_embeddings(model, tr_dataloader, tr_lang_ids, verbose=False) + va_bert_embeddings, id2lang_va = self.do_bert_embeddings(model, va_dataloader, va_lang_ids, verbose=False) + te_bert_embeddings, id2lang_te = self.do_bert_embeddings(model, te_dataloader, te_lang_ids, verbose=False) + + show_gpu('GPU memory before after mBert model:') + # Freeing GPU's memory + import gc + del model, tr_dataloader, va_dataloader, te_dataloader + gc.collect() + torch.cuda.empty_cache() + show_gpu('GPU memory after clearing cache:') + return tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings - def posterior_probabilities(self, max_training_docs_by_lang=5000, store_posteriors=False, stored_post=False): - # choose a maximum of "max_training_docs_by_lang" for training the calibrated SVMs - timeit = time.time() - lXtr = {l:Xtr for l,Xtr in self.get_lXtr().items()} - lYtr = {l:Ytr for l,Ytr in self.l_train_target().items()} - if not stored_post: - for l in self.langs: - n_elements = lXtr[l].shape[0] - if n_elements > max_training_docs_by_lang: - choice = np.random.permutation(n_elements)[:max_training_docs_by_lang] - lXtr[l] = lXtr[l][choice] - lYtr[l] = lYtr[l][choice] + @staticmethod + def do_bert_embeddings(model, data, lang_ids, verbose=True): + if verbose: + print('# Feature Extractor Mode...') + all_batch_embeddings = {} + id2lang = {v: k for k, v in lang_ids.items()} + with torch.no_grad(): + for batch, lang_idx in data: + out = model(batch.cuda()) + last_hidden_state = out[1][-1] + batch_embeddings = last_hidden_state[:, 0, :] + for i, l_idx in enumerate(lang_idx.numpy()): + if id2lang[l_idx] not in all_batch_embeddings.keys(): + all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy() + else: + all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]], + batch_embeddings[i].detach().cpu().numpy())) - # train the posterior probabilities embedder - print('[posteriors] training a calibrated SVM') - learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto') - prob_embedder = 
PosteriorProbabilitiesEmbedder(learner, l2=False) - prob_embedder.fit(lXtr, lYtr) + return all_batch_embeddings, id2lang - # transforms the training, validation, and test sets into posterior probabilities - print('[posteriors] generating posterior probabilities') - lPtr = prob_embedder.transform(self.get_lXtr()) - lPva = prob_embedder.transform(self.get_lXva()) - lPte = prob_embedder.transform(self.get_lXte()) - # NB: Check splits indices ! - if store_posteriors: - import pickle - with open('../dumps/posteriors_fulljrc.pkl', 'wb') as outfile: - pickle.dump([lPtr, lPva, lPte], outfile) - print(f'Successfully dumped posteriors!') - else: - import pickle - with open('../dumps/posteriors_fulljrc.pkl', 'rb') as infile: - lPtr, lPva, lPte = pickle.load(infile) - print(f'Successfully loaded stored posteriors!') - print(f'[posteriors] done in {time.time() - timeit}') - return lPtr, lPva, lPte + def get_raw_lXtr(self): + lXtr_raw = {k:[] for k in self.langs} + lYtr_raw = {k: [] for k in self.langs} + for lang in self.langs: + lXtr_raw[lang] = self.l_index[lang].train_raw + lYtr_raw[lang] = self.l_index[lang].train_raw + return lXtr_raw + + def get_raw_lXva(self): + lXva_raw = {k: [] for k in self.langs} + for lang in self.langs: + lXva_raw[lang] = self.l_index[lang].val_raw + + return lXva_raw + + def get_raw_lXte(self): + lXte_raw = {k: [] for k in self.langs} + for lang in self.langs: + lXte_raw[lang] = self.l_index[lang].test_raw + + return lXte_raw def get_lXtr(self): if not hasattr(self, 'lXtr'): @@ -277,6 +360,12 @@ class MultilingualIndex: def l_test_index(self): return {l: index.test_index for l, index in self.l_index.items()} + def l_devel_index(self): + return {l: index.devel_index for l, index in self.l_index.items()} + + def l_devel_target(self): + return {l: index.devel_target for l, index in self.l_index.items()} + def l_train(self): return self.l_train_index(), self.l_train_target() @@ -284,7 +373,6 @@ class MultilingualIndex: return self.l_val_index(), self.l_val_target() - class Batch: def __init__(self, batchsize, batches_per_epoch, languages, lpad, max_pad_length=500): self.batchsize = batchsize @@ -297,7 +385,7 @@ class Batch: def init_offset(self): self.offset = {lang: 0 for lang in self.languages} - def batchify(self, l_index, l_post, llabels): + def batchify(self, l_index, l_post, l_bert, llabels): # TODO: add bert embedding here... 
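# --- Editor's note: with the new `l_bert` argument, batchify() now yields 5-tuples
# (batch, post, bert_emb, target, lang); consumers are expected to forward the mBERT
# vectors to RNNMultilingualClassifier.forward(input, post, bert_embed, lang). A minimal
# consumer sketch (batcher/model/criterion/optim construction omitted, names hypothetical):
#
#   for batch, post, bert_emb, target, lang in batcher.batchify(l_index, l_post, l_bert, l_targets):
#       optim.zero_grad()
#       logits = model(batch, post, bert_emb, lang)
#       loss = criterion(logits, target)
#       loss.backward()
#       optim.step()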
langs = self.languages l_num_samples = {l:len(l_index[l]) for l in langs} @@ -322,6 +410,10 @@ class Batch: if l_post is not None: post = torch.FloatTensor(l_post[lang][batch_slice]).cuda() + bert_emb = None + if l_bert is not None: + bert_emb = torch.FloatTensor(l_bert[lang][batch_slice]).cuda() + batch = pad(batch, pad_index=self.lpad[lang], max_pad_length=self.max_pad_length) batch = torch.LongTensor(batch).cuda() @@ -329,7 +421,7 @@ class Batch: self.offset[lang] = limit - yield batch, post, target, lang + yield batch, post, bert_emb, target, lang def batchify(l_index, l_post, llabels, batchsize, lpad, max_pad_length=500): @@ -384,7 +476,81 @@ def count_parameters(model): return sum(p.numel() for p in model.parameters() if p.requires_grad) +def show_gpu(msg): + """ + ref: https://discuss.pytorch.org/t/access-gpu-memory-usage-in-pytorch/3192/4 + """ + + def query(field): + return (subprocess.check_output( + ['nvidia-smi', f'--query-gpu={field}', + '--format=csv,nounits,noheader'], + encoding='utf-8')) + + def to_int(result): + return int(result.strip().split('\n')[0]) + + used = to_int(query('memory.used')) + total = to_int(query('memory.total')) + pct = used / total + print('\n' + msg, f'{100 * pct:2.1f}% ({used} out of {total})') +class TfidfVectorizerMultilingual: + + def __init__(self, **kwargs): + self.kwargs = kwargs + + def fit(self, lX, ly=None): + self.langs = sorted(lX.keys()) + self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs} + return self + + def transform(self, lX): + return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs} + + def fit_transform(self, lX, ly=None): + return self.fit(lX, ly).transform(lX) + + def vocabulary(self, l=None): + if l is None: + return {l: self.vectorizer[l].vocabulary_ for l in self.langs} + else: + return self.vectorizer[l].vocabulary_ + + def get_analyzer(self, l=None): + if l is None: + return {l: self.vectorizer[l].build_analyzer() for l in self.langs} + else: + return self.vectorizer[l].build_analyzer() +def get_learner(calibrate=False, kernel='linear', C=1): + return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=C, random_state=1, gamma='auto', verbose=False) + + +def get_params(optimc=False): + if not optimc: + return None + c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1] + kernel = 'rbf' + return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}] + + +def get_method_name(dataset, posteriors, supervised, pretrained, mbert, gru, + gruMUSE, gruWCE, agg, allprob): + _id = '-' + _id_conf = [posteriors, supervised, pretrained, mbert, gru] + _id_name = ['X', 'W', 'M', 'B', 'G'] + for i, conf in enumerate(_id_conf): + if conf: + _id += _id_name[i] + _id = _id if not gruMUSE else _id + '_muse' + _id = _id if not gruWCE else _id + '_wce' + _id = _id if not agg else _id + '_mean' + _id = _id if not allprob else _id + '_allprob' + + _dataset_path = dataset.split('/')[-1].split('_') + dataset_id = _dataset_path[0] + _dataset_path[-1] + return _id, dataset_id + diff --git a/src/util/early_stop.py b/src/util/early_stop.py index f2e85aa..7d72cde 100755 --- a/src/util/early_stop.py +++ b/src/util/early_stop.py @@ -1,12 +1,13 @@ #adapted from https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py import torch +from transformers import BertForSequenceClassification from time import time from util.file import create_if_not_exist import warnings class EarlyStopping: - def __init__(self, model, optimizer, patience=20, verbose=True, checkpoint='./checkpoint.pt'): + def 
__init__(self, model, optimizer, patience=20, verbose=True, checkpoint='./checkpoint.pt', is_bert=False): # set patience to 0 or -1 to avoid stopping, but still keeping track of the best value and model parameters self.patience_limit = patience self.patience = patience @@ -18,6 +19,7 @@ class EarlyStopping: self.model = model self.optimizer = optimizer self.STOP = False + self.is_bert = is_bert def __call__(self, watch_score, epoch): @@ -30,12 +32,17 @@ class EarlyStopping: self.stop_time = time() if self.checkpoint: self.print(f'[early-stop] improved, saving model in {self.checkpoint}') - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - torch.save(self.model, self.checkpoint) - # with open(self.checkpoint) - # torch.save({'state_dict': self.model.state_dict(), - # 'optimizer_state_dict': self.optimizer.state_dict()}, self.checkpoint) + if self.is_bert: + print(f'Serializing Huggingface model...') + create_if_not_exist(self.checkpoint) + self.model.save_pretrained(self.checkpoint) + else: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + torch.save(self.model, self.checkpoint) + # with open(self.checkpoint) + # torch.save({'state_dict': self.model.state_dict(), + # 'optimizer_state_dict': self.optimizer.state_dict()}, self.checkpoint) else: self.print(f'[early-stop] improved') self.patience = self.patience_limit @@ -54,7 +61,10 @@ class EarlyStopping: def restore_checkpoint(self): print(f'restoring best model from epoch {self.best_epoch}...') - return torch.load(self.checkpoint) + if self.is_bert: + return BertForSequenceClassification.from_pretrained(self.checkpoint) + else: + return torch.load(self.checkpoint) def print(self, msg): if self.verbose: diff --git a/src/util/evaluation.py b/src/util/evaluation.py index 742ba7b..a4aac5c 100644 --- a/src/util/evaluation.py +++ b/src/util/evaluation.py @@ -5,18 +5,21 @@ from sklearn.metrics import f1_score import numpy as np import time + def evaluation_metrics(y, y_): if len(y.shape)==len(y_.shape)==1 and len(np.unique(y))>2: #single-label raise NotImplementedError()#return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro') else: #the metrics I implemented assume multiclass multilabel classification as binary classifiers return macroF1(y, y_), microF1(y, y_), macroK(y, y_), microK(y, y_) + def soft_evaluation_metrics(y, y_): if len(y.shape)==len(y_.shape)==1 and len(np.unique(y))>2: #single-label raise NotImplementedError()#return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro') else: #the metrics I implemented assume multiclass multilabel classification as binary classifiers return smoothmacroF1(y, y_), smoothmicroF1(y, y_), smoothmacroK(y, y_), smoothmicroK(y, y_) + def evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1): print('evaluation (n_jobs={})'.format(n_jobs)) if n_jobs == 1: @@ -26,6 +29,7 @@ def evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1): evals = Parallel(n_jobs=n_jobs)(delayed(metrics)(ly_true[lang], ly_pred[lang]) for lang in langs) return {lang: evals[i] for i, lang in enumerate(langs)} + def average_results(l_eval, show=True): metrics = [] for lang in l_eval.keys(): @@ -60,6 +64,7 @@ def evaluate_method(polylingual_method, lX, ly, predictor=None, soft=False, retu else: return eval_ + def evaluate_single_lang(polylingual_method, X, y, lang, predictor=None, soft=False): print('prediction for test in a single language') if predictor is None: @@ -72,6 +77,7 @@ def evaluate_single_lang(polylingual_method, X, y, lang, predictor=None, 
soft=Fa ly_ = predictor({lang:X}) return metrics(y, ly_[lang]) + def get_binary_counters(polylingual_method, lX, ly, predictor=None): print('prediction for test') assert set(lX.keys()) == set(ly.keys()), 'inconsistent dictionaries in evaluate' @@ -87,6 +93,7 @@ def get_binary_counters(polylingual_method, lX, ly, predictor=None): evals = Parallel(n_jobs=n_jobs)(delayed(binary_counters)(ly[lang], ly_[lang]) for lang in langs) return {lang: evals[i] for i, lang in enumerate(langs)} + def binary_counters(y, y_): y = np.reshape(y, (-1)) assert y.shape==y_.shape and len(y.shape)==1, 'error, binary vector expected' diff --git a/src/util/parser_options.py b/src/util/parser_options.py new file mode 100644 index 0000000..0e751bd --- /dev/null +++ b/src/util/parser_options.py @@ -0,0 +1,91 @@ +from optparse import OptionParser + +parser = OptionParser(usage="usage: %prog datapath [options]") + +parser.add_option("-d", dest='dataset', type=str, metavar='datasetpath', help=f'path to the pickled dataset') + +parser.add_option("-o", "--output", dest="output", + help="Result file", type=str, default='../log/multiModal_log.csv') + +parser.add_option("-X", "--posteriors", dest="posteriors", action='store_true', + help="Add posterior probabilities to the document embedding representation", default=False) + +parser.add_option("-W", "--supervised", dest="supervised", action='store_true', + help="Add supervised (Word-Class Embeddings) to the document embedding representation", default=False) + +parser.add_option("-M", "--pretrained", dest="pretrained", action='store_true', + help="Add pretrained MUSE embeddings to the document embedding representation", default=False) + +parser.add_option("-B", "--mbert", dest="mbert", action='store_true', + help="Add multilingual Bert (mBert) document embedding representation", default=False) + +parser.add_option('-G', dest='gruViewGenerator', action='store_true', + help="Add document embedding generated via recurrent net (GRU)", default=False) + +parser.add_option("--l2", dest="l2", action='store_true', + help="Activates l2 normalization as a post-processing for the document embedding views", + default=False) + +parser.add_option("--allprob", dest="allprob", action='store_true', + help="All views are generated as posterior probabilities. This affects the supervised and pretrained" + "embeddings, for which a calibrated classifier is generated, which generates the posteriors", + default=False) + +parser.add_option("--feat-weight", dest="feat_weight", + help="Term weighting function to weight the averaged embeddings", type=str, default='tfidf') + +parser.add_option("-w", "--we-path", dest="we_path", + help="Path to the MUSE polylingual word embeddings", default='../embeddings') + +parser.add_option("-s", "--set_c", dest="set_c", type=float, + help="Set the C parameter", default=1) + +parser.add_option("-c", "--optimc", dest="optimc", action='store_true', + help="Optimize hyperparameters", default=False) + +parser.add_option("-j", "--n_jobs", dest="n_jobs", type=int, + help="Number of parallel jobs (default is -1, all)", default=-1) + +parser.add_option("-p", "--pca", dest="max_labels_S", type=int, + help="If smaller than number of target classes, PCA will be applied to supervised matrix. 
", + default=300) + +parser.add_option("-r", "--remove-pc", dest="sif", action='store_true', + help="Remove common component when computing dot product of word embedding matrices", default=False) + +parser.add_option("-z", "--zscore", dest="zscore", action='store_true', + help="Z-score normalize matrices (WCE and MUSE)", default=False) + +parser.add_option("-a", "--agg", dest="agg", action='store_true', + help="Set aggregation function of the common Z-space to average (Default: concatenation)", + default=False) + +# ------------------------------------------------------------------------------------ + +parser.add_option('--hidden', type=int, default=512, metavar='int', + help='hidden lstm size (default: 512)') + +parser.add_option('--sup-drop', type=float, default=0.5, metavar='[0.0, 1.0]', + help='dropout probability for the supervised matrix (default: 0.5)') + +parser.add_option('--tunable', action='store_true', default=False, + help='pretrained embeddings are tunable from the beginning (default False, i.e., static)') + +parser.add_option('--logfile_gru', dest='logfile_gru', default='../log/log_gru_viewgenerator.csv') + +parser.add_option('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)') + +parser.add_option('--force', action='store_true', default=False, + help='do not check if this experiment has already been run') + +parser.add_option('--gruMuse', dest='gruMUSE', action='store_true', default=False, + help='Deploy MUSE embedding as embedding layer of the GRU View Generator') + +parser.add_option('--gruWce', dest='gruWCE', action='store_true', default=False, + help='Deploy WCE embedding as embedding layer of the GRU View Generator') + +parser.add_option('--gru-path', dest='gru_path', default=None, + help='Set the path to a pretrained GRU model (aka, -G view generator)') + +parser.add_option('--bert-path', dest='bert_path', default=None, + help='Set the path to a pretrained mBERT model (aka, -B view generator)') diff --git a/src/util_transformers/StandardizeTransformer.py b/src/util_transformers/StandardizeTransformer.py index a46ffb6..06e633e 100644 --- a/src/util_transformers/StandardizeTransformer.py +++ b/src/util_transformers/StandardizeTransformer.py @@ -9,7 +9,7 @@ class StandardizeTransformer: self.range = range def fit(self, X): - print('fitting Standardizer') + print('fitting Standardizer...') std=np.std(X, axis=self.axis, ddof=1) self.std = np.clip(std, 1e-5, None) self.mean = np.mean(X, axis=self.axis) @@ -21,7 +21,6 @@ class StandardizeTransformer: self.std = ones self.mean = zeros self.yetfit=True - print('done\n') return self def transform(self, X):