From a594a84dab190da2931444d17a56e7e4fa8cbaaa Mon Sep 17 00:00:00 2001 From: andrea Date: Mon, 27 Jul 2020 12:00:33 +0200 Subject: [PATCH] baseline multilingual Bert --- src/learning/learners.py | 156 --------------------------------------- 1 file changed, 156 deletions(-) diff --git a/src/learning/learners.py b/src/learning/learners.py index 0559416..8d82b48 100644 --- a/src/learning/learners.py +++ b/src/learning/learners.py @@ -1,15 +1,9 @@ import numpy as np import time -# from embeddings.embeddings import WordEmbeddings, StorageEmbeddings from scipy.sparse import issparse from sklearn.multiclass import OneVsRestClassifier from sklearn.model_selection import GridSearchCV -# from sklearn.model_selection import KFold from joblib import Parallel, delayed -# from sklearn.feature_extraction.text import TfidfVectorizer -# from util_transformers.StandardizeTransformer import StandardizeTransformer -# from sklearn.decomposition import PCA -# from models.cnn_class_bu import CNN_pdr def _sort_if_sparse(X): @@ -40,156 +34,6 @@ class TrivialRejector: def best_params(self): return {} -# class FunnellingPolylingualClassifier: -# """ -# This classifier projects each document d into a language-independent feature space where each dimension fi is the -# decision score phi_l(d,ci) of an auxiliary classifier phi_l trained on category ci for documents in language l; -# then trains one single classifier for all documents in this space, irrespective of their originary language -# """ -# def __init__(self, first_tier_learner, meta_learner, first_tier_parameters=None, meta_parameters=None, folded_projections=1, -# calmode='cal', n_jobs=-1): -# """ -# :param first_tier_learner: the learner used in the first-tier level -# :param meta_learner: the learner used in the second-tier level -# :param first_tier_parameters: parameters for the learner in the doc_projector -# :param meta_parameters: parameters for the learner in the z-space -# :param folded_projections: if 1 then the model trains the auxiliar classifiers with all training data and -# :param calmode: 'cal' to calibrate the base classifiers, 'nocal' to use the decision_function instead, or -# :param n_jobs: number of parallel threads -# 'sigmoid' to use the sigmoid of the decision_function -# projects the data before training the final classifier; if greater than one, the training set is split in as -# many folds as indicated, and the projected space is composed by concatenating each fold prediction based on -# models trained on the remaining folds. This should increase the generality of the space to unseen data. -# """ -# assert folded_projections>0, "positive number of folds expected" -# assert calmode in ['cal','nocal','sigmoid'], 'unknown calmode' -# assert calmode!='cal' or first_tier_learner.probability, 'calmode=cal requires the learner to have probability=True' -# -# self.fist_tier_learner = first_tier_learner -# self.meta_learner = meta_learner -# self.fist_tier_parameters=first_tier_parameters -# self.meta_parameters = meta_parameters -# self.doc_projector = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs) -# self.doc_projector_bu = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs) -# self.folded_projections = folded_projections -# self.n_jobs = n_jobs -# self.calmode = calmode -# -# def _projection(self, doc_projector, lX): -# """ -# Decides the projection function to be applied; predict_proba if the base classifiers are calibrated or -# decision_function if otherwise -# :param doc_projector: the document projector (a NaivePolylingualClassifier) -# :param lX: {lang:matrix} to train -# :return: the projection, applied with predict_proba or decision_function -# """ -# if self.calmode=='cal': -# return doc_projector.predict_proba(lX) -# else: -# l_decision_scores = doc_projector.decision_function(lX) -# if self.calmode=='sigmoid': -# def sigmoid(x): return 1 / (1 + np.exp(-x)) -# for lang in l_decision_scores.keys(): -# l_decision_scores[lang] = sigmoid(l_decision_scores[lang]) -# return l_decision_scores -# -# def _get_zspace(self, lXtr, lYtr, lXproj=None, lYproj=None): -# """ -# Produces the vector space of posterior probabilities (in case the first-tier is calibrated) or of -# decision scores (if otherwise). This space is here named zspace. -# :param lXtr: {lang:matrix} to train -# :param lYtr: {lang:labels} to train -# :param lXproj: {lang:matrix} to project (if None, then projects the lXtr) -# :param lYproj: {lang:labels} to stack in the same order (if None, then lYtr will be stacked) -# :return: the projection of lXproj documents into the Z-space defined by the confidence scores of language-specific -# models trained on lXtr, and the lYproj labels stacked consistently -# """ -# repair_empty_folds = True -# if lXproj is None and lYproj is None: -# lXproj, lYproj = lXtr, lYtr -# repair_empty_folds = False -# -# print('fitting the projectors... {}'.format(lXtr.keys())) -# self.doc_projector.fit(lXtr, lYtr) -# -# print('projecting the documents') -# langs = list(lXtr.keys()) -# lZ = self._projection(self.doc_projector, lXproj) -# -# # if repair_empty_folds: #empty folds are replaced by the posterior probabilities generated by the non-folded version -# empty_categories = self.doc_projector.empty_categories -# lZ_bu = self._projection(self.doc_projector_bu, lXproj) -# -# for lang in langs: -# repair = empty_categories[lang] -# lZ[lang][:,repair] = lZ_bu[lang][:,repair] -# -# Z = np.vstack([lZ[lang] for lang in langs]) # Z is the language independent space -# zy = np.vstack([lYproj[lang] for lang in langs]) -# return Z, zy -# -# def _get_zspace_folds(self, lX, ly): -# self.doc_projector_bu.fit(lX, ly) -# -# print('split of {} folds'.format(self.folded_projections)) -# skf = KFold(n_splits=self.folded_projections, shuffle=True) -# -# Z, zy = [], [] -# lfold = {lang: list(skf.split(lX[lang], ly[lang])) for lang in lX.keys()} -# for fold in range(self.folded_projections): -# print('fitting the projectors ({}/{})...'.format(fold + 1, self.folded_projections)) -# lfoldXtr, lfoldYtr = {}, {} -# lfoldXte, lfoldYte = {}, {} -# for lang in lX.keys(): -# train, test = lfold[lang][fold] -# lfoldXtr[lang] = lX[lang][train] -# lfoldYtr[lang] = ly[lang][train] -# lfoldXte[lang] = lX[lang][test] -# lfoldYte[lang] = ly[lang][test] -# Zfold, zYfold = self._get_zspace(lfoldXtr, lfoldYtr, lfoldXte, lfoldYte) -# Z.append(Zfold) -# zy.append(zYfold) -# # compose the Z-space as the union of all folded predictions -# Z = np.vstack(Z) -# zy = np.vstack(zy) -# # refit the document projector with all examples to have a more reliable projector for test data -# self.doc_projector = self.doc_projector_bu -# return Z, zy -# -# def fit(self, lX, ly, lZ=None, lzy=None): -# tinit = time.time() -# Z, zy = self._get_zspace(lX, ly) if self.folded_projections == 1 else self._get_zspace_folds(lX, ly) -# -# #experimental: adds the posterior probabilities (computed outside) to the meta-classifier -# if lZ is not None and lzy is not None: -# zlangs = list(lZ.keys()) -# Z = np.vstack((Z, *[lZ[l] for l in zlangs])) -# zy = np.vstack((zy, *[lzy[l] for l in zlangs])) -# -# print('fitting the Z-space of shape={}'.format(Z.shape)) -# self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters, n_jobs=self.n_jobs) -# self.model.fit(Z, zy) -# self.time = time.time() - tinit -# -# return self -# -# def predict(self, lX, lZ=None): -# """ -# :param lX: a dictionary {language_label: X csr-matrix} -# :param lZ: a dictionary {language_label: Z matrix}; if specified, concats this representation -# :return: a dictionary of predictions -# """ -# lZ_ = self._projection(self.doc_projector, lX) -# if lZ is not None: -# lZ_ = {**lZ_, **lZ} -# return _joblib_transform_multiling(self.model.predict, lZ_, n_jobs=self.n_jobs) -# -# def best_params(self): -# params = self.doc_projector.best_params() -# params['meta'] = self.model.best_params() -# return params - - class NaivePolylingualClassifier: """ Is a mere set of independet MonolingualClassifiers