baseline multilingual Bert
This commit is contained in:
parent
9fd26e6ff7
commit
a594a84dab
|
@ -1,15 +1,9 @@
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import time
|
import time
|
||||||
# from embeddings.embeddings import WordEmbeddings, StorageEmbeddings
|
|
||||||
from scipy.sparse import issparse
|
from scipy.sparse import issparse
|
||||||
from sklearn.multiclass import OneVsRestClassifier
|
from sklearn.multiclass import OneVsRestClassifier
|
||||||
from sklearn.model_selection import GridSearchCV
|
from sklearn.model_selection import GridSearchCV
|
||||||
# from sklearn.model_selection import KFold
|
|
||||||
from joblib import Parallel, delayed
|
from joblib import Parallel, delayed
|
||||||
# from sklearn.feature_extraction.text import TfidfVectorizer
|
|
||||||
# from util_transformers.StandardizeTransformer import StandardizeTransformer
|
|
||||||
# from sklearn.decomposition import PCA
|
|
||||||
# from models.cnn_class_bu import CNN_pdr
|
|
||||||
|
|
||||||
|
|
||||||
def _sort_if_sparse(X):
|
def _sort_if_sparse(X):
|
||||||
|
@ -40,156 +34,6 @@ class TrivialRejector:
|
||||||
def best_params(self): return {}
|
def best_params(self): return {}
|
||||||
|
|
||||||
|
|
||||||
# class FunnellingPolylingualClassifier:
|
|
||||||
# """
|
|
||||||
# This classifier projects each document d into a language-independent feature space where each dimension fi is the
|
|
||||||
# decision score phi_l(d,ci) of an auxiliary classifier phi_l trained on category ci for documents in language l;
|
|
||||||
# then trains one single classifier for all documents in this space, irrespective of their original language
|
|
||||||
# """
|
|
||||||
# def __init__(self, first_tier_learner, meta_learner, first_tier_parameters=None, meta_parameters=None, folded_projections=1,
|
|
||||||
# calmode='cal', n_jobs=-1):
|
|
||||||
# """
|
|
||||||
# :param first_tier_learner: the learner used in the first-tier level
|
|
||||||
# :param meta_learner: the learner used in the second-tier level
|
|
||||||
# :param first_tier_parameters: parameters for the learner in the doc_projector
|
|
||||||
# :param meta_parameters: parameters for the learner in the z-space
|
|
||||||
# :param folded_projections: if 1 then the model trains the auxiliary classifiers with all training data and
|
|
||||||
# :param calmode: 'cal' to calibrate the base classifiers, 'nocal' to use the decision_function instead, or
|
|
||||||
# :param n_jobs: number of parallel threads
|
|
||||||
# 'sigmoid' to use the sigmoid of the decision_function
|
|
||||||
# projects the data before training the final classifier; if greater than one, the training set is split in as
|
|
||||||
# many folds as indicated, and the projected space is composed by concatenating each fold prediction based on
|
|
||||||
# models trained on the remaining folds. This should increase the generality of the space to unseen data.
|
|
||||||
# """
|
|
||||||
# assert folded_projections>0, "positive number of folds expected"
|
|
||||||
# assert calmode in ['cal','nocal','sigmoid'], 'unknown calmode'
|
|
||||||
# assert calmode!='cal' or first_tier_learner.probability, 'calmode=cal requires the learner to have probability=True'
|
|
||||||
#
|
|
||||||
# self.fist_tier_learner = first_tier_learner
|
|
||||||
# self.meta_learner = meta_learner
|
|
||||||
# self.fist_tier_parameters=first_tier_parameters
|
|
||||||
# self.meta_parameters = meta_parameters
|
|
||||||
# self.doc_projector = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs)
|
|
||||||
# self.doc_projector_bu = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs)
|
|
||||||
# self.folded_projections = folded_projections
|
|
||||||
# self.n_jobs = n_jobs
|
|
||||||
# self.calmode = calmode
|
|
||||||
#
|
|
||||||
# def _projection(self, doc_projector, lX):
|
|
||||||
# """
|
|
||||||
# Decides the projection function to be applied; predict_proba if the base classifiers are calibrated or
|
|
||||||
# decision_function if otherwise
|
|
||||||
# :param doc_projector: the document projector (a NaivePolylingualClassifier)
|
|
||||||
# :param lX: {lang:matrix} to train
|
|
||||||
# :return: the projection, applied with predict_proba or decision_function
|
|
||||||
# """
|
|
||||||
# if self.calmode=='cal':
|
|
||||||
# return doc_projector.predict_proba(lX)
|
|
||||||
# else:
|
|
||||||
# l_decision_scores = doc_projector.decision_function(lX)
|
|
||||||
# if self.calmode=='sigmoid':
|
|
||||||
# def sigmoid(x): return 1 / (1 + np.exp(-x))
|
|
||||||
# for lang in l_decision_scores.keys():
|
|
||||||
# l_decision_scores[lang] = sigmoid(l_decision_scores[lang])
|
|
||||||
# return l_decision_scores
|
|
||||||
#
|
|
||||||
# def _get_zspace(self, lXtr, lYtr, lXproj=None, lYproj=None):
|
|
||||||
# """
|
|
||||||
# Produces the vector space of posterior probabilities (in case the first-tier is calibrated) or of
|
|
||||||
# decision scores (if otherwise). This space is here named zspace.
|
|
||||||
# :param lXtr: {lang:matrix} to train
|
|
||||||
# :param lYtr: {lang:labels} to train
|
|
||||||
# :param lXproj: {lang:matrix} to project (if None, then projects the lXtr)
|
|
||||||
# :param lYproj: {lang:labels} to stack in the same order (if None, then lYtr will be stacked)
|
|
||||||
# :return: the projection of lXproj documents into the Z-space defined by the confidence scores of language-specific
|
|
||||||
# models trained on lXtr, and the lYproj labels stacked consistently
|
|
||||||
# """
|
|
||||||
# repair_empty_folds = True
|
|
||||||
# if lXproj is None and lYproj is None:
|
|
||||||
# lXproj, lYproj = lXtr, lYtr
|
|
||||||
# repair_empty_folds = False
|
|
||||||
#
|
|
||||||
# print('fitting the projectors... {}'.format(lXtr.keys()))
|
|
||||||
# self.doc_projector.fit(lXtr, lYtr)
|
|
||||||
#
|
|
||||||
# print('projecting the documents')
|
|
||||||
# langs = list(lXtr.keys())
|
|
||||||
# lZ = self._projection(self.doc_projector, lXproj)
|
|
||||||
#
|
|
||||||
# # if repair_empty_folds: #empty folds are replaced by the posterior probabilities generated by the non-folded version
|
|
||||||
# empty_categories = self.doc_projector.empty_categories
|
|
||||||
# lZ_bu = self._projection(self.doc_projector_bu, lXproj)
|
|
||||||
#
|
|
||||||
# for lang in langs:
|
|
||||||
# repair = empty_categories[lang]
|
|
||||||
# lZ[lang][:,repair] = lZ_bu[lang][:,repair]
|
|
||||||
#
|
|
||||||
# Z = np.vstack([lZ[lang] for lang in langs]) # Z is the language independent space
|
|
||||||
# zy = np.vstack([lYproj[lang] for lang in langs])
|
|
||||||
# return Z, zy
|
|
||||||
#
|
|
||||||
# def _get_zspace_folds(self, lX, ly):
|
|
||||||
# self.doc_projector_bu.fit(lX, ly)
|
|
||||||
#
|
|
||||||
# print('split of {} folds'.format(self.folded_projections))
|
|
||||||
# skf = KFold(n_splits=self.folded_projections, shuffle=True)
|
|
||||||
#
|
|
||||||
# Z, zy = [], []
|
|
||||||
# lfold = {lang: list(skf.split(lX[lang], ly[lang])) for lang in lX.keys()}
|
|
||||||
# for fold in range(self.folded_projections):
|
|
||||||
# print('fitting the projectors ({}/{})...'.format(fold + 1, self.folded_projections))
|
|
||||||
# lfoldXtr, lfoldYtr = {}, {}
|
|
||||||
# lfoldXte, lfoldYte = {}, {}
|
|
||||||
# for lang in lX.keys():
|
|
||||||
# train, test = lfold[lang][fold]
|
|
||||||
# lfoldXtr[lang] = lX[lang][train]
|
|
||||||
# lfoldYtr[lang] = ly[lang][train]
|
|
||||||
# lfoldXte[lang] = lX[lang][test]
|
|
||||||
# lfoldYte[lang] = ly[lang][test]
|
|
||||||
# Zfold, zYfold = self._get_zspace(lfoldXtr, lfoldYtr, lfoldXte, lfoldYte)
|
|
||||||
# Z.append(Zfold)
|
|
||||||
# zy.append(zYfold)
|
|
||||||
# # compose the Z-space as the union of all folded predictions
|
|
||||||
# Z = np.vstack(Z)
|
|
||||||
# zy = np.vstack(zy)
|
|
||||||
# # refit the document projector with all examples to have a more reliable projector for test data
|
|
||||||
# self.doc_projector = self.doc_projector_bu
|
|
||||||
# return Z, zy
|
|
||||||
#
|
|
||||||
# def fit(self, lX, ly, lZ=None, lzy=None):
|
|
||||||
# tinit = time.time()
|
|
||||||
# Z, zy = self._get_zspace(lX, ly) if self.folded_projections == 1 else self._get_zspace_folds(lX, ly)
|
|
||||||
#
|
|
||||||
# #experimental: adds the posterior probabilities (computed outside) to the meta-classifier
|
|
||||||
# if lZ is not None and lzy is not None:
|
|
||||||
# zlangs = list(lZ.keys())
|
|
||||||
# Z = np.vstack((Z, *[lZ[l] for l in zlangs]))
|
|
||||||
# zy = np.vstack((zy, *[lzy[l] for l in zlangs]))
|
|
||||||
#
|
|
||||||
# print('fitting the Z-space of shape={}'.format(Z.shape))
|
|
||||||
# self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters, n_jobs=self.n_jobs)
|
|
||||||
# self.model.fit(Z, zy)
|
|
||||||
# self.time = time.time() - tinit
|
|
||||||
#
|
|
||||||
# return self
|
|
||||||
#
|
|
||||||
# def predict(self, lX, lZ=None):
|
|
||||||
# """
|
|
||||||
# :param lX: a dictionary {language_label: X csr-matrix}
|
|
||||||
# :param lZ: a dictionary {language_label: Z matrix}; if specified, concats this representation
|
|
||||||
# :return: a dictionary of predictions
|
|
||||||
# """
|
|
||||||
# lZ_ = self._projection(self.doc_projector, lX)
|
|
||||||
# if lZ is not None:
|
|
||||||
# lZ_ = {**lZ_, **lZ}
|
|
||||||
# return _joblib_transform_multiling(self.model.predict, lZ_, n_jobs=self.n_jobs)
|
|
||||||
#
|
|
||||||
# def best_params(self):
|
|
||||||
# params = self.doc_projector.best_params()
|
|
||||||
# params['meta'] = self.model.best_params()
|
|
||||||
# return params
|
|
||||||
|
|
||||||
|
|
||||||
class NaivePolylingualClassifier:
|
class NaivePolylingualClassifier:
|
||||||
"""
|
"""
|
||||||
Is a mere set of independent MonolingualClassifiers
|
Is a mere set of independent MonolingualClassifiers
|
||||||
|
|
Loading…
Reference in New Issue