model selection in two levels: classifier-oriented and quantifier-oriented

Alejandro Moreo Fernandez 2023-11-16 14:29:34 +01:00
parent e870d798b7
commit 513c78f1f3
4 changed files with 296 additions and 147 deletions

View File

@@ -2,7 +2,9 @@ import quapy as qp
from quapy.protocol import APP
from quapy.method.aggregative import DMy
from sklearn.linear_model import LogisticRegression
from examples.comparing_gridsearch import OLD_GridSearchQ
import numpy as np
from time import time
"""
In this example, we show how to perform model selection on a DistributionMatching quantifier.
@@ -15,35 +17,44 @@ qp.environ['N_JOBS'] = -1
training, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
# The model will be returned by the fit method of GridSearchQ.
# Every combination of hyper-parameters will be evaluated by confronting the
# quantifier thus configured against a series of samples generated by means
# of a sample generation protocol. For this example, we will use the
# artificial-prevalence protocol (APP), which generates samples whose prevalence
# values are drawn from a grid spanning the entire range (e.g., [0, 0.1, 0.2, ..., 1]).
# We devote 30% of the dataset to this exploration.
training, validation = training.split_stratified(train_prop=0.7)
protocol = APP(validation)
with qp.util.temp_seed(0):
# We will explore a classification-dependent hyper-parameter (e.g., the 'C'
# hyper-parameter of LogisticRegression) and a quantification-dependent hyper-parameter
# (e.g., the number of bins in a DistributionMatching quantifier).
# Classifier-dependent hyper-parameters have to be marked with a prefix "classifier__"
# in order to let the quantifier know this hyper-parameter belongs to its underlying
# classifier.
param_grid = {
'classifier__C': np.logspace(-3,3,7),
'nbins': [8, 16, 32, 64],
}
# The model will be returned by the fit method of GridSearchQ.
# Every combination of hyper-parameters will be evaluated by confronting the
# quantifier thus configured against a series of samples generated by means
# of a sample generation protocol. For this example, we will use the
# artificial-prevalence protocol (APP), which generates samples whose prevalence
# values are drawn from a grid spanning the entire range (e.g., [0, 0.1, 0.2, ..., 1]).
# We devote 30% of the dataset to this exploration.
training, validation = training.split_stratified(train_prop=0.7)
protocol = APP(validation)
model = qp.model_selection.GridSearchQ(
model=model,
param_grid=param_grid,
protocol=protocol,
error='mae', # the error to optimize is the MAE (a quantification-oriented loss)
refit=True, # retrain on the whole labelled set once done
verbose=True # show information as the process goes on
).fit(training)
# We will explore a classification-dependent hyper-parameter (e.g., the 'C'
# hyper-parameter of LogisticRegression) and a quantification-dependent hyper-parameter
# (e.g., the number of bins in a DistributionMatching quantifier).
# Classifier-dependent hyper-parameters have to be marked with a prefix "classifier__"
# in order to let the quantifier know this hyper-parameter belongs to its underlying
# classifier.
param_grid = {
'classifier__C': np.logspace(-3,3,7),
'classifier__class_weight': ['balanced', None],
'nbins': [8, 16, 32, 64],
}
tinit = time()
# model = OLD_GridSearchQ(
model = qp.model_selection.GridSearchQ(
model=model,
param_grid=param_grid,
protocol=protocol,
error='mae', # the error to optimize is the MAE (a quantification-oriented loss)
refit=False, # retrain on the whole labelled set once done
verbose=True # show information as the process goes on
).fit(training)
tend = time()
print(f'model selection ended: best hyper-parameters={model.best_params_}')
model = model.best_model_
@@ -53,5 +64,5 @@ model = model.best_model_
mae_score = qp.evaluation.evaluate(model, protocol=APP(test), error_metric='mae')
print(f'MAE={mae_score:.5f}')
print(f'model selection took {tend-tinit}s')
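The "classifier__" prefix convention used in the grid above mirrors scikit-learn's nested-parameter syntax: prefixed keys are stripped and routed to the wrapped classifier, while unprefixed keys stay on the quantifier. A minimal sketch of that routing (the splitting code below is illustrative, not QuaPy's actual implementation):

from sklearn.linear_model import LogisticRegression

params = {'classifier__C': 1.0, 'classifier__class_weight': 'balanced', 'nbins': 16}

# prefixed keys go to the classifier, the rest to the quantifier
cls_params = {k.split('__', 1)[1]: v for k, v in params.items() if k.startswith('classifier__')}
q_params = {k: v for k, v in params.items() if not k.startswith('classifier__')}

classifier = LogisticRegression(**cls_params)  # receives C=1.0, class_weight='balanced'
# q_params == {'nbins': 16} would be set on the quantifier itself via set_params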

View File

@@ -37,7 +37,20 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
and :meth:`aggregate`.
"""
def fit(self, data: LabelledCollection, fit_classifier=True):
val_split_ = None
@property
def val_split(self):
return self.val_split_
@val_split.setter
def val_split(self, val_split):
if isinstance(val_split, LabelledCollection):
print('warning: setting val_split with a LabelledCollection will be inefficient in '
'model selection. Rather, pass the LabelledCollection at fit time')
self.val_split_ = val_split
def fit(self, data: LabelledCollection, fit_classifier=True, val_split=None):
"""
Trains the aggregative quantifier. This comes down to training a classifier and an aggregation function.
@@ -46,7 +59,7 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
learner has been trained outside the quantifier.
:return: self
"""
classif_predictions = self.classifier_fit_predict(data, fit_classifier)
classif_predictions = self.classifier_fit_predict(data, fit_classifier, predict_on=val_split)
self.aggregation_fit(classif_predictions, data)
return self
@@ -69,6 +82,9 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
self._check_classifier(adapt_if_necessary=(self._classifier_method() == 'predict_proba'))
if predict_on is None:
predict_on = self.val_split
if predict_on is None:
if fit_classifier:
self.classifier.fit(*data.Xy)
@@ -228,7 +244,6 @@ class AggregativeCrispQuantifier(AggregativeQuantifier, ABC):
:return: the string "predict", i.e., the standard method name for scikit-learn hard predictions
"""
print('using predict')
return 'predict'
def _check_classifier(self, adapt_if_necessary=False):
@@ -264,7 +279,6 @@ class AggregativeSoftQuantifier(AggregativeQuantifier, ABC):
:return: the string "predict_proba", i.e., the standard method name for scikit-learn soft predictions
"""
print('using predict_proba')
return 'predict_proba'
def _check_classifier(self, adapt_if_necessary=False):
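The two abstract subclasses differ only in the classification output they aggregate: crisp quantifiers consume hard labels ('predict'), soft quantifiers consume posterior probabilities ('predict_proba'). A toy sketch of the resulting difference, in the style of CC vs PCC (both defined further down in this file):

import numpy as np

posteriors = np.asarray([[0.9, 0.1], [0.4, 0.6], [0.2, 0.8]])
# crisp (CC-like): count hard predictions
crisp_prev = np.bincount(posteriors.argmax(axis=1), minlength=2) / len(posteriors)  # [1/3, 2/3]
# soft (PCC-like): average the posteriors
soft_prev = posteriors.mean(axis=0)  # [0.5, 0.5]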
@@ -289,35 +303,35 @@ class AggregativeSoftQuantifier(AggregativeQuantifier, ABC):
class CorrectionbasedAggregativeQuantifier(AggregativeQuantifier):
"""
Abstract class for quantification methods that carry out an adjustment (or correction) that requires,
at training time, the predictions to be issued in validation mode, i.e., on a set of held-out data that
is not the training set. There are three ways to specify this held-out set, depending on how
the internal parameter `val_split` is set, namely, (i) a float in (0, 1) indicating the proportion
of training instances that should be devoted to validation, or (ii) an integer indicating the
number of folds to consider in a k-fold cross-validation mode, or (iii) the specific set of data to
use for validation.
"""
@property
def val_split(self):
return self.val_split_
@val_split.setter
def val_split(self, val_split):
if isinstance(val_split, LabelledCollection):
print('warning: setting val_split with a LabelledCollection will be inefficient in '
'model selection. Rather, pass the LabelledCollection at fit time')
self.val_split_ = val_split
def fit(self, data: LabelledCollection, fit_classifier=True, predict_on=None):
print('method from CorrectionbasedAggregativeQuantifier')
if predict_on is None:
predict_on = self.val_split
classif_predictions = self.classifier_fit_predict(data, fit_classifier, predict_on)
self.aggregation_fit(classif_predictions, data)
return self
# class CorrectionbasedAggregativeQuantifier(AggregativeQuantifier):
# """
# Abstract class for quantification methods that carry out an adjustment (or correction) that requires,
# at training time, the predictions to be issued in validation mode, i.e., on a set of held-out data that
# is not the training set. There are three ways to specify this held-out set, depending on how
# the internal parameter `val_split` is set, namely, (i) a float in (0, 1) indicating the proportion
# of training instances that should be devoted to validation, or (ii) an integer indicating the
# number of folds to consider in a k-fold cross-validation mode, or (iii) the specific set of data to
# use for validation.
# """
#
# @property
# def val_split(self):
# return self.val_split_
#
# @val_split.setter
# def val_split(self, val_split):
# if isinstance(val_split, LabelledCollection):
# print('warning: setting val_split with a LabelledCollection will be inefficient in '
# 'model selection. Rather, pass the LabelledCollection at fit time')
# self.val_split_ = val_split
#
# def fit(self, data: LabelledCollection, fit_classifier=True, predict_on=None):
# print('method from CorrectionbasedAggregativeQuantifier')
# if predict_on is None:
# predict_on = self.val_split
# classif_predictions = self.classifier_fit_predict(data, fit_classifier, predict_on)
# self.aggregation_fit(classif_predictions, data)
# return self
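The docstring above lists three ways to specify val_split; a sketch of the corresponding call styles (DMy, defined later in this file, is used here only as a representative correction-based method):

from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import DMy

DMy(LogisticRegression(), val_split=0.4)  # (i) float: hold out 40% of the training set
DMy(LogisticRegression(), val_split=5)    # (ii) int: 5-fold cross-validation predictions (the new default)
# (iii) a specific LabelledCollection, preferably passed at fit time to avoid the
# model-selection inefficiency that the setter above warns about:
# quantifier.fit(training, val_split=my_validation_set)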
@@ -352,7 +366,7 @@ class CC(AggregativeCrispQuantifier):
return F.prevalence_from_labels(classif_predictions, self.classes_)
class ACC(AggregativeCrispQuantifier, CorrectionbasedAggregativeQuantifier):
class ACC(AggregativeCrispQuantifier):
"""
`Adjusted Classify & Count <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_,
the "adjusted" variant of :class:`CC`, that corrects the predictions of CC
@@ -447,7 +461,7 @@ class PCC(AggregativeSoftQuantifier):
return F.prevalence_from_probabilities(classif_posteriors, binarize=False)
class PACC(AggregativeSoftQuantifier, CorrectionbasedAggregativeQuantifier):
class PACC(AggregativeSoftQuantifier):
"""
`Probabilistic Adjusted Classify & Count <https://ieeexplore.ieee.org/abstract/document/5694031>`_,
the probabilistic variant of ACC that relies on the posterior probabilities returned by a probabilistic classifier.
@@ -570,7 +584,7 @@ class EMQ(AggregativeSoftQuantifier):
return qs, ps
class EMQrecalib(AggregativeSoftQuantifier, CorrectionbasedAggregativeQuantifier):
class EMQrecalib(AggregativeSoftQuantifier):
"""
`Expectation Maximization for Quantification <https://ieeexplore.ieee.org/abstract/document/6789744>`_ (EMQ),
aka `Saerens-Latinne-Decaestecker` (SLD) algorithm, with the heuristics proposed by
@@ -657,7 +671,7 @@ class EMQrecalib(AggregativeSoftQuantifier, CorrectionbasedAggregativeQuantifier
return posteriors
class HDy(AggregativeSoftQuantifier, BinaryQuantifier, CorrectionbasedAggregativeQuantifier):
class HDy(AggregativeSoftQuantifier, BinaryQuantifier):
"""
`Hellinger Distance y <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDy).
HDy is a probabilistic method for training binary quantifiers, that models quantification as the problem of
@@ -844,7 +858,7 @@ class SMM(AggregativeSoftQuantifier, BinaryQuantifier):
return np.asarray([1 - class1_prev, class1_prev])
class DMy(AggregativeSoftQuantifier, CorrectionbasedAggregativeQuantifier):
class DMy(AggregativeSoftQuantifier):
"""
Generic Distribution Matching quantifier for binary or multiclass quantification based on the space of posterior
probabilities. This implementation takes the number of bins, the divergence, and the possibility to work on CDF
@@ -865,7 +879,7 @@ class DMy(AggregativeSoftQuantifier, CorrectionbasedAggregativeQuantifier):
:param n_jobs: number of parallel workers (default None)
"""
def __init__(self, classifier, val_split=0.4, nbins=8, divergence: Union[str, Callable]='HD',
def __init__(self, classifier, val_split=5, nbins=8, divergence: Union[str, Callable]='HD',
cdf=False, search='optim_minimize', n_jobs=None):
self.classifier = classifier
self.val_split = val_split
@@ -875,15 +889,15 @@ class DMy(AggregativeSoftQuantifier, CorrectionbasedAggregativeQuantifier):
self.search = search
self.n_jobs = n_jobs
@classmethod
def HDy(cls, classifier, val_split=0.4, n_jobs=None):
from quapy.method.meta import MedianEstimator
# @classmethod
# def HDy(cls, classifier, val_split=0.4, n_jobs=None):
# from quapy.method.meta import MedianEstimator
#
# hdy = DMy(classifier=classifier, val_split=val_split, search='linear_search', divergence='HD')
# hdy = AggregativeMedianEstimator(hdy, param_grid={'nbins': np.linspace(10, 110, 11).astype(int)}, n_jobs=n_jobs)
# return hdy
hdy = DMy(classifier=classifier, val_split=val_split, search='linear_search', divergence='HD')
hdy = AggregativeMedianEstimator(hdy, param_grid={'nbins': np.linspace(10, 110, 11).astype(int)}, n_jobs=n_jobs)
return hdy
def __get_distributions(self, posteriors):
def _get_distributions(self, posteriors):
histograms = []
post_dims = posteriors.shape[1]
if post_dims == 2:
@@ -919,9 +933,10 @@ class DMy(AggregativeSoftQuantifier, CorrectionbasedAggregativeQuantifier):
n_classes = len(self.classifier.classes_)
self.validation_distribution = qp.util.parallel(
func=self.__get_distributions,
func=self._get_distributions,
args=[posteriors[true_labels==cat] for cat in range(n_classes)],
n_jobs=self.n_jobs
n_jobs=self.n_jobs,
backend='threading'
)
def aggregate(self, posteriors: np.ndarray):
@@ -935,7 +950,7 @@ class DMy(AggregativeSoftQuantifier, CorrectionbasedAggregativeQuantifier):
:param posteriors: posterior probabilities of the instances in the sample
:return: a vector of class prevalence estimates
"""
test_distribution = self.__get_distributions(posteriors)
test_distribution = self._get_distributions(posteriors)
divergence = get_divergence(self.divergence)
n_classes, n_channels, nbins = self.validation_distribution.shape
def loss(prev):
@@ -1449,13 +1464,10 @@ class AggregativeMedianEstimator(BinaryQuantifier):
def _delayed_fit_aggregation(self, args):
with qp.util.temp_seed(self.random_state):
print('\tenter job')
((model, predictions), q_params), training = args
model = deepcopy(model)
print('fitaggr', model, predictions, len(predictions), print(self.training))
model.set_params(**q_params)
model.aggregation_fit(predictions, training)
print('\texit job')
return model
@@ -1473,7 +1485,8 @@ class AggregativeMedianEstimator(BinaryQuantifier):
((params, training, kwargs) for params in cls_configs),
seed=qp.environ.get('_R_SEED', None),
n_jobs=self.n_jobs,
asarray=False
asarray=False,
backend='threading'
)
else:
print('only 1')
@@ -1482,27 +1495,13 @@ class AggregativeMedianEstimator(BinaryQuantifier):
predictions = model.classifier_fit_predict(training, **kwargs)
models_preds = [(model, predictions)]
self.training = training
self.models = []
print('WITHOUT PARALLEL JOBS')
for ((model, predictions), q_params) in itertools.product(models_preds, q_configs):
print('\tenter job')
model = deepcopy(model)
print('fitaggr', model, predictions, len(predictions), print(self.training))
model.set_params(**q_params)
model.aggregation_fit(predictions, training)
self.models.append(model)
print('\texit job')
# self.models = qp.util.parallel(
# self._delayed_fit_aggregation,
# ((setup, training) for setup in itertools.product(models_preds, q_configs)),
# seed=qp.environ.get('_R_SEED', None),
# n_jobs=self.n_jobs,
# asarray=False
# )
self.models = qp.util.parallel(
self._delayed_fit_aggregation,
((setup, training) for setup in itertools.product(models_preds, q_configs)),
seed=qp.environ.get('_R_SEED', None),
n_jobs=self.n_jobs,
backend='threading'
)
else:
configs = qp.model_selection.expand_grid(self.param_grid)
self.models = qp.util.parallel(
@@ -1510,7 +1509,7 @@ class AggregativeMedianEstimator(BinaryQuantifier):
((params, training) for params in configs),
seed=qp.environ.get('_R_SEED', None),
n_jobs=self.n_jobs,
asarray=False
backend='threading'
)
return self
@@ -1524,9 +1523,8 @@ class AggregativeMedianEstimator(BinaryQuantifier):
((model, instances) for model in self.models),
seed=qp.environ.get('_R_SEED', None),
n_jobs=self.n_jobs,
asarray=False
backend='threading'
)
prev_preds = np.asarray(prev_preds)
return np.median(prev_preds, axis=0)
#---------------------------------------------------------------
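AggregativeMedianEstimator reduces the per-model prevalence vectors with a class-wise median, as in the return statement above; a toy illustration:

import numpy as np

# three fitted models, each returning a prevalence vector for the same sample
prev_preds = np.asarray([[0.20, 0.80],
                         [0.30, 0.70],
                         [0.25, 0.75]])
np.median(prev_preds, axis=0)  # -> array([0.25, 0.75])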

View File

@@ -1,6 +1,7 @@
import itertools
import signal
from copy import deepcopy
from enum import Enum
from typing import Union, Callable
import numpy as np
@@ -10,10 +11,16 @@ import quapy as qp
from quapy import evaluation
from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol
from quapy.data.base import LabelledCollection
from quapy.method.aggregative import BaseQuantifier
from quapy.method.aggregative import BaseQuantifier, AggregativeQuantifier
from time import time
class Status(Enum):
SUCCESS = 1
TIMEOUT = 2
INVALID = 3
ERROR = 4
class GridSearchQ(BaseQuantifier):
"""Grid Search optimization targeting a quantification-oriented metric.
@@ -69,21 +76,7 @@ class GridSearchQ(BaseQuantifier):
raise ValueError(f'unexpected error type; must either be a callable function or a str representing\n'
f'the name of an error function in {qp.error.QUANTIFICATION_ERROR_NAMES}')
def fit(self, training: LabelledCollection):
""" Learning routine. Fits methods with all combinations of hyperparameters and selects the one minimizing
the error metric.
:param training: the training set on which to optimize the hyperparameters
:return: self
"""
protocol = self.protocol
self.param_scores_ = {}
self.best_score_ = None
tinit = time()
def _fit_nonaggregative(self, training):
configs = expand_grid(self.param_grid)
self._sout(f'starting model selection with {self.n_jobs =}')
@@ -94,34 +87,106 @@ class GridSearchQ(BaseQuantifier):
seed=qp.environ.get('_R_SEED', None),
n_jobs=self.n_jobs
)
return scores
for params, score, model in scores:
if score is not None:
if self.best_score_ is None or score < self.best_score_:
self.best_score_ = score
self.best_params_ = params
self.best_model_ = model
self.param_scores_[str(params)] = score
else:
self.param_scores_[str(params)] = 'timeout'
def _delayed_fit_classifier(self, args):
cls_params, training = args
model = deepcopy(self.model)
model.set_params(**cls_params)
predictions = model.classifier_fit_predict(training)
return (model, predictions, cls_params)
tend = time()-tinit
def _eval_aggregative(self, args):
((model, predictions, cls_params), q_params), training = args
model = deepcopy(model)
# overrides default parameters with the parameters being explored at this iteration
model.set_params(**q_params)
model.aggregation_fit(predictions, training)
params = {**cls_params, **q_params}
return model, params
if self.best_score_ is None:
raise TimeoutError('no combination of hyperparameters seems to work')
def _delayed_evaluation__(self, args):
self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f}) '
f'[took {tend:.4f}s]')
exit_status = Status.SUCCESS
if self.refit:
if isinstance(protocol, OnLabelledCollectionProtocol):
self._sout(f'refitting on the whole development set')
self.best_model_.fit(training + protocol.get_labelled_collection())
else:
raise RuntimeWarning(f'"refit" was requested, but the protocol does not '
f'implement the {OnLabelledCollectionProtocol.__name__} interface')
tinit = time()
if self.timeout > 0:
def handler(signum, frame):
raise TimeoutError()
return self
signal.signal(signal.SIGALRM, handler)
signal.alarm(self.timeout)
try:
model, params = self._eval_aggregative(args)
score = evaluation.evaluate(model, protocol=self.protocol, error_metric=self.error)
ttime = time() - tinit
self._sout(f'hyperparams=[{params}]\t got {self.error.__name__} score {score:.5f} [took {ttime:.4f}s]')
if self.timeout > 0:
signal.alarm(0)
except TimeoutError:
self._sout(f'timeout ({self.timeout}s) reached for config {params}')
score = None
exit_status = Status.TIMEOUT
except ValueError as e:
self._sout(f'the combination of hyperparameters {params} is invalid')
score = None
exit_status = Status.INVALID
except Exception as e:
self._sout(f'something went wrong for config {params}; skipping:')
self._sout(f'\tException: {e}')
score = None
exit_status = Status.ERROR
return params, score, model, exit_status
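The timeout above relies on SIGALRM, which is Unix-only and must be armed from the main thread of the worker process; a self-contained sketch of the pattern:

import signal
import time

def handler(signum, frame):
    raise TimeoutError()

signal.signal(signal.SIGALRM, handler)
signal.alarm(2)        # deliver SIGALRM, and thus TimeoutError, after 2 seconds
try:
    time.sleep(10)     # stand-in for a slow hyper-parameter evaluation
except TimeoutError:
    print('config timed out')
finally:
    signal.alarm(0)    # cancel any pending alarm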
# def _delayed_fit_aggregation_and_eval(self, args):
#
# ((model, predictions, cls_params), q_params), training = args
# exit_status = Status.SUCCESS
#
# tinit = time()
# if self.timeout > 0:
# def handler(signum, frame):
# raise TimeoutError()
# signal.signal(signal.SIGALRM, handler)
# signal.alarm(self.timeout)
#
# try:
# model = deepcopy(model)
# # overrides default parameters with the parameters being explored at this iteration
# model.set_params(**q_params)
# model.aggregation_fit(predictions, training)
# score = evaluation.evaluate(model, protocol=self.protocol, error_metric=self.error)
#
# ttime = time() - tinit
# self._sout(f'hyperparams=[cls:{cls_params}, q:{q_params}]\t got {self.error.__name__} score {score:.5f} [took {ttime:.4f}s]')
#
# if self.timeout > 0:
# signal.alarm(0)
# except TimeoutError:
# self._sout(f'timeout ({self.timeout}s) reached for config {q_params}')
# score = None
# exit_status = Status.TIMEOUT
# except ValueError as e:
# self._sout(f'the combination of hyperparameters {q_params} is invalid')
# score = None
# exit_status = Status.INVALID
# except Exception as e:
# self._sout(f'something went wrong for config {q_params}; skipping:')
# self._sout(f'\tException: {e}')
# score = None
# exit_status = Status.ERROR
#
# params = {**cls_params, **q_params}
# return params, score, model, exit_status
def _delayed_eval(self, args):
params, training = args
@@ -163,8 +228,83 @@ class GridSearchQ(BaseQuantifier):
self._sout(f'\tException: {e}')
score = None
return params, score, model
return params, score, model, status
def _fit_aggregative(self, training):
# break down the set of hyperparameters into two: classifier-specific, quantifier-specific
cls_configs, q_configs = group_params(self.param_grid)
# train all classifiers and get the predictions
models_preds_clsconfigs = qp.util.parallel(
self._delayed_fit_classifier,
((params, training) for params in cls_configs),
seed=qp.environ.get('_R_SEED', None),
n_jobs=self.n_jobs,
asarray=False,
)
# explore the quantifier-specific hyperparameters for each training configuration
scores = qp.util.parallel(
self._delayed_evaluation__,
((setup, training) for setup in itertools.product(models_preds_clsconfigs, q_configs)),
seed=qp.environ.get('_R_SEED', None),
n_jobs=self.n_jobs
)
return scores
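This two-level decomposition is the point of the commit: each classifier-level configuration is trained once, and its validation predictions are reused across all quantifier-level configurations. With the grid from the example script above, the saving works out as follows:

# grid: 7 values of classifier__C x 2 class_weight options x 4 nbins values
cls_configs = 7 * 2                    # 14 classifier-level configurations
q_configs = 4                          # quantifier-level configurations (nbins)
naive_fits = cls_configs * q_configs   # 56 classifier trainings, one per full combination
two_level_fits = cls_configs           # 14 trainings; aggregation_fit reuses the predictions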
def fit(self, training: LabelledCollection):
""" Learning routine. Fits methods with all combinations of hyperparameters and selects the one minimizing
the error metric.
:param training: the training set on which to optimize the hyperparameters
:return: self
"""
if self.refit and not isinstance(self.protocol, OnLabelledCollectionProtocol):
raise RuntimeWarning(f'"refit" was requested, but the protocol does not '
f'implement the {OnLabelledCollectionProtocol.__name__} interface')
tinit = time()
if isinstance(self.model, AggregativeQuantifier):
self.results = self._fit_aggregative(training)
else:
self.results = self._fit_nonaggregative(training)
self.param_scores_ = {}
self.best_score_ = None
for params, score, model, status in self.results:
if score is not None:
if self.best_score_ is None or score < self.best_score_:
self.best_score_ = score
self.best_params_ = params
self.best_model_ = model
self.param_scores_[str(params)] = score
else:
self.param_scores_[str(params)] = 'timeout'
tend = time()-tinit
if self.best_score_ is None:
raise TimeoutError('no combination of hyperparameters seems to work')
self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f}) '
f'[took {tend:.4f}s]')
if self.refit:
if isinstance(self.protocol, OnLabelledCollectionProtocol):
tinit = time()
self._sout(f'refitting on the whole development set')
self.best_model_.fit(training + self.protocol.get_labelled_collection())
tend = time() - tinit
self.refit_time_ = tend
else:
raise RuntimeWarning(f'the model cannot be refit on the whole dataset')
return self
def quantify(self, instances):
"""Estimate class prevalence values using the best model found after calling the :meth:`fit` method.

View File

@@ -38,7 +38,7 @@ def map_parallel(func, args, n_jobs):
return list(itertools.chain.from_iterable(results))
def parallel(func, args, n_jobs, seed=None, asarray=True):
def parallel(func, args, n_jobs, seed=None, asarray=True, backend='loky'):
"""
A wrapper of multiprocessing:
@@ -58,7 +58,7 @@ def parallel(func, args, n_jobs, seed=None, asarray=True):
stack.enter_context(qp.util.temp_seed(seed))
return func(*args)
out = Parallel(n_jobs=n_jobs)(
out = Parallel(n_jobs=n_jobs, backend=backend)(
delayed(func_dec)(qp.environ, None if seed is None else seed+i, args_i) for i, args_i in enumerate(args)
)
if asarray:
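A hypothetical usage of the extended wrapper: each item in args is unpacked into func's arguments, and backend='threading' avoids pickling already-fitted models when only the aggregation stage differs across jobs:

import quapy as qp

def add(x, y):
    return x + y

# each tuple in args is unpacked into the arguments of func
out = qp.util.parallel(add, [(1, 2), (3, 4)], n_jobs=2, backend='threading')
# out -> array([3, 7])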