From 524ec37f8353c8fd2fefcd1c7d1eb60a3290ac36 Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Tue, 12 Apr 2022 17:13:38 +0200 Subject: [PATCH] sample_size can now be set to None to indicate that the value has to be resolved by inspecting the environment variable SAMPLE_SIZE --- quapy/error.py | 2 +- quapy/evaluation.py | 60 +++++++++++++++++++++++----------------- quapy/model_selection.py | 8 ++++-- quapy/util.py | 10 +++++++ 4 files changed, 50 insertions(+), 30 deletions(-) diff --git a/quapy/error.py b/quapy/error.py index 3375470..a71ed46 100644 --- a/quapy/error.py +++ b/quapy/error.py @@ -215,7 +215,7 @@ def __check_eps(eps=None): CLASSIFICATION_ERROR = {f1e, acce} -QUANTIFICATION_ERROR = {mae, mrae, mse, mkld, mnkld} +QUANTIFICATION_ERROR = {mae, mrae, mse, mkld, mnkld, ae, rae, se, kld, nkld} QUANTIFICATION_ERROR_SMOOTH = {kld, nkld, rae, mkld, mnkld, mrae} CLASSIFICATION_ERROR_NAMES = {func.__name__ for func in CLASSIFICATION_ERROR} QUANTIFICATION_ERROR_NAMES = {func.__name__ for func in QUANTIFICATION_ERROR} diff --git a/quapy/evaluation.py b/quapy/evaluation.py index 0846ab0..936b83c 100644 --- a/quapy/evaluation.py +++ b/quapy/evaluation.py @@ -6,7 +6,7 @@ import inspect import quapy as qp from quapy.data import LabelledCollection from quapy.method.base import BaseQuantifier -from quapy.util import temp_seed +from quapy.util import temp_seed, _check_sample_size import quapy.functional as F import pandas as pd @@ -14,9 +14,9 @@ import pandas as pd def artificial_prevalence_prediction( model: BaseQuantifier, test: LabelledCollection, - sample_size, + sample_size=None, n_prevpoints=101, - repeats=1, + n_repetitions=1, eval_budget: int = None, n_jobs=1, random_seed=42, @@ -31,10 +31,11 @@ def artificial_prevalence_prediction( :param model: the model in charge of generating the class prevalence estimations :param test: the test set on which to perform APP - :param sample_size: integer, the size of the samples + :param sample_size: integer, the size of the samples; if None, then the sample size is + taken from qp.environ['SAMPLE_SIZE'] :param n_prevpoints: integer, the number of different prevalences to sample (or set to None if eval_budget is specified; default 101, i.e., steps of 1%) - :param repeats: integer, the number of repetitions for each prevalence (default 1) + :param n_repetitions: integer, the number of repetitions for each prevalence (default 1) :param eval_budget: integer, if specified, sets a ceil on the number of evaluations to perform. For example, if there are 3 classes, `repeats=1`, and `eval_budget=20`, then `n_prevpoints` will be set to 5, since this will generate 15 different prevalence vectors ([0, 0, 1], [0, 0.25, 0.75], [0, 0.5, 0.5] ... [1, 0, 0]) and @@ -48,10 +49,11 @@ def artificial_prevalence_prediction( for the samples generated while the second one contains the prevalence estimations """ - n_prevpoints, _ = qp.evaluation._check_num_evals(test.n_classes, n_prevpoints, eval_budget, repeats, verbose) + sample_size = _check_sample_size(sample_size) + n_prevpoints, _ = qp.evaluation._check_num_evals(test.n_classes, n_prevpoints, eval_budget, n_repetitions, verbose) with temp_seed(random_seed): - indexes = list(test.artificial_sampling_index_generator(sample_size, n_prevpoints, repeats)) + indexes = list(test.artificial_sampling_index_generator(sample_size, n_prevpoints, n_repetitions)) return _predict_from_indexes(indexes, model, test, n_jobs, verbose) @@ -59,8 +61,8 @@ def artificial_prevalence_prediction( def natural_prevalence_prediction( model: BaseQuantifier, test: LabelledCollection, - sample_size, - repeats, + sample_size=None, + repeats=100, n_jobs=1, random_seed=42, verbose=False): @@ -71,8 +73,9 @@ def natural_prevalence_prediction( :param model: the model in charge of generating the class prevalence estimations :param test: the test set on which to perform NPP - :param sample_size: integer, the size of the samples - :param repeats: integer, the number of samples to generate + :param sample_size: integer, the size of the samples; if None, then the sample size is + taken from qp.environ['SAMPLE_SIZE'] + :param repeats: integer, the number of samples to generate (default 100) :param n_jobs: integer, number of jobs to be run in parallel (default 1) :param random_seed: allows to replicate the samplings. The seed is local to the method and does not affect any other random process (default 42) @@ -82,6 +85,7 @@ def natural_prevalence_prediction( for the samples generated while the second one contains the prevalence estimations """ + sample_size = _check_sample_size(sample_size) with temp_seed(random_seed): indexes = list(test.natural_sampling_index_generator(sample_size, repeats)) @@ -162,9 +166,9 @@ def _predict_from_indexes( def artificial_prevalence_report( model: BaseQuantifier, test: LabelledCollection, - sample_size, + sample_size=None, n_prevpoints=101, - repeats=1, + n_repetitions=1, eval_budget: int = None, n_jobs=1, random_seed=42, @@ -184,10 +188,11 @@ def artificial_prevalence_report( :param model: the model in charge of generating the class prevalence estimations :param test: the test set on which to perform APP - :param sample_size: integer, the size of the samples + :param sample_size: integer, the size of the samples; if None, then the sample size is + taken from qp.environ['SAMPLE_SIZE'] :param n_prevpoints: integer, the number of different prevalences to sample (or set to None if eval_budget is specified; default 101, i.e., steps of 1%) - :param repeats: integer, the number of repetitions for each prevalence (default 1) + :param n_repetitions: integer, the number of repetitions for each prevalence (default 1) :param eval_budget: integer, if specified, sets a ceil on the number of evaluations to perform. For example, if there are 3 classes, `repeats=1`, and `eval_budget=20`, then `n_prevpoints` will be set to 5, since this will generate 15 different prevalence vectors ([0, 0, 1], [0, 0.25, 0.75], [0, 0.5, 0.5] ... [1, 0, 0]) and @@ -205,7 +210,7 @@ def artificial_prevalence_report( """ true_prevs, estim_prevs = artificial_prevalence_prediction( - model, test, sample_size, n_prevpoints, repeats, eval_budget, n_jobs, random_seed, verbose + model, test, sample_size, n_prevpoints, n_repetitions, eval_budget, n_jobs, random_seed, verbose ) return _prevalence_report(true_prevs, estim_prevs, error_metrics) @@ -213,8 +218,8 @@ def artificial_prevalence_report( def natural_prevalence_report( model: BaseQuantifier, test: LabelledCollection, - sample_size, - repeats=1, + sample_size=None, + repeats=100, n_jobs=1, random_seed=42, error_metrics:Iterable[Union[str,Callable]]='mae', @@ -230,8 +235,9 @@ def natural_prevalence_report( :param model: the model in charge of generating the class prevalence estimations :param test: the test set on which to perform NPP - :param sample_size: integer, the size of the samples - :param repeats: integer, the number of samples to generate + :param sample_size: integer, the size of the samples; if None, then the sample size is + taken from qp.environ['SAMPLE_SIZE'] + :param repeats: integer, the number of samples to generate (default 100) :param n_jobs: integer, number of jobs to be run in parallel (default 1) :param random_seed: allows to replicate the samplings. The seed is local to the method and does not affect any other random process (default 42) @@ -244,7 +250,7 @@ def natural_prevalence_report( for the samples generated while the second one contains the prevalence estimations """ - + sample_size = _check_sample_size(sample_size) true_prevs, estim_prevs = natural_prevalence_prediction( model, test, sample_size, repeats, n_jobs, random_seed, verbose ) @@ -300,7 +306,7 @@ def _prevalence_report( def artificial_prevalence_protocol( model: BaseQuantifier, test: LabelledCollection, - sample_size, + sample_size=None, n_prevpoints=101, repeats=1, eval_budget: int = None, @@ -318,7 +324,8 @@ def artificial_prevalence_protocol( :param model: the model in charge of generating the class prevalence estimations :param test: the test set on which to perform APP - :param sample_size: integer, the size of the samples + :param sample_size: integer, the size of the samples; if None, then the sample size is + taken from qp.environ['SAMPLE_SIZE'] :param n_prevpoints: integer, the number of different prevalences to sample (or set to None if eval_budget is specified; default 101, i.e., steps of 1%) :param repeats: integer, the number of repetitions for each prevalence (default 1) @@ -350,8 +357,8 @@ def artificial_prevalence_protocol( def natural_prevalence_protocol( model: BaseQuantifier, test: LabelledCollection, - sample_size, - repeats=1, + sample_size=None, + repeats=100, n_jobs=1, random_seed=42, error_metric:Union[str,Callable]='mae', @@ -363,7 +370,8 @@ def natural_prevalence_protocol( :param model: the model in charge of generating the class prevalence estimations :param test: the test set on which to perform NPP - :param sample_size: integer, the size of the samples + :param sample_size: integer, the size of the samples; if None, then the sample size is + taken from qp.environ['SAMPLE_SIZE'] :param repeats: integer, the number of samples to generate :param n_jobs: integer, number of jobs to be run in parallel (default 1) :param random_seed: allows to replicate the samplings. The seed is local to the method and does not affect diff --git a/quapy/model_selection.py b/quapy/model_selection.py index 5af4b2f..86e79f3 100644 --- a/quapy/model_selection.py +++ b/quapy/model_selection.py @@ -11,6 +11,8 @@ from quapy.evaluation import artificial_prevalence_prediction, natural_prevalenc from quapy.method.aggregative import BaseQuantifier import inspect +from util import _check_sample_size + class GridSearchQ(BaseQuantifier): """Grid Search optimization targeting a quantification-oriented metric. @@ -57,7 +59,7 @@ class GridSearchQ(BaseQuantifier): def __init__(self, model: BaseQuantifier, param_grid: dict, - sample_size: Union[int, None], + sample_size: Union[int, None] = None, protocol='app', n_prevpoints: int = None, n_repetitions: int = 1, @@ -105,7 +107,7 @@ class GridSearchQ(BaseQuantifier): return training, validation elif isinstance(validation, float): assert 0. < validation < 1., 'validation proportion should be in (0,1)' - training, validation = training.split_stratified(train_prop=1 - validation) + training, validation = training.split_stratified(train_prop=1 - validation, random_state=self.random_seed) return training, validation elif self.protocol=='gen' and inspect.isgenerator(validation()): return training, validation @@ -163,7 +165,7 @@ class GridSearchQ(BaseQuantifier): val_split = self.val_split training, val_split = self.__check_training_validation(training, val_split) if self.protocol != 'gen': - assert isinstance(self.sample_size, int) and self.sample_size > 0, 'sample_size must be a positive integer' + self.sample_size = _check_sample_size(self.sample_size) params_keys = list(self.param_grid.keys()) params_values = list(self.param_grid.values()) diff --git a/quapy/util.py b/quapy/util.py index 9d44633..12ffc23 100644 --- a/quapy/util.py +++ b/quapy/util.py @@ -176,6 +176,16 @@ def pickled_resource(pickle_path:str, generation_func:callable, *args): return instance +def _check_sample_size(sample_size): + if sample_size is None: + assert qp.environ['SAMPLE_SIZE'] is not None, \ + 'error: sample_size set to None, and cannot be resolved from the environment' + sample_size = qp.environ['SAMPLE_SIZE'] + assert isinstance(sample_size, int) and sample_size > 0, \ + 'error: sample_size is not a positive integer' + return sample_size + + class EarlyStop: """ A class implementing the early-stopping condition typically used for training neural networks.