QuaPy/quapy/model_selection.py

import itertools
import signal
from copy import deepcopy
from typing import Union, Callable

import quapy as qp
import quapy.functional as F
from quapy.data.base import LabelledCollection
from quapy.evaluation import artificial_sampling_prediction
from quapy.method.aggregative import BaseQuantifier


class GridSearchQ(BaseQuantifier):
    """
    Grid-search optimization of the hyperparameters of a quantifier.

    Every combination of values in `param_grid` is set on the wrapped model, the model is fitted on the training
    split, and its predictions on samples drawn from the validation split (via `artificial_sampling_prediction`)
    are scored with the chosen error function. The combination with the lowest score is retained.

    After `fit`, the following attributes are available:
    `best_params_` (the winning combination), `best_score_` (its score), `best_model_` (the selected model) and
    `param_scores_` (a dict mapping `str(params)` to the score obtained by each evaluated combination).
    """

    def __init__(self,
                 model: BaseQuantifier,
                 param_grid: dict,
                 sample_size: int,
                 n_prevpoints: int = None,
                 n_repetitions: int = 1,
                 eval_budget: int = None,
                 error: Union[Callable, str] = qp.error.mae,
                 refit=False,
                 val_split=0.4,
                 n_jobs=1,
                 random_seed=42,
                 timeout=-1,
                 verbose=False):
        """
        Optimizes the hyperparameters of a quantification method, based on an evaluation method and on an evaluation
        protocol for quantification.
        :param model: the quantifier to optimize
        :param param_grid: a dictionary with keys the parameter names and values the list of values to explore for
        that particular parameter
        :param sample_size: the size of the samples to extract from the validation set
        :param n_prevpoints: if specified, indicates the number of equally distant point to extract from the interval
        [0,1] in order to define the prevalences of the samples; e.g., if n_prevpoints=5, then the prevalences for
        each class will be explored in [0.00, 0.25, 0.50, 0.75, 1.00]. If not specified, then eval_budget is requested
        :param n_repetitions: the number of repetitions for each combination of prevalences. This parameter is ignored
        if eval_budget is set and is lower than the number of combinations that would be generated using the value
        assigned to n_prevpoints (for the current number of classes and n_repetitions)
        :param eval_budget: if specified, sets a ceil on the number of evaluations to perform for each hyper-parameter
        combination. For example, if there are 3 classes, n_repetitions=1 and eval_budget=20, then n_prevpoints will be
        set to 5, since this will generate 15 different prevalences:
         [0, 0, 1], [0, 0.25, 0.75], [0, 0.5, 0.5] ... [1, 0, 0]
        :param error: an error function (callable) or a string indicating the name of an error function (valid ones
        are those in qp.error.QUANTIFICATION_ERROR)
        :param refit: whether or not to refit the model on the whole labelled collection (training+validation) with
        the best chosen hyperparameter combination
        :param val_split: default validation to use when `fit` is called without one; either a LabelledCollection
        on which to test the performance of the different settings, or a float in (0,1) indicating the proportion
        of labelled data to extract from the training set
        :param n_jobs: number of parallel jobs
        :param random_seed: set the seed of the random generator to replicate experiments
        :param timeout: establishes a timer (in seconds) for each of the hyperparameters configurations being tested.
        Whenever a run takes longer than this timer, that configuration will be ignored. If all configurations end up
        being ignored, a TimeoutError exception is raised. If -1 (default) then no time bound is set.
        :param verbose: set to True to get information through the stdout
        """
        self.model = model
        self.param_grid = param_grid
        self.sample_size = sample_size
        self.n_prevpoints = n_prevpoints
        self.n_repetitions = n_repetitions
        self.eval_budget = eval_budget
        self.refit = refit
        self.val_split = val_split
        self.n_jobs = n_jobs
        self.random_seed = random_seed
        self.timeout = timeout
        self.verbose = verbose
        self.__check_error(error)

    def sout(self, msg):
        """
        Prints `msg` to the stdout, prefixed with the class name, only if `verbose` is set.
        :param msg: the message to print
        """
        if self.verbose:
            print(f'[{self.__class__.__name__}]: {msg}')

    def __check_training_validation(self, training, validation):
        """
        Resolves the (training, validation) pair to be used during the search.
        :param training: the labelled collection to train on
        :param validation: either a LabelledCollection (returned untouched along with `training`) or a float in
        (0,1), in which case `training` is split in a stratified way and that proportion is held out for validation
        :return: a tuple (training, validation)
        :raises ValueError: if `validation` is neither a LabelledCollection nor a float
        """
        if isinstance(validation, LabelledCollection):
            return training, validation
        elif isinstance(validation, float):
            assert 0. < validation < 1., 'validation proportion should be in (0,1)'
            training, validation = training.split_stratified(train_prop=1-validation)
            return training, validation
        else:
            raise ValueError(f'"validation" must either be a LabelledCollection or a float in (0,1) indicating the '
                             f'proportion of training documents to extract (found) {type(validation)}')

    def __check_error(self, error):
        """
        Validates `error` and stores the resulting callable in `self.error`.
        :param error: one of the functions in qp.error.QUANTIFICATION_ERROR, the name of an error function, or any
        other callable
        :raises ValueError: if `error` is neither a string nor a callable
        """
        if error in qp.error.QUANTIFICATION_ERROR:
            self.error = error
        elif isinstance(error, str):
            self.error = qp.error.from_name(error)
        elif hasattr(error, '__call__'):
            self.error = error
        else:
            raise ValueError(f'unexpected error type; must either be a callable function or a str representing\n'
                             f'the name of an error function in {qp.error.QUANTIFICATION_ERROR_NAMES}')

    def fit(self, training: LabelledCollection, val_split: Union[LabelledCollection, float] = None):
        """
        Explores all the combinations of hyperparameters in `param_grid` and keeps the best one.
        :param training: the training set on which to optimize the hyperparameters
        :param val_split: either a LabelledCollection on which to test the performance of the different settings, or
        a float in (0,1) indicating the proportion of labelled data to extract from the training set. If None
        (default), the value given in the constructor is used
        :return: self
        :raises TimeoutError: if no configuration was evaluated and at least one of them exceeded the timeout
        :raises ValueError: if the grid generates no configuration at all (e.g., a parameter with an empty list)
        """
        if val_split is None:
            val_split = self.val_split
        training, val_split = self.__check_training_validation(training, val_split)
        assert isinstance(self.sample_size, int) and self.sample_size > 0, 'sample_size must be a positive integer'

        params_keys = list(self.param_grid.keys())
        params_values = list(self.param_grid.values())

        # note: the wrapped model is modified in place while exploring the grid
        model = self.model
        n_jobs = self.n_jobs
        timed = self.timeout > 0

        previous_handler = None
        if timed:
            def handler(signum, frame):
                self.sout('timeout reached')
                raise TimeoutError()
            # keep track of the handler that was installed so that it can be restored once the search is over
            previous_handler = signal.signal(signal.SIGALRM, handler)

        self.sout(f'starting optimization with n_jobs={n_jobs}')
        self.param_scores_ = {}
        self.best_score_ = None
        self.best_params_ = None
        some_timeouts = False
        try:
            for values in itertools.product(*params_values):
                params = dict(zip(params_keys, values))

                if timed:
                    signal.alarm(self.timeout)

                # only fitting, predicting and scoring are subject to the timeout; the bookkeeping below is not,
                # so that a late alarm cannot leave best_score_/best_params_/best_model_ out of sync
                try:
                    # overrides default parameters with the parameters being explored at this iteration
                    model.set_params(**params)
                    model.fit(training)
                    true_prevalences, estim_prevalences = artificial_sampling_prediction(
                        model, val_split, self.sample_size,
                        n_prevpoints=self.n_prevpoints,
                        n_repetitions=self.n_repetitions,
                        eval_budget=self.eval_budget,
                        n_jobs=n_jobs,
                        random_seed=self.random_seed,
                        verbose=False
                    )
                    score = self.error(true_prevalences, estim_prevalences)
                except TimeoutError:
                    print(f'timeout reached for config {params}')
                    some_timeouts = True
                    continue
                finally:
                    # cancel any pending alarm, also when an unexpected exception is propagating
                    if timed:
                        signal.alarm(0)

                self.sout(f'checking hyperparams={params} got {self.error.__name__} score {score:.5f}')
                if self.best_score_ is None or score < self.best_score_:
                    self.best_score_ = score
                    self.best_params_ = params
                    if not self.refit:
                        # snapshot of the model as fitted with the (so far) best configuration; when refitting,
                        # the snapshot is not needed since the model is trained again at the end
                        self.best_model_ = deepcopy(model)
                self.param_scores_[str(params)] = score
        finally:
            # signal.signal returns None when the previous handler was not installed from Python; such a
            # handler cannot be reinstalled from here
            if timed and previous_handler is not None:
                signal.signal(signal.SIGALRM, previous_handler)

        if self.best_score_ is None:
            if some_timeouts:
                raise TimeoutError('all jobs took more than the timeout time to end')
            raise ValueError('no hyperparameter configuration was evaluated; check that no parameter in '
                             'param_grid has an empty list of values')

        self.sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f})')
        model.set_params(**self.best_params_)

        if self.refit:
            # at this point "model" has the best hyperparameters set, but is still fitted with the last
            # configuration explored; it only becomes the best model after being trained again
            self.sout('refitting on the whole development set')
            self.best_model_ = deepcopy(model)
            self.best_model_.fit(training + val_split)

        return self

    def quantify(self, instances):
        """
        Estimates class prevalences for `instances` by delegating to the best model found by `fit`.
        :param instances: the instances to quantify, as accepted by the wrapped quantifier
        :return: whatever the `quantify` method of the best model returns
        """
        return self.best_model_.quantify(instances)

    def set_params(self, **parameters):
        """
        Replaces the grid of hyperparameters to explore.
        :param parameters: the new grid, given as keyword arguments (parameter name -> list of values)
        """
        self.param_grid = parameters

    def get_params(self, deep=True):
        """
        Returns the grid of hyperparameters to explore.
        :param deep: unused
        :return: the `param_grid` dictionary
        """
        return self.param_grid

    def best_model(self):
        """
        Returns the best model found by `fit`.
        :return: the selected quantifier
        :raises ValueError: if called before `fit`
        """
        if hasattr(self, 'best_model_'):
            return self.best_model_
        raise ValueError('best_model called before fit')
model selection for quantification added 2020-12-23 11:14:35 +01:00			`import itertools`
import fixes 2021-01-15 18:32:32 +01:00			`import signal`
			`from copy import deepcopy`
			`from typing import Union, Callable`

model selection for quantification added 2020-12-23 11:14:35 +01:00			`import quapy as qp`
import fixes 2021-01-15 18:32:32 +01:00			`import quapy.functional as F`
imports fix 2021-03-19 17:34:09 +01:00			`from quapy.data.base import LabelledCollection`
import fixes 2021-01-15 18:32:32 +01:00			`from quapy.evaluation import artificial_sampling_prediction`
			`from quapy.method.aggregative import BaseQuantifier`
model selection for quantification added 2020-12-23 11:14:35 +01:00

added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used) 2021-01-06 14:58:29 +01:00			`class GridSearchQ(BaseQuantifier):`
model selection for quantification added 2020-12-23 11:14:35 +01:00
			`def __init__(self,`
added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used) 2021-01-06 14:58:29 +01:00			`model: BaseQuantifier,`
model selection for quantification added 2020-12-23 11:14:35 +01:00			`param_grid: dict,`
			`sample_size: int,`
			`n_prevpoints: int = None,`
			`n_repetitions: int = 1,`
adding eval_budget to evaluation functions 2021-02-09 11:48:16 +01:00			`eval_budget: int = None,`
model selection for quantification added 2020-12-23 11:14:35 +01:00			`error: Union[Callable, str] = qp.error.mae,`
			`refit=False,`
refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size 2021-01-22 18:01:51 +01:00			`val_split=0.4,`
quapy fixed 2021-01-22 09:58:12 +01:00			`n_jobs=1,`
model selection for quantification added 2020-12-23 11:14:35 +01:00			`random_seed=42,`
setting a timeout for model_selection combinations in order to prevent some combinations to stuck the model selection 2021-01-15 17:42:19 +01:00			`timeout=-1,`
model selection for quantification added 2020-12-23 11:14:35 +01:00			`verbose=False):`
			`"""`
			`Optimizes the hyperparameters of a quantification method, based on an evaluation method and on an evaluation`
			`protocol for quantification.`
			`:param model: the quantifier to optimize`
			`:param training: the training set on which to optimize the hyperparameters`
			`:param validation: either a LabelledCollection on which to test the performance of the different settings, or`
			`a float in [0,1] indicating the proportion of labelled data to extract from the training set`
			`:param param_grid: a dictionary with keys the parameter names and values the list of values to explore for`
			`that particular parameter`
			`:param sample_size: the size of the samples to extract from the validation set`
			`:param n_prevpoints: if specified, indicates the number of equally distant point to extract from the interval`
			`[0,1] in order to define the prevalences of the samples; e.g., if n_prevpoints=5, then the prevalences for`
			`each class will be explored in [0.00, 0.25, 0.50, 0.75, 1.00]. If not specified, then eval_budget is requested`
			`:param n_repetitions: the number of repetitions for each combination of prevalences. This parameter is ignored`
			`if eval_budget is set and is lower than the number of combinations that would be generated using the value`
			`assigned to n_prevpoints (for the current number of classes and n_repetitions)`
			`:param eval_budget: if specified, sets a ceil on the number of evaluations to perform for each hyper-parameter`
			`combination. For example, if there are 3 classes, n_repetitions=1 and eval_budget=20, then n_prevpoints will be`
			`set to 5, since this will generate 15 different prevalences:`
			`[0, 0, 1], [0, 0.25, 0.75], [0, 0.5, 0.5] ... [1, 0, 0]`
			`:param error: an error function (callable) or a string indicating the name of an error function (valid ones`
			`are those in qp.error.QUANTIFICATION_ERROR`
			`:param refit: whether or not to refit the model on the whole labelled collection (training+validation) with`
			`the best chosen hyperparameter combination`
			`:param n_jobs: number of parallel jobs`
			`:param random_seed: set the seed of the random generator to replicate experiments`
setting a timeout for model_selection combinations in order to prevent some combinations to stuck the model selection 2021-01-15 17:42:19 +01:00			`:param timeout: establishes a timer (in seconds) for each of the hyperparameters configurations being tested.`
			`Whenever a run takes longer than this timer, that configuration will be ignored. If all configurations end up`
			`being ignored, a TimeoutError exception is raised. If -1 (default) then no time bound is set.`
model selection for quantification added 2020-12-23 11:14:35 +01:00			`:param verbose: set to True to get information through the stdout`
			`"""`
			`self.model = model`
			`self.param_grid = param_grid`
			`self.sample_size = sample_size`
			`self.n_prevpoints = n_prevpoints`
			`self.n_repetitions = n_repetitions`
			`self.eval_budget = eval_budget`
			`self.refit = refit`
refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size 2021-01-22 18:01:51 +01:00			`self.val_split = val_split`
model selection for quantification added 2020-12-23 11:14:35 +01:00			`self.n_jobs = n_jobs`
			`self.random_seed = random_seed`
setting a timeout for model_selection combinations in order to prevent some combinations to stuck the model selection 2021-01-15 17:42:19 +01:00			`self.timeout = timeout`
model selection for quantification added 2020-12-23 11:14:35 +01:00			`self.verbose = verbose`
			`self.__check_error(error)`

			`def sout(self, msg):`
			`if self.verbose:`
			`print(f'[{self.__class__.__name__}]: {msg}')`

			`def __check_training_validation(self, training, validation):`
			`if isinstance(validation, LabelledCollection):`
			`return training, validation`
			`elif isinstance(validation, float):`
			`assert 0. < validation < 1., 'validation proportion should be in (0,1)'`
			`training, validation = training.split_stratified(train_prop=1-validation)`
			`return training, validation`
			`else:`
import fixes 2021-01-15 18:32:32 +01:00			`raise ValueError(f'"validation" must either be a LabelledCollection or a float in (0,1) indicating the'`
			`f'proportion of training documents to extract (found) {type(validation)}')`
model selection for quantification added 2020-12-23 11:14:35 +01:00
			`def __check_error(self, error):`
			`if error in qp.error.QUANTIFICATION_ERROR:`
			`self.error = error`
			`elif isinstance(error, str):`
parallel functionality added to quapy in order to allow for multiprocess parallelization (and not threading) handling quapy's environment variables 2021-01-27 09:54:41 +01:00			`self.error = qp.error.from_name(error)`
			`elif hasattr(error, '__call__'):`
			`self.error = error`
model selection for quantification added 2020-12-23 11:14:35 +01:00			`else:`
			`raise ValueError(f'unexpected error type; must either be a callable function or a str representing\n'`
added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used) 2021-01-06 14:58:29 +01:00			`f'the name of an error function in {qp.error.QUANTIFICATION_ERROR_NAMES}')`
model selection for quantification added 2020-12-23 11:14:35 +01:00
refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size 2021-01-22 18:01:51 +01:00			`def fit(self, training: LabelledCollection, val_split: Union[LabelledCollection, float]=None):`
model selection for quantification added 2020-12-23 11:14:35 +01:00			`"""`
			`:param training: the training set on which to optimize the hyperparameters`
refactor of ensembles, launching EPACC with Ptr policy 2021-01-19 18:26:40 +01:00			`:param val_split: either a LabelledCollection on which to test the performance of the different settings, or`
model selection for quantification added 2020-12-23 11:14:35 +01:00			`a float in [0,1] indicating the proportion of labelled data to extract from the training set`
			`"""`
refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size 2021-01-22 18:01:51 +01:00			`if val_split is None:`
			`val_split = self.val_split`
refactor of ensembles, launching EPACC with Ptr policy 2021-01-19 18:26:40 +01:00			`training, val_split = self.__check_training_validation(training, val_split)`
			`assert isinstance(self.sample_size, int) and self.sample_size > 0, 'sample_size must be a positive integer'`
model selection for quantification added 2020-12-23 11:14:35 +01:00
			`params_keys = list(self.param_grid.keys())`
			`params_values = list(self.param_grid.values())`

			`model = self.model`
			`n_jobs = self.n_jobs`

setting a timeout for model_selection combinations in order to prevent some combinations to stuck the model selection 2021-01-15 17:42:19 +01:00			`if self.timeout > 0:`
			`def handler(signum, frame):`
			`self.sout('timeout reached')`
			`raise TimeoutError()`
			`signal.signal(signal.SIGALRM, handler)`

model selection for quantification added 2020-12-23 11:14:35 +01:00			`self.sout(f'starting optimization with n_jobs={n_jobs}')`
			`self.param_scores_ = {}`
			`self.best_score_ = None`
setting a timeout for model_selection combinations in order to prevent some combinations to stuck the model selection 2021-01-15 17:42:19 +01:00			`some_timeouts = False`
model selection for quantification added 2020-12-23 11:14:35 +01:00			`for values in itertools.product(*params_values):`
refactor of ensembles, launching EPACC with Ptr policy 2021-01-19 18:26:40 +01:00			`params = dict({k: values[i] for i, k in enumerate(params_keys)})`
model selection for quantification added 2020-12-23 11:14:35 +01:00
setting a timeout for model_selection combinations in order to prevent some combinations to stuck the model selection 2021-01-15 17:42:19 +01:00			`if self.timeout > 0:`
			`signal.alarm(self.timeout)`

			`try:`
			`# overrides default parameters with the parameters being explored at this iteration`
			`model.set_params(**params)`
			`model.fit(training)`
			`true_prevalences, estim_prevalences = artificial_sampling_prediction(`
adding eval_budget to evaluation functions 2021-02-09 11:48:16 +01:00			`model, val_split, self.sample_size,`
			`n_prevpoints=self.n_prevpoints,`
			`n_repetitions=self.n_repetitions,`
			`eval_budget=self.eval_budget,`
			`n_jobs=n_jobs,`
			`random_seed=self.random_seed,`
refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size 2021-01-22 18:01:51 +01:00			`verbose=False`
setting a timeout for model_selection combinations in order to prevent some combinations to stuck the model selection 2021-01-15 17:42:19 +01:00			`)`

			`score = self.error(true_prevalences, estim_prevalences)`
			`self.sout(f'checking hyperparams={params} got {self.error.__name__} score {score:.5f}')`
			`if self.best_score_ is None or score < self.best_score_:`
			`self.best_score_ = score`
			`self.best_params_ = params`
			`if not self.refit:`
			`self.best_model_ = deepcopy(model)`
			`self.param_scores_[str(params)] = score`

			`if self.timeout > 0:`
			`signal.alarm(0)`
			`except TimeoutError:`
			`print(f'timeout reached for config {params}')`
			`some_timeouts = True`

			`if self.best_score_ is None and some_timeouts:`
			`raise TimeoutError('all jobs took more than the timeout time to end')`
model selection for quantification added 2020-12-23 11:14:35 +01:00
			`self.sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f})')`
			`model.set_params(**self.best_params_)`
			`self.best_model_ = deepcopy(model)`

			`if self.refit:`
			`self.sout(f'refitting on the whole development set')`
refactor of ensembles, launching EPACC with Ptr policy 2021-01-19 18:26:40 +01:00			`self.best_model_.fit(training + val_split)`
model selection for quantification added 2020-12-23 11:14:35 +01:00
added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used) 2021-01-06 14:58:29 +01:00			`return self`

			`def quantify(self, instances):`
			`return self.best_model_.quantify(instances)`

			`def set_params(self, **parameters):`
			`self.param_grid = parameters`

			`def get_params(self, deep=True):`
			`return self.param_grid`
model selection for quantification added 2020-12-23 11:14:35 +01:00
adding tweet sent quant experiments 2021-01-11 18:31:12 +01:00			`def best_model(self):`
			`if hasattr(self, 'best_model_'):`
			`return self.best_model_`
			`raise ValueError('best_model called before fit')`