From ad11b86168ba09d7cb876c3f9cdba7f4f299aaec Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Thu, 30 May 2024 10:53:53 +0200
Subject: [PATCH] adding an environment variable for N_JOBS, and adding a
 default classifier (sklearn's logistic regression) for when the classifier
 is not specified in aggregative quantifiers

---
 CHANGE_LOG.txt                     | 24 ++++++++++---
 examples/0.basics.py               |  5 +--
 examples/1.model_selection.py      | 16 +++++----
 examples/8.ucimulti_experiments.py |  3 +-
 quapy/__init__.py                  | 24 +++++++++++--
 quapy/method/_kdey.py              | 55 +++++++++++++-----------------
 quapy/method/_threshold_optim.py   | 14 ++++----
 quapy/method/aggregative.py        | 42 +++++++++++------------
 quapy/model_selection.py           |  2 +-
 9 files changed, 108 insertions(+), 77 deletions(-)

diff --git a/CHANGE_LOG.txt b/CHANGE_LOG.txt
index c3e1e64..8c99a6b 100644
--- a/CHANGE_LOG.txt
+++ b/CHANGE_LOG.txt
@@ -1,10 +1,26 @@
 Change Log 0.1.9
 ----------------
 
 - [TODO] add LeQua2024
-- [TODO] add njobs to env
-- [TODO] add basic examples
-- [TODO] add default classifier to env
-- [TODO] add default classifier to env
+
+- Added a default classifier for aggregative quantifiers, which can now be instantiated without specifying
+  the classifier. The default classifier can be accessed in qp.environ['DEFAULT_CLS'] and is assigned to
+  sklearn.linear_model.LogisticRegression(max_iter=3000). If the classifier is not specified, then a clone
+  of said classifier is returned. E.g.:
+  > pacc = PACC()
+  is equivalent to:
+  > pacc = PACC(classifier=LogisticRegression(max_iter=3000))
+
+- Improved error logging in model selection. In v0.1.8 only Status.INVALID was reported; in v0.1.9 it is
+  now accompanied by a textual description of the error
+
+- The number of parallel workers can now be set via an environment variable by running, e.g.:
+  > N_JOBS=10 python3 your_script.py
+  which has the same effect as writing the following code at the beginning of your_script.py:
+  > import quapy as qp
+  > qp.environ["N_JOBS"] = 10
+
+- Some examples have been added to the ./examples/ dir, which now contains numbered examples from basics (0)
+  to advanced topics (higher numbers)
 
 - Moved the wiki documents to the ./docs/ folder so that they become editable via PR for the community
diff --git a/examples/0.basics.py b/examples/0.basics.py
index ea3a7c1..6325434 100644
--- a/examples/0.basics.py
+++ b/examples/0.basics.py
@@ -33,9 +33,10 @@ import quapy.functional as F  # <- this module has some functional utilities, li
 print(f'training prevalence = {F.strprev(train.prevalence())}')
 
 # let us train one quantifier, for example, PACC using a sklearn's Logistic Regressor as the underlying classifier
-classifier = LogisticRegression()
+# classifier = LogisticRegression()
 
-pacc = qp.method.aggregative.PACC(classifier)
+# pacc = qp.method.aggregative.PACC(classifier)
+pacc = qp.method.aggregative.PACC()
 
 print(f'training {pacc}')
 pacc.fit(train)
diff --git a/examples/1.model_selection.py b/examples/1.model_selection.py
index 130b542..61b7087 100644
--- a/examples/1.model_selection.py
+++ b/examples/1.model_selection.py
@@ -1,10 +1,7 @@
 import quapy as qp
-from method._kdey import KDEyML
-from quapy.method.non_aggregative import DMx
-from quapy.protocol import APP, UPP
+from quapy.protocol import UPP
 from quapy.method.aggregative import DMy
 from sklearn.linear_model import LogisticRegression
-from examples.comparing_gridsearch import OLD_GridSearchQ
 import numpy as np
 from time import time
 
@@ -12,10 +9,15 @@ from time import time
 """
 In this example, we show how to perform model
selection on a DistributionMatching quantifier.
 """
-model = DMy(LogisticRegression())
+model = DMy()
 
 qp.environ['SAMPLE_SIZE'] = 100
-qp.environ['N_JOBS'] = -1
+
+print(f'running model selection with N_JOBS={qp.environ["N_JOBS"]}; '
+      f'to increase the number of jobs use:\n> N_JOBS=-1 python3 1.model_selection.py\n'
+      f'alternatively, you can set this variable within the script as:\n'
+      f'import quapy as qp\n'
+      f'qp.environ["N_JOBS"]=-1')
 
 training, test = qp.datasets.fetch_UCIMulticlassDataset('letter').train_test
 
@@ -42,7 +44,7 @@ with qp.util.temp_seed(0):
     # different configurations of the quantifier. In other words, quapy avoids to train
     # the classifier 7x7 times.
     param_grid = {
-        'classifier__C': np.logspace(-3,3,7),
+        'classifier__C': np.logspace(-3, 3, 7),
         'nbins': [2, 3, 4, 5, 10, 15, 20]
     }
 
diff --git a/examples/8.ucimulti_experiments.py b/examples/8.ucimulti_experiments.py
index 5193376..e2a8d97 100644
--- a/examples/8.ucimulti_experiments.py
+++ b/examples/8.ucimulti_experiments.py
@@ -7,7 +7,7 @@ import numpy as np
 from sklearn.linear_model import LogisticRegression
 
 import quapy as qp
-from quapy.method.aggregative import PACC, EMQ, KDEyML
+from quapy.method.aggregative import PACC, EMQ
 from quapy.model_selection import GridSearchQ
 from quapy.protocol import UPP
 from pathlib import Path
@@ -52,6 +52,7 @@ def load_timings(result_path):
     df = pd.read_csv(result_path+'.csv', sep='\t')
     return timings | df.pivot_table(index='Dataset', columns='Method', values='t_train').to_dict()
 
+
 if __name__ == '__main__':
 
     qp.environ['SAMPLE_SIZE'] = 500
diff --git a/quapy/__init__.py b/quapy/__init__.py
index 8e4ceb2..db34bbb 100644
--- a/quapy/__init__.py
+++ b/quapy/__init__.py
@@ -1,15 +1,18 @@
 """QuaPy module for quantification"""
+from sklearn.linear_model import LogisticRegression
+
 from quapy.data import datasets
 from . import error
 from . import data
 from . import functional
-# from . import method
+from . import method
 from . import evaluation
 from . import protocol
 from . import plot
 from . import util
 from . import model_selection
 from . import classification
+import os
 
 __version__ = '0.1.9'
 
@@ -20,7 +23,8 @@ environ = {
     'PAD_TOKEN': '[PAD]',
     'PAD_INDEX': 1,
     'SVMPERF_HOME': './svm_perf_quantification',
-    'N_JOBS': 1
+    'N_JOBS': int(os.getenv('N_JOBS', 1)),
+    'DEFAULT_CLS': LogisticRegression(max_iter=3000)
 }
 
 
@@ -48,3 +52,19 @@ def _get_sample_size(sample_size):
     if sample_size is None:
         raise ValueError('neither sample_size nor qp.environ["SAMPLE_SIZE"] have been specified')
     return sample_size
+
+
+def _get_classifier(classifier):
+    """
+    If `classifier` is None, then it returns a clone of `environ['DEFAULT_CLS']`;
+    otherwise, it returns `classifier`.
+
+    :param classifier: sklearn's estimator or None
+    :return: sklearn's estimator
+    """
+    if classifier is None:
+        if environ['DEFAULT_CLS'] is None:
+            raise ValueError('neither classifier nor qp.environ["DEFAULT_CLS"] have been specified')
+        from sklearn.base import clone
+        classifier = clone(environ['DEFAULT_CLS'])
+    return classifier
diff --git a/quapy/method/_kdey.py b/quapy/method/_kdey.py
index e678563..e50c99c 100644
--- a/quapy/method/_kdey.py
+++ b/quapy/method/_kdey.py
@@ -24,12 +24,14 @@ class KDEBase:
         Checks that the bandwidth parameter is correct
 
         :param bandwidth: either a string (see BANDWIDTH_METHOD) or a float
-        :return: nothing, but raises an exception for invalid values
+        :return: the bandwidth if the check is passed, or raises an exception for invalid values
         """
         assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
             f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
         if isinstance(bandwidth, float):
-            assert 0 < bandwidth < 1, "the bandwith for KDEy should be in (0,1), since this method models the unit simplex"
+            assert 0 < bandwidth < 1, \
+                "the bandwidth for KDEy should be in (0,1), since this method models the unit simplex"
+        return bandwidth
 
     def get_kde_function(self, X, bandwidth):
         """
@@ -106,16 +108,13 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
         Alternatively, this set can be specified at fit time by indicating the exact set of data on which the predictions
         are to be generated.
     :param bandwidth: float, the bandwidth of the Kernel
-    :param n_jobs: number of parallel workers
     :param random_state: a seed to be set before fitting any base quantifier (default None)
     """
 
-    def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None, random_state=None):
-        self._check_bandwidth(bandwidth)
-        self.classifier = classifier
+    def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, random_state=None):
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
-        self.bandwidth = bandwidth
-        self.n_jobs = n_jobs
+        self.bandwidth = KDEBase._check_bandwidth(bandwidth)
         self.random_state=random_state
 
     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
@@ -130,17 +129,17 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
         :param posteriors: instances in the sample converted into posterior probabilities
         :return: a vector of class prevalence estimates
         """
-        np.random.RandomState(self.random_state)
-        epsilon = 1e-10
-        n_classes = len(self.mix_densities)
-        test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities]
+        with qp.util.temp_seed(self.random_state):
+            epsilon = 1e-10
+            n_classes = len(self.mix_densities)
+            test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities]
 
-        def neg_loglikelihood(prev):
-            test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
-            test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
-            return -np.sum(test_loglikelihood)
+            def neg_loglikelihood(prev):
+                test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
+                test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
+                return -np.sum(test_loglikelihood)
 
-        return F.optim_minimize(neg_loglikelihood, n_classes)
+            return F.optim_minimize(neg_loglikelihood, n_classes)
 
 
 class KDEyHD(AggregativeSoftQuantifier, KDEBase):
@@ -183,20 +182,17 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase):
         Alternatively, this set can be 
specified at fit time by indicating the exact set of data on which the predictions are to be generated. :param bandwidth: float, the bandwidth of the Kernel - :param n_jobs: number of parallel workers :param random_state: a seed to be set before fitting any base quantifier (default None) :param montecarlo_trials: number of Monte Carlo trials (default 10000) """ - def __init__(self, classifier: BaseEstimator, val_split=10, divergence: str='HD', - bandwidth=0.1, n_jobs=None, random_state=None, montecarlo_trials=10000): + def __init__(self, classifier: BaseEstimator=None, val_split=5, divergence: str='HD', + bandwidth=0.1, random_state=None, montecarlo_trials=10000): - self._check_bandwidth(bandwidth) - self.classifier = classifier + self.classifier = qp._get_classifier(classifier) self.val_split = val_split self.divergence = divergence - self.bandwidth = bandwidth - self.n_jobs = n_jobs + self.bandwidth = KDEBase._check_bandwidth(bandwidth) self.random_state=random_state self.montecarlo_trials = montecarlo_trials @@ -278,15 +274,12 @@ class KDEyCS(AggregativeSoftQuantifier): Alternatively, this set can be specified at fit time by indicating the exact set of data on which the predictions are to be generated. :param bandwidth: float, the bandwidth of the Kernel - :param n_jobs: number of parallel workers """ - def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None): - KDEBase._check_bandwidth(bandwidth) - self.classifier = classifier + def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1): + self.classifier = qp._get_classifier(classifier) self.val_split = val_split - self.bandwidth = bandwidth - self.n_jobs = n_jobs + self.bandwidth = KDEBase._check_bandwidth(bandwidth) def gram_matrix_mix_sum(self, X, Y=None): # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y)) @@ -355,7 +348,7 @@ class KDEyCS(AggregativeSoftQuantifier): # called \overline{r} in the paper alpha_ratio = alpha * self.counts_inv - # recal that tr_te_sums already accounts for the constant terms (1/Li)*(1/M) + # recall that tr_te_sums already accounts for the constant terms (1/Li)*(1/M) partA = -np.log((alpha_ratio @ tr_te_sums) * Minv) partB = 0.5 * np.log(alpha_ratio @ tr_tr_sums @ alpha_ratio) return partA + partB #+ partC diff --git a/quapy/method/_threshold_optim.py b/quapy/method/_threshold_optim.py index a9d2723..72d0c95 100644 --- a/quapy/method/_threshold_optim.py +++ b/quapy/method/_threshold_optim.py @@ -27,8 +27,8 @@ class ThresholdOptimization(BinaryAggregativeQuantifier): :class:`quapy.data.base.LabelledCollection` (the split itself). """ - def __init__(self, classifier: BaseEstimator, val_split=None, n_jobs=None): - self.classifier = classifier + def __init__(self, classifier: BaseEstimator=None, val_split=None, n_jobs=None): + self.classifier = qp._get_classifier(classifier) self.val_split = val_split self.n_jobs = qp._get_njobs(n_jobs) @@ -143,7 +143,7 @@ class T50(ThresholdOptimization): :class:`quapy.data.base.LabelledCollection` (the split itself). """ - def __init__(self, classifier: BaseEstimator, val_split=5): + def __init__(self, classifier: BaseEstimator=None, val_split=5): super().__init__(classifier, val_split) def condition(self, tpr, fpr) -> float: @@ -167,7 +167,7 @@ class MAX(ThresholdOptimization): :class:`quapy.data.base.LabelledCollection` (the split itself). 
""" - def __init__(self, classifier: BaseEstimator, val_split=5): + def __init__(self, classifier: BaseEstimator=None, val_split=5): super().__init__(classifier, val_split) def condition(self, tpr, fpr) -> float: @@ -192,7 +192,7 @@ class X(ThresholdOptimization): :class:`quapy.data.base.LabelledCollection` (the split itself). """ - def __init__(self, classifier: BaseEstimator, val_split=5): + def __init__(self, classifier: BaseEstimator=None, val_split=5): super().__init__(classifier, val_split) def condition(self, tpr, fpr) -> float: @@ -215,7 +215,7 @@ class MS(ThresholdOptimization): `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a :class:`quapy.data.base.LabelledCollection` (the split itself). """ - def __init__(self, classifier: BaseEstimator, val_split=5): + def __init__(self, classifier: BaseEstimator=None, val_split=5): super().__init__(classifier, val_split) def condition(self, tpr, fpr) -> float: @@ -254,7 +254,7 @@ class MS2(MS): `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a :class:`quapy.data.base.LabelledCollection` (the split itself). """ - def __init__(self, classifier: BaseEstimator, val_split=5): + def __init__(self, classifier: BaseEstimator=None, val_split=5): super().__init__(classifier, val_split) def discard(self, tpr, fpr) -> bool: diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index 81165c2..c7a9dbd 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -3,7 +3,6 @@ from copy import deepcopy from typing import Callable, Literal, Union import numpy as np from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling -from scipy import optimize from sklearn.base import BaseEstimator from sklearn.calibration import CalibratedClassifierCV from sklearn.metrics import confusion_matrix @@ -12,7 +11,6 @@ from sklearn.model_selection import cross_val_predict import quapy as qp import quapy.functional as F from quapy.functional import get_divergence -from quapy.classification.calibration import NBVSCalibration, BCTSCalibration, TSCalibration, VSCalibration from quapy.classification.svmperf import SVMperf from quapy.data import LabelledCollection from quapy.method.base import BaseQuantifier, BinaryQuantifier, OneVsAllGeneric @@ -343,8 +341,8 @@ class CC(AggregativeCrispQuantifier): :param classifier: a sklearn's Estimator that generates a classifier """ - def __init__(self, classifier: BaseEstimator): - self.classifier = classifier + def __init__(self, classifier: BaseEstimator=None): + self.classifier = qp._get_classifier(classifier) def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): """ @@ -373,8 +371,8 @@ class PCC(AggregativeSoftQuantifier): :param classifier: a sklearn's Estimator that generates a classifier """ - def __init__(self, classifier: BaseEstimator): - self.classifier = classifier + def __init__(self, classifier: BaseEstimator=None): + self.classifier = qp._get_classifier(classifier) def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): """ @@ -436,14 +434,14 @@ class ACC(AggregativeCrispQuantifier): """ def __init__( self, - classifier: BaseEstimator, + classifier: BaseEstimator=None, val_split=5, solver: Literal['minimize', 'exact', 'exact-raise', 'exact-cc'] = 'minimize', method: Literal['inversion', 'invariant-ratio'] = 'inversion', norm: Literal['clip', 'mapsimplex', 'condsoftmax'] = 'clip', n_jobs=None, ): - 
self.classifier = classifier + self.classifier = qp._get_classifier(classifier) self.val_split = val_split self.n_jobs = qp._get_njobs(n_jobs) self.solver = solver @@ -571,14 +569,14 @@ class PACC(AggregativeSoftQuantifier): """ def __init__( self, - classifier: BaseEstimator, + classifier: BaseEstimator=None, val_split=5, solver: Literal['minimize', 'exact', 'exact-raise', 'exact-cc'] = 'minimize', method: Literal['inversion', 'invariant-ratio'] = 'inversion', norm: Literal['clip', 'mapsimplex', 'condsoftmax'] = 'clip', n_jobs=None ): - self.classifier = classifier + self.classifier = qp._get_classifier(classifier) self.val_split = val_split self.n_jobs = qp._get_njobs(n_jobs) self.solver = solver @@ -668,8 +666,8 @@ class EMQ(AggregativeSoftQuantifier): MAX_ITER = 1000 EPSILON = 1e-4 - def __init__(self, classifier: BaseEstimator, val_split=None, exact_train_prev=True, recalib=None, n_jobs=None): - self.classifier = classifier + def __init__(self, classifier: BaseEstimator=None, val_split=None, exact_train_prev=True, recalib=None, n_jobs=None): + self.classifier = qp._get_classifier(classifier) self.val_split = val_split self.exact_train_prev = exact_train_prev self.recalib = recalib @@ -832,7 +830,7 @@ class BayesianCC(AggregativeCrispQuantifier): :param mcmc_seed: random seed for the MCMC sampler (default 0) """ def __init__(self, - classifier: BaseEstimator, + classifier: BaseEstimator=None, val_split: float = 0.75, num_warmup: int = 500, num_samples: int = 1_000, @@ -849,7 +847,7 @@ class BayesianCC(AggregativeCrispQuantifier): if _bayesian.DEPENDENCIES_INSTALLED is False: raise ImportError("Auxiliary dependencies are required. Run `$ pip install quapy[bayes]` to install them.") - self.classifier = classifier + self.classifier = qp._get_classifier(classifier) self.val_split = val_split self.num_warmup = num_warmup self.num_samples = num_samples @@ -919,8 +917,8 @@ class HDy(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself), or an integer indicating the number of folds (default 5).. """ - def __init__(self, classifier: BaseEstimator, val_split=5): - self.classifier = classifier + def __init__(self, classifier: BaseEstimator=None, val_split=5): + self.classifier = qp._get_classifier(classifier) self.val_split = val_split def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): @@ -995,8 +993,8 @@ class DyS(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): :param n_jobs: number of parallel workers. """ - def __init__(self, classifier: BaseEstimator, val_split=5, n_bins=8, divergence: Union[str, Callable]= 'HD', tol=1e-05, n_jobs=None): - self.classifier = classifier + def __init__(self, classifier: BaseEstimator=None, val_split=5, n_bins=8, divergence: Union[str, Callable]= 'HD', tol=1e-05, n_jobs=None): + self.classifier = qp._get_classifier(classifier) self.val_split = val_split self.tol = tol self.divergence = divergence @@ -1060,8 +1058,8 @@ class SMM(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself), or an integer indicating the number of folds (default 5).. 
""" - def __init__(self, classifier: BaseEstimator, val_split=5): - self.classifier = classifier + def __init__(self, classifier: BaseEstimator=None, val_split=5): + self.classifier = qp._get_classifier(classifier) self.val_split = val_split def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): @@ -1109,9 +1107,9 @@ class DMy(AggregativeSoftQuantifier): :param n_jobs: number of parallel workers (default None) """ - def __init__(self, classifier, val_split=5, nbins=8, divergence: Union[str, Callable]='HD', + def __init__(self, classifier: BaseEstimator=None, val_split=5, nbins=8, divergence: Union[str, Callable]='HD', cdf=False, search='optim_minimize', n_jobs=None): - self.classifier = classifier + self.classifier = qp._get_classifier(classifier) self.val_split = val_split self.nbins = nbins self.divergence = divergence diff --git a/quapy/model_selection.py b/quapy/model_selection.py index 107f303..0c600ca 100644 --- a/quapy/model_selection.py +++ b/quapy/model_selection.py @@ -328,7 +328,7 @@ class GridSearchQ(BaseQuantifier): if self.raise_errors: raise exception else: - return ConfigStatus(params, status) + return ConfigStatus(params, status, msg=str(exception)) try: with timeout(self.timeout):