Add an N_JOBS environment variable, and add a default classifier (sklearn's LogisticRegression) for aggregative quantifiers instantiated without a classifier
parent 9ad36ef008
commit ad11b86168
@@ -1,10 +1,26 @@
 Change Log 0.1.9
 ----------------

 - [TODO] add LeQua2024
-- [TODO] add njobs to env
-- [TODO] add basic examples
-- [TODO] add default classifier to env
-- [TODO] add default classifier to env
+
+- Added a default classifier for aggregative quantifiers, which can now be instantiated without specifying
+  the classifier. The default classifier can be accessed in qp.environ['DEFAULT_CLS'] and is assigned to
+  sklearn.linear_model.LogisticRegression(max_iter=3000). If the classifier is not specified, then a clone
+  of said classifier is returned. E.g.:
+  > pacc = PACC()
+  is equivalent to:
+  > pacc = PACC(classifier=LogisticRegression(max_iter=3000))
+
+- Improved error logging in model selection. In v0.1.8 only Status.INVALID was reported; in v0.1.9 it is
+  now accompanied by a textual description of the error
+
+- The number of parallel workers can now be set via an environment variable by running, e.g.:
+  > N_JOBS=10 python3 your_script.py
+  which has the same effect as writing the following code at the beginning of your_script.py:
+  > import quapy as qp
+  > qp.environ["N_JOBS"] = 10
+
+- Some examples have been added to the ./examples/ dir, which now contains numbered examples from basics (0)
+  to advanced topics (higher numbers)

 - Moved the wiki documents to the ./docs/ folder so that they become editable via PR for the community
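Since every aggregative quantifier now clones qp.environ['DEFAULT_CLS'] when no classifier is given, the default can also be swapped globally before instantiating any method. A minimal sketch (the RandomForest choice is purely illustrative; any sklearn estimator with predict_proba works for soft quantifiers):

    import quapy as qp
    from sklearn.ensemble import RandomForestClassifier

    # replace the project-wide default; quantifiers built afterwards clone this estimator
    qp.environ['DEFAULT_CLS'] = RandomForestClassifier(n_estimators=100)

    pacc = qp.method.aggregative.PACC()  # now backed by a clone of the RandomForest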
@@ -33,9 +33,10 @@ import quapy.functional as F  # <- this module has some functional utilities, li
 print(f'training prevalence = {F.strprev(train.prevalence())}')

 # let us train one quantifier, for example, PACC using a sklearn's Logistic Regressor as the underlying classifier
-classifier = LogisticRegression()
+# classifier = LogisticRegression()

-pacc = qp.method.aggregative.PACC(classifier)
+# pacc = qp.method.aggregative.PACC(classifier)
+pacc = qp.method.aggregative.PACC()

 print(f'training {pacc}')
 pacc.fit(train)
@@ -1,10 +1,7 @@
 import quapy as qp
-from method._kdey import KDEyML
-from quapy.method.non_aggregative import DMx
-from quapy.protocol import APP, UPP
+from quapy.protocol import UPP
 from quapy.method.aggregative import DMy
 from sklearn.linear_model import LogisticRegression
-from examples.comparing_gridsearch import OLD_GridSearchQ
 import numpy as np
 from time import time

@@ -12,10 +9,15 @@ from time import time
 In this example, we show how to perform model selection on a DistributionMatching quantifier.
 """

-model = DMy(LogisticRegression())
+model = DMy()

 qp.environ['SAMPLE_SIZE'] = 100
-qp.environ['N_JOBS'] = -1
+
+print(f'running model selection with N_JOBS={qp.environ["N_JOBS"]}; '
+      f'to increase the number of jobs use:\n> N_JOBS=-1 python3 1.model_selection.py\n'
+      f'alternatively, you can set this variable within the script as:\n'
+      f'import quapy as qp\n'
+      f'qp.environ["N_JOBS"]=-1')

 training, test = qp.datasets.fetch_UCIMulticlassDataset('letter').train_test

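As a usage sketch for the snippet above (assuming the script is saved as 1.model_selection.py, the name used in its own message):

> N_JOBS=10 python3 1.model_selection.py

is equivalent to setting the value programmatically before the grid search starts:

> import quapy as qp
> qp.environ['N_JOBS'] = 10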
@@ -42,7 +44,7 @@ with qp.util.temp_seed(0):
     # different configurations of the quantifier. In other words, quapy avoids to train
     # the classifier 7x7 times.
     param_grid = {
-        'classifier__C': np.logspace(-3,3,7),
+        'classifier__C': np.logspace(-3, 3, 7),
         'nbins': [2, 3, 4, 5, 10, 15, 20]
     }

@@ -7,7 +7,7 @@ import numpy as np
 from sklearn.linear_model import LogisticRegression

 import quapy as qp
-from quapy.method.aggregative import PACC, EMQ, KDEyML
+from quapy.method.aggregative import PACC, EMQ
 from quapy.model_selection import GridSearchQ
 from quapy.protocol import UPP
 from pathlib import Path
@@ -52,6 +52,7 @@ def load_timings(result_path):
     df = pd.read_csv(result_path+'.csv', sep='\t')
     return timings | df.pivot_table(index='Dataset', columns='Method', values='t_train').to_dict()


 if __name__ == '__main__':
+
     qp.environ['SAMPLE_SIZE'] = 500
@@ -1,15 +1,18 @@
 """QuaPy module for quantification"""

+from sklearn.linear_model import LogisticRegression
+
 from quapy.data import datasets
 from . import error
 from . import data
 from . import functional
-# from . import method
+from . import method
 from . import evaluation
 from . import protocol
 from . import plot
 from . import util
 from . import model_selection
 from . import classification
+import os

 __version__ = '0.1.9'

@@ -20,7 +23,8 @@ environ = {
     'PAD_TOKEN': '[PAD]',
     'PAD_INDEX': 1,
     'SVMPERF_HOME': './svm_perf_quantification',
-    'N_JOBS': 1
+    'N_JOBS': int(os.getenv('N_JOBS', 1)),
+    'DEFAULT_CLS': LogisticRegression(max_iter=3000)
 }

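Note that this dictionary literal runs once, at import time, so the environment variable must be exported before quapy is first imported; a minimal sketch of the resulting behaviour:

    import os
    os.environ['N_JOBS'] = '8'   # must precede the first `import quapy`

    import quapy as qp
    print(qp.environ['N_JOBS'])  # -> 8, parsed by int(os.getenv('N_JOBS', 1))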
@@ -48,3 +52,19 @@ def _get_sample_size(sample_size):
     if sample_size is None:
         raise ValueError('neither sample_size nor qp.environ["SAMPLE_SIZE"] have been specified')
     return sample_size
+
+
+def _get_classifier(classifier):
+    """
+    If `classifier` is None, then it returns a clone of `environ['DEFAULT_CLS']`;
+    otherwise, it returns `classifier`.
+
+    :param classifier: sklearn's estimator or None
+    :return: sklearn's estimator
+    """
+    if classifier is None:
+        from sklearn.base import clone
+        classifier = clone(environ['DEFAULT_CLS'])
+    if classifier is None:
+        raise ValueError('neither classifier nor qp.environ["DEFAULT_CLS"] have been specified')
+    return classifier
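A quick sketch of how the new helper behaves, following its definition above:

    import quapy as qp
    from sklearn.linear_model import LogisticRegression

    clf = qp._get_classifier(None)                  # a clone of qp.environ['DEFAULT_CLS']
    clf = qp._get_classifier(LogisticRegression())  # returned unchanged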
@@ -24,12 +24,14 @@ class KDEBase:
         Checks that the bandwidth parameter is correct

         :param bandwidth: either a string (see BANDWIDTH_METHOD) or a float
-        :return: nothing, but raises an exception for invalid values
+        :return: the bandwidth if the check is passed, or raises an exception for invalid values
         """
         assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
             f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
         if isinstance(bandwidth, float):
-            assert 0 < bandwidth < 1, "the bandwith for KDEy should be in (0,1), since this method models the unit simplex"
+            assert 0 < bandwidth < 1, \
+                "the bandwidth for KDEy should be in (0,1), since this method models the unit simplex"
+        return bandwidth

     def get_kde_function(self, X, bandwidth):
         """
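_check_bandwidth now validates and returns its argument, so it can be used inline in assignments. A sketch ('scott' is only an assumed member of BANDWIDTH_METHOD):

    bw = KDEBase._check_bandwidth(0.1)      # returns 0.1
    bw = KDEBase._check_bandwidth('scott')  # ok if 'scott' is in KDEBase.BANDWIDTH_METHOD
    KDEBase._check_bandwidth(1.5)           # AssertionError: bandwidth must lie in (0,1)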
@@ -106,16 +108,13 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
         Alternatively, this set can be specified at fit time by indicating the exact set of data
         on which the predictions are to be generated.
     :param bandwidth: float, the bandwidth of the Kernel
-    :param n_jobs: number of parallel workers
     :param random_state: a seed to be set before fitting any base quantifier (default None)
     """

-    def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None, random_state=None):
-        self._check_bandwidth(bandwidth)
-        self.classifier = classifier
+    def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, random_state=None):
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
-        self.bandwidth = bandwidth
-        self.n_jobs = n_jobs
+        self.bandwidth = KDEBase._check_bandwidth(bandwidth)
         self.random_state=random_state

     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
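With the new defaults, the KDEy quantifiers can be built with no arguments; a sketch:

    from quapy.method.aggregative import KDEyML

    kde = KDEyML()                # a clone of qp.environ['DEFAULT_CLS'], val_split=5, bandwidth=0.1
    kde = KDEyML(bandwidth=0.05)  # the bandwidth is validated (and returned) by _check_bandwidth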
@@ -130,17 +129,17 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
         :param posteriors: instances in the sample converted into posterior probabilities
         :return: a vector of class prevalence estimates
         """
-        np.random.RandomState(self.random_state)
-        epsilon = 1e-10
-        n_classes = len(self.mix_densities)
-        test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities]
-
-        def neg_loglikelihood(prev):
-            test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
-            test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
-            return -np.sum(test_loglikelihood)
-
-        return F.optim_minimize(neg_loglikelihood, n_classes)
+        with qp.util.temp_seed(self.random_state):
+            epsilon = 1e-10
+            n_classes = len(self.mix_densities)
+            test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities]
+
+            def neg_loglikelihood(prev):
+                test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
+                test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
+                return -np.sum(test_loglikelihood)
+
+            return F.optim_minimize(neg_loglikelihood, n_classes)


 class KDEyHD(AggregativeSoftQuantifier, KDEBase):
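The replaced line np.random.RandomState(self.random_state) built a generator and immediately discarded it, so the seed never took effect; qp.util.temp_seed seeds the global generator for the duration of the block instead. A sketch of the intended contract (restoration of the previous state on exit is assumed from the name):

    import numpy as np
    import quapy as qp

    with qp.util.temp_seed(0):
        a = np.random.rand(3)
    with qp.util.temp_seed(0):
        b = np.random.rand(3)

    assert np.allclose(a, b)  # same seed, same draws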
@@ -183,20 +182,17 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase):
         Alternatively, this set can be specified at fit time by indicating the exact set of data
         on which the predictions are to be generated.
     :param bandwidth: float, the bandwidth of the Kernel
-    :param n_jobs: number of parallel workers
     :param random_state: a seed to be set before fitting any base quantifier (default None)
     :param montecarlo_trials: number of Monte Carlo trials (default 10000)
     """

-    def __init__(self, classifier: BaseEstimator, val_split=10, divergence: str='HD',
-                 bandwidth=0.1, n_jobs=None, random_state=None, montecarlo_trials=10000):
-
-        self._check_bandwidth(bandwidth)
-        self.classifier = classifier
+    def __init__(self, classifier: BaseEstimator=None, val_split=5, divergence: str='HD',
+                 bandwidth=0.1, random_state=None, montecarlo_trials=10000):
+
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
         self.divergence = divergence
-        self.bandwidth = bandwidth
-        self.n_jobs = n_jobs
+        self.bandwidth = KDEBase._check_bandwidth(bandwidth)
         self.random_state=random_state
         self.montecarlo_trials = montecarlo_trials

@@ -278,15 +274,12 @@ class KDEyCS(AggregativeSoftQuantifier):
         Alternatively, this set can be specified at fit time by indicating the exact set of data
         on which the predictions are to be generated.
     :param bandwidth: float, the bandwidth of the Kernel
-    :param n_jobs: number of parallel workers
     """

-    def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None):
-        KDEBase._check_bandwidth(bandwidth)
-        self.classifier = classifier
+    def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1):
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
-        self.bandwidth = bandwidth
-        self.n_jobs = n_jobs
+        self.bandwidth = KDEBase._check_bandwidth(bandwidth)

     def gram_matrix_mix_sum(self, X, Y=None):
         # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
@@ -355,7 +348,7 @@ class KDEyCS(AggregativeSoftQuantifier):
         # called \overline{r} in the paper
         alpha_ratio = alpha * self.counts_inv

-        # recal that tr_te_sums already accounts for the constant terms (1/Li)*(1/M)
+        # recall that tr_te_sums already accounts for the constant terms (1/Li)*(1/M)
         partA = -np.log((alpha_ratio @ tr_te_sums) * Minv)
         partB = 0.5 * np.log(alpha_ratio @ tr_tr_sums @ alpha_ratio)
         return partA + partB  #+ partC
@@ -27,8 +27,8 @@ class ThresholdOptimization(BinaryAggregativeQuantifier):
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    """

-    def __init__(self, classifier: BaseEstimator, val_split=None, n_jobs=None):
-        self.classifier = classifier
+    def __init__(self, classifier: BaseEstimator=None, val_split=None, n_jobs=None):
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
         self.n_jobs = qp._get_njobs(n_jobs)

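The threshold-based variants below inherit this default, so they too become instantiable without arguments; a sketch (assuming T50 remains exposed through quapy.method.aggregative):

    from quapy.method.aggregative import T50

    t50 = T50()  # classifier defaults to a clone of qp.environ['DEFAULT_CLS']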
@@ -143,7 +143,7 @@ class T50(ThresholdOptimization):
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    """

-    def __init__(self, classifier: BaseEstimator, val_split=5):
+    def __init__(self, classifier: BaseEstimator=None, val_split=5):
         super().__init__(classifier, val_split)

     def condition(self, tpr, fpr) -> float:

@@ -167,7 +167,7 @@ class MAX(ThresholdOptimization):
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    """

-    def __init__(self, classifier: BaseEstimator, val_split=5):
+    def __init__(self, classifier: BaseEstimator=None, val_split=5):
         super().__init__(classifier, val_split)

     def condition(self, tpr, fpr) -> float:

@@ -192,7 +192,7 @@ class X(ThresholdOptimization):
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    """

-    def __init__(self, classifier: BaseEstimator, val_split=5):
+    def __init__(self, classifier: BaseEstimator=None, val_split=5):
         super().__init__(classifier, val_split)

     def condition(self, tpr, fpr) -> float:

@@ -215,7 +215,7 @@ class MS(ThresholdOptimization):
        `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    """
-    def __init__(self, classifier: BaseEstimator, val_split=5):
+    def __init__(self, classifier: BaseEstimator=None, val_split=5):
         super().__init__(classifier, val_split)

     def condition(self, tpr, fpr) -> float:

@@ -254,7 +254,7 @@ class MS2(MS):
        `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    """
-    def __init__(self, classifier: BaseEstimator, val_split=5):
+    def __init__(self, classifier: BaseEstimator=None, val_split=5):
         super().__init__(classifier, val_split)

     def discard(self, tpr, fpr) -> bool:
@@ -3,7 +3,6 @@ from copy import deepcopy
 from typing import Callable, Literal, Union
 import numpy as np
 from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling
-from scipy import optimize
 from sklearn.base import BaseEstimator
 from sklearn.calibration import CalibratedClassifierCV
 from sklearn.metrics import confusion_matrix

@@ -12,7 +11,6 @@ from sklearn.model_selection import cross_val_predict
 import quapy as qp
 import quapy.functional as F
 from quapy.functional import get_divergence
-from quapy.classification.calibration import NBVSCalibration, BCTSCalibration, TSCalibration, VSCalibration
 from quapy.classification.svmperf import SVMperf
 from quapy.data import LabelledCollection
 from quapy.method.base import BaseQuantifier, BinaryQuantifier, OneVsAllGeneric
@@ -343,8 +341,8 @@ class CC(AggregativeCrispQuantifier):
    :param classifier: a sklearn's Estimator that generates a classifier
    """

-    def __init__(self, classifier: BaseEstimator):
-        self.classifier = classifier
+    def __init__(self, classifier: BaseEstimator=None):
+        self.classifier = qp._get_classifier(classifier)

     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
         """

@@ -373,8 +371,8 @@ class PCC(AggregativeSoftQuantifier):
    :param classifier: a sklearn's Estimator that generates a classifier
    """

-    def __init__(self, classifier: BaseEstimator):
-        self.classifier = classifier
+    def __init__(self, classifier: BaseEstimator=None):
+        self.classifier = qp._get_classifier(classifier)

     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
         """
@@ -436,14 +434,14 @@ class ACC(AggregativeCrispQuantifier):
    """
    def __init__(
            self,
-            classifier: BaseEstimator,
+            classifier: BaseEstimator=None,
            val_split=5,
            solver: Literal['minimize', 'exact', 'exact-raise', 'exact-cc'] = 'minimize',
            method: Literal['inversion', 'invariant-ratio'] = 'inversion',
            norm: Literal['clip', 'mapsimplex', 'condsoftmax'] = 'clip',
            n_jobs=None,
    ):
-        self.classifier = classifier
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
         self.n_jobs = qp._get_njobs(n_jobs)
         self.solver = solver

@@ -571,14 +569,14 @@ class PACC(AggregativeSoftQuantifier):
    """
    def __init__(
            self,
-            classifier: BaseEstimator,
+            classifier: BaseEstimator=None,
            val_split=5,
            solver: Literal['minimize', 'exact', 'exact-raise', 'exact-cc'] = 'minimize',
            method: Literal['inversion', 'invariant-ratio'] = 'inversion',
            norm: Literal['clip', 'mapsimplex', 'condsoftmax'] = 'clip',
            n_jobs=None
    ):
-        self.classifier = classifier
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
         self.n_jobs = qp._get_njobs(n_jobs)
         self.solver = solver
@@ -668,8 +666,8 @@ class EMQ(AggregativeSoftQuantifier):
     MAX_ITER = 1000
     EPSILON = 1e-4

-    def __init__(self, classifier: BaseEstimator, val_split=None, exact_train_prev=True, recalib=None, n_jobs=None):
-        self.classifier = classifier
+    def __init__(self, classifier: BaseEstimator=None, val_split=None, exact_train_prev=True, recalib=None, n_jobs=None):
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
         self.exact_train_prev = exact_train_prev
         self.recalib = recalib
@@ -832,7 +830,7 @@ class BayesianCC(AggregativeCrispQuantifier):
    :param mcmc_seed: random seed for the MCMC sampler (default 0)
    """
    def __init__(self,
-                 classifier: BaseEstimator,
+                 classifier: BaseEstimator=None,
                 val_split: float = 0.75,
                 num_warmup: int = 500,
                 num_samples: int = 1_000,

@@ -849,7 +847,7 @@ class BayesianCC(AggregativeCrispQuantifier):
        if _bayesian.DEPENDENCIES_INSTALLED is False:
            raise ImportError("Auxiliary dependencies are required. Run `$ pip install quapy[bayes]` to install them.")

-        self.classifier = classifier
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
         self.num_warmup = num_warmup
         self.num_samples = num_samples
@@ -919,8 +917,8 @@ class HDy(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
        validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself), or an integer indicating the number of folds (default 5)..
    """

-    def __init__(self, classifier: BaseEstimator, val_split=5):
-        self.classifier = classifier
+    def __init__(self, classifier: BaseEstimator=None, val_split=5):
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split

     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):

@@ -995,8 +993,8 @@ class DyS(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
    :param n_jobs: number of parallel workers.
    """

-    def __init__(self, classifier: BaseEstimator, val_split=5, n_bins=8, divergence: Union[str, Callable]='HD', tol=1e-05, n_jobs=None):
-        self.classifier = classifier
+    def __init__(self, classifier: BaseEstimator=None, val_split=5, n_bins=8, divergence: Union[str, Callable]='HD', tol=1e-05, n_jobs=None):
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
         self.tol = tol
         self.divergence = divergence

@@ -1060,8 +1058,8 @@ class SMM(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
        validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself), or an integer indicating the number of folds (default 5)..
    """

-    def __init__(self, classifier: BaseEstimator, val_split=5):
-        self.classifier = classifier
+    def __init__(self, classifier: BaseEstimator=None, val_split=5):
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split

     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):

@@ -1109,9 +1107,9 @@ class DMy(AggregativeSoftQuantifier):
    :param n_jobs: number of parallel workers (default None)
    """

-    def __init__(self, classifier, val_split=5, nbins=8, divergence: Union[str, Callable]='HD',
+    def __init__(self, classifier: BaseEstimator=None, val_split=5, nbins=8, divergence: Union[str, Callable]='HD',
                 cdf=False, search='optim_minimize', n_jobs=None):
-        self.classifier = classifier
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
         self.nbins = nbins
         self.divergence = divergence
@@ -328,7 +328,7 @@ class GridSearchQ(BaseQuantifier):
             if self.raise_errors:
                 raise exception
             else:
-                return ConfigStatus(params, status)
+                return ConfigStatus(params, status, msg=str(exception))

         try:
             with timeout(self.timeout):
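This is the change behind the improved error logging noted in the changelog: a failed configuration now carries the exception text alongside its status. Illustratively (the params dict and the Status member are hypothetical):

    # v0.1.8: only the status was recorded
    #   return ConfigStatus(params, status)
    # v0.1.9: the error message travels with it, e.g.
    #   ConfigStatus({'classifier__C': 1.0, 'nbins': 8}, Status.ERROR, msg="ValueError: ...")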