adding environment variables for N_JOBS, and adding a default classifier (sklearn's logistic regression) for when the classifier is not specified in aggregative quantifiers

This commit is contained in:
Alejandro Moreo Fernandez 2024-05-30 10:53:53 +02:00
parent 9ad36ef008
commit ad11b86168
9 changed files with 108 additions and 77 deletions

View File

@@ -1,10 +1,26 @@
Change Log 0.1.9
----------------
- [TODO] add LeQua2024
- [TODO] add njobs to env
- [TODO] add basic examples
- [TODO] add default classifier to env
- Added a default classifier for aggregative quantifiers, which can now be instantiated without specifying
the classifier. The default classifier can be accessed via qp.environ['DEFAULT_CLS'] and is set to
sklearn.linear_model.LogisticRegression(max_iter=3000). If the classifier is not specified, a clone
of this default classifier is used. E.g.:
> pacc = PACC()
is equivalent to:
> pacc = PACC(classifier=LogisticRegression(max_iter=3000))
- Improved error logging in model selection. In v0.1.8 only Status.INVALID was reported; in v0.1.9 it is
now accompanied by a textual description of the error
- The number of parallel workers can now be set via an environment variable by running, e.g.:
> N_JOBS=10 python3 your_script.py
which has the same effect as writing the following code at the beginning of your_script.py:
> import quapy as qp
> qp.environ["N_JOBS"] = 10
- Some examples have been added to the ./examples/ dir, which now contains numbered examples from basics (0)
to advanced topics (higher numbers)
- Moved the wiki documents to the ./docs/ folder so that they can be edited by the community via PRs
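The entries above can be combined; a minimal sketch of the new defaults in action (the dataset loader is just the one used elsewhere in this commit, any other would do):
> import quapy as qp
> from quapy.method.aggregative import PACC
> qp.environ['SAMPLE_SIZE'] = 100
> pacc = PACC()  # clone of qp.environ['DEFAULT_CLS'], i.e., LogisticRegression(max_iter=3000)
> train, test = qp.datasets.fetch_UCIMulticlassDataset('letter').train_test
> pacc.fit(train)
> estim_prevalence = pacc.quantify(test.instances)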

View File

@@ -33,9 +33,10 @@ import quapy.functional as F # <- this module has some functional utilities, li
print(f'training prevalence = {F.strprev(train.prevalence())}')
# let us train one quantifier, for example, PACC using sklearn's LogisticRegression as the underlying classifier
classifier = LogisticRegression()
# classifier = LogisticRegression()
pacc = qp.method.aggregative.PACC(classifier)
# pacc = qp.method.aggregative.PACC(classifier)
pacc = qp.method.aggregative.PACC()
print(f'training {pacc}')
pacc.fit(train)

View File

@@ -1,10 +1,7 @@
import quapy as qp
from method._kdey import KDEyML
from quapy.method.non_aggregative import DMx
from quapy.protocol import APP, UPP
from quapy.protocol import UPP
from quapy.method.aggregative import DMy
from sklearn.linear_model import LogisticRegression
from examples.comparing_gridsearch import OLD_GridSearchQ
import numpy as np
from time import time
@@ -12,10 +9,15 @@ from time import time
In this example, we show how to perform model selection on a DistributionMatching quantifier.
"""
model = DMy(LogisticRegression())
model = DMy()
qp.environ['SAMPLE_SIZE'] = 100
qp.environ['N_JOBS'] = -1
print(f'running model selection with N_JOBS={qp.environ["N_JOBS"]}; '
f'to increase the number of jobs use:\n> N_JOBS=-1 python3 1.model_selection.py\n'
f'alternatively, you can set this variable within the script as:\n'
f'import quapy as qp\n'
f'qp.environ["N_JOBS"]=-1')
training, test = qp.datasets.fetch_UCIMulticlassDataset('letter').train_test
@@ -42,7 +44,7 @@ with qp.util.temp_seed(0):
# different configurations of the quantifier. In other words, quapy avoids training
# the classifier 7x7 times.
param_grid = {
'classifier__C': np.logspace(-3,3,7),
'classifier__C': np.logspace(-3, 3, 7),
'nbins': [2, 3, 4, 5, 10, 15, 20]
}
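For context, a sketch of how this grid is then handed to GridSearchQ (the error metric, protocol, and the held-out `validation` collection are illustrative choices, not prescribed by this example):
> model = qp.model_selection.GridSearchQ(
>     model, param_grid, protocol=UPP(validation), error='mae', refit=False, verbose=True
> ).fit(training)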

View File

@@ -7,7 +7,7 @@ import numpy as np
from sklearn.linear_model import LogisticRegression
import quapy as qp
from quapy.method.aggregative import PACC, EMQ, KDEyML
from quapy.method.aggregative import PACC, EMQ
from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP
from pathlib import Path
@@ -52,6 +52,7 @@ def load_timings(result_path):
df = pd.read_csv(result_path+'.csv', sep='\t')
return timings | df.pivot_table(index='Dataset', columns='Method', values='t_train').to_dict()
if __name__ == '__main__':
qp.environ['SAMPLE_SIZE'] = 500

View File

@@ -1,15 +1,18 @@
"""QuaPy module for quantification"""
from sklearn.linear_model import LogisticRegression
from quapy.data import datasets
from . import error
from . import data
from . import functional
# from . import method
from . import method
from . import evaluation
from . import protocol
from . import plot
from . import util
from . import model_selection
from . import classification
import os
__version__ = '0.1.9'
@@ -20,7 +23,8 @@ environ = {
'PAD_TOKEN': '[PAD]',
'PAD_INDEX': 1,
'SVMPERF_HOME': './svm_perf_quantification',
'N_JOBS': 1
'N_JOBS': int(os.getenv('N_JOBS', 1)),
'DEFAULT_CLS': LogisticRegression(max_iter=3000)
}
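Both new entries can also be overridden programmatically; a small sketch (values illustrative):
> import quapy as qp
> from sklearn.linear_model import LogisticRegression
> qp.environ['N_JOBS'] = 4  # takes precedence over the value read from the environment at import time
> qp.environ['DEFAULT_CLS'] = LogisticRegression(max_iter=1000, C=10)  # cloned by quantifiers created afterwards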
@@ -48,3 +52,19 @@ def _get_sample_size(sample_size):
if sample_size is None:
raise ValueError('neither sample_size nor qp.environ["SAMPLE_SIZE"] have been specified')
return sample_size
def _get_classifier(classifier):
"""
If `classifier` is None, then it returns a clone of `environ['DEFAULT_CLS']`;
otherwise, it returns `classifier` unchanged.
:param classifier: sklearn's estimator or None
:return: sklearn's estimator
"""
if classifier is None:
from sklearn.base import clone
classifier = clone(environ['DEFAULT_CLS'])
if classifier is None:
raise ValueError('neither classifier nor qp.environ["DEFAULT_CLS"] have been specified')
return classifier
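For illustration, this is the pattern the aggregative quantifiers below adopt (MyQuantifier is a hypothetical class, not part of QuaPy):
> import quapy as qp
> class MyQuantifier:
>     def __init__(self, classifier=None):
>         # None resolves to a clone of qp.environ['DEFAULT_CLS']
>         self.classifier = qp._get_classifier(classifier)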

View File

@@ -24,12 +24,14 @@ class KDEBase:
Checks that the bandwidth parameter is correct
:param bandwidth: either a string (see BANDWIDTH_METHOD) or a float
:return: nothing, but raises an exception for invalid values
:return: the bandwidth if the check is passed, or raises an exception for invalid values
"""
assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
if isinstance(bandwidth, float):
assert 0 < bandwidth < 1, "the bandwidth for KDEy should be in (0,1), since this method models the unit simplex"
assert 0 < bandwidth < 1, \
"the bandwidth for KDEy should be in (0,1), since this method models the unit simplex"
return bandwidth
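For instance, in the float case (string values must instead belong to KDEBase.BANDWIDTH_METHOD):
> bandwidth = KDEBase._check_bandwidth(0.1)  # passes the check and returns 0.1
> KDEBase._check_bandwidth(1.5)  # raises AssertionError, since 1.5 is not in (0,1)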
def get_kde_function(self, X, bandwidth):
"""
@@ -106,16 +108,13 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
Alternatively, this set can be specified at fit time by indicating the exact set of data
on which the predictions are to be generated.
:param bandwidth: float, the bandwidth of the Kernel
:param n_jobs: number of parallel workers
:param random_state: a seed to be set before fitting any base quantifier (default None)
"""
def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None, random_state=None):
self._check_bandwidth(bandwidth)
self.classifier = classifier
def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, random_state=None):
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.bandwidth = bandwidth
self.n_jobs = n_jobs
self.bandwidth = KDEBase._check_bandwidth(bandwidth)
self.random_state=random_state
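With these defaults in place, the quantifier can now be built without arguments; e.g.:
> kde = KDEyML()  # equivalent to KDEyML(classifier=clone(qp.environ['DEFAULT_CLS']), val_split=5)
> kde = KDEyML(bandwidth=0.05)  # default classifier, custom bandwidth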
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
@@ -130,17 +129,17 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
:param posteriors: instances in the sample converted into posterior probabilities
:return: a vector of class prevalence estimates
"""
np.random.RandomState(self.random_state)
epsilon = 1e-10
n_classes = len(self.mix_densities)
test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities]
with qp.util.temp_seed(self.random_state):
    epsilon = 1e-10
    n_classes = len(self.mix_densities)
    test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities]
def neg_loglikelihood(prev):
    test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
    test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
    return -np.sum(test_loglikelihood)
    def neg_loglikelihood(prev):
        test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
        test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
        return -np.sum(test_loglikelihood)
return F.optim_minimize(neg_loglikelihood, n_classes)
    return F.optim_minimize(neg_loglikelihood, n_classes)
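Note that the removed call np.random.RandomState(self.random_state) only constructed a generator object without seeding the global state, so it had no effect; qp.util.temp_seed seeds the enclosed block and restores the previous random state on exit. A minimal sketch of the intended behaviour:
> import numpy as np
> import quapy as qp
> with qp.util.temp_seed(0):
>     a = np.random.rand(3)  # reproducible: identical on every run
> b = np.random.rand(3)  # the outer random state is restored afterwards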
class KDEyHD(AggregativeSoftQuantifier, KDEBase):
@@ -183,20 +182,17 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase):
Alternatively, this set can be specified at fit time by indicating the exact set of data
on which the predictions are to be generated.
:param bandwidth: float, the bandwidth of the Kernel
:param n_jobs: number of parallel workers
:param random_state: a seed to be set before fitting any base quantifier (default None)
:param montecarlo_trials: number of Monte Carlo trials (default 10000)
"""
def __init__(self, classifier: BaseEstimator, val_split=10, divergence: str='HD',
bandwidth=0.1, n_jobs=None, random_state=None, montecarlo_trials=10000):
def __init__(self, classifier: BaseEstimator=None, val_split=5, divergence: str='HD',
bandwidth=0.1, random_state=None, montecarlo_trials=10000):
self._check_bandwidth(bandwidth)
self.classifier = classifier
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.divergence = divergence
self.bandwidth = bandwidth
self.n_jobs = n_jobs
self.bandwidth = KDEBase._check_bandwidth(bandwidth)
self.random_state=random_state
self.montecarlo_trials = montecarlo_trials
@@ -278,15 +274,12 @@ class KDEyCS(AggregativeSoftQuantifier):
Alternatively, this set can be specified at fit time by indicating the exact set of data
on which the predictions are to be generated.
:param bandwidth: float, the bandwidth of the Kernel
:param n_jobs: number of parallel workers
"""
def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None):
KDEBase._check_bandwidth(bandwidth)
self.classifier = classifier
def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1):
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.bandwidth = bandwidth
self.n_jobs = n_jobs
self.bandwidth = KDEBase._check_bandwidth(bandwidth)
def gram_matrix_mix_sum(self, X, Y=None):
# this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
@@ -355,7 +348,7 @@ class KDEyCS(AggregativeSoftQuantifier):
# called \overline{r} in the paper
alpha_ratio = alpha * self.counts_inv
# recal that tr_te_sums already accounts for the constant terms (1/Li)*(1/M)
# recall that tr_te_sums already accounts for the constant terms (1/Li)*(1/M)
partA = -np.log((alpha_ratio @ tr_te_sums) * Minv)
partB = 0.5 * np.log(alpha_ratio @ tr_tr_sums @ alpha_ratio)
return partA + partB #+ partC

View File

@@ -27,8 +27,8 @@ class ThresholdOptimization(BinaryAggregativeQuantifier):
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
def __init__(self, classifier: BaseEstimator, val_split=None, n_jobs=None):
self.classifier = classifier
def __init__(self, classifier: BaseEstimator=None, val_split=None, n_jobs=None):
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.n_jobs = qp._get_njobs(n_jobs)
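As with the rest of the family, the threshold-based quantifiers below can now be instantiated without an explicit classifier; e.g. (assuming the usual re-export of these classes under quapy.method.aggregative):
> from quapy.method.aggregative import T50, MS2
> t50 = T50()  # clone of qp.environ['DEFAULT_CLS']
> ms2 = MS2(val_split=10)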
@@ -143,7 +143,7 @@ class T50(ThresholdOptimization):
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
def __init__(self, classifier: BaseEstimator, val_split=5):
def __init__(self, classifier: BaseEstimator=None, val_split=5):
super().__init__(classifier, val_split)
def condition(self, tpr, fpr) -> float:
@@ -167,7 +167,7 @@ class MAX(ThresholdOptimization):
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
def __init__(self, classifier: BaseEstimator, val_split=5):
def __init__(self, classifier: BaseEstimator=None, val_split=5):
super().__init__(classifier, val_split)
def condition(self, tpr, fpr) -> float:
@@ -192,7 +192,7 @@ class X(ThresholdOptimization):
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
def __init__(self, classifier: BaseEstimator, val_split=5):
def __init__(self, classifier: BaseEstimator=None, val_split=5):
super().__init__(classifier, val_split)
def condition(self, tpr, fpr) -> float:
@@ -215,7 +215,7 @@ class MS(ThresholdOptimization):
`k`-fold cross validation (this integer stands for the number of folds `k`, defaults to 5), or as a
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
def __init__(self, classifier: BaseEstimator, val_split=5):
def __init__(self, classifier: BaseEstimator=None, val_split=5):
super().__init__(classifier, val_split)
def condition(self, tpr, fpr) -> float:
@@ -254,7 +254,7 @@ class MS2(MS):
`k`-fold cross validation (this integer stands for the number of folds `k`, defaults to 5), or as a
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
def __init__(self, classifier: BaseEstimator, val_split=5):
def __init__(self, classifier: BaseEstimator=None, val_split=5):
super().__init__(classifier, val_split)
def discard(self, tpr, fpr) -> bool:

View File

@@ -3,7 +3,6 @@ from copy import deepcopy
from typing import Callable, Literal, Union
import numpy as np
from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling
from scipy import optimize
from sklearn.base import BaseEstimator
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix
@@ -12,7 +11,6 @@ from sklearn.model_selection import cross_val_predict
import quapy as qp
import quapy.functional as F
from quapy.functional import get_divergence
from quapy.classification.calibration import NBVSCalibration, BCTSCalibration, TSCalibration, VSCalibration
from quapy.classification.svmperf import SVMperf
from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier, BinaryQuantifier, OneVsAllGeneric
@@ -343,8 +341,8 @@ class CC(AggregativeCrispQuantifier):
:param classifier: a sklearn Estimator that generates a classifier
"""
def __init__(self, classifier: BaseEstimator):
self.classifier = classifier
def __init__(self, classifier: BaseEstimator=None):
self.classifier = qp._get_classifier(classifier)
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
"""
@@ -373,8 +371,8 @@ class PCC(AggregativeSoftQuantifier):
:param classifier: a sklearn Estimator that generates a classifier
"""
def __init__(self, classifier: BaseEstimator):
self.classifier = classifier
def __init__(self, classifier: BaseEstimator=None):
self.classifier = qp._get_classifier(classifier)
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
"""
@@ -436,14 +434,14 @@ class ACC(AggregativeCrispQuantifier):
"""
def __init__(
self,
classifier: BaseEstimator,
classifier: BaseEstimator=None,
val_split=5,
solver: Literal['minimize', 'exact', 'exact-raise', 'exact-cc'] = 'minimize',
method: Literal['inversion', 'invariant-ratio'] = 'inversion',
norm: Literal['clip', 'mapsimplex', 'condsoftmax'] = 'clip',
n_jobs=None,
):
self.classifier = classifier
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.n_jobs = qp._get_njobs(n_jobs)
self.solver = solver
@@ -571,14 +569,14 @@ class PACC(AggregativeSoftQuantifier):
"""
def __init__(
self,
classifier: BaseEstimator,
classifier: BaseEstimator=None,
val_split=5,
solver: Literal['minimize', 'exact', 'exact-raise', 'exact-cc'] = 'minimize',
method: Literal['inversion', 'invariant-ratio'] = 'inversion',
norm: Literal['clip', 'mapsimplex', 'condsoftmax'] = 'clip',
n_jobs=None
):
self.classifier = classifier
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.n_jobs = qp._get_njobs(n_jobs)
self.solver = solver
@@ -668,8 +666,8 @@ class EMQ(AggregativeSoftQuantifier):
MAX_ITER = 1000
EPSILON = 1e-4
def __init__(self, classifier: BaseEstimator, val_split=None, exact_train_prev=True, recalib=None, n_jobs=None):
self.classifier = classifier
def __init__(self, classifier: BaseEstimator=None, val_split=None, exact_train_prev=True, recalib=None, n_jobs=None):
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.exact_train_prev = exact_train_prev
self.recalib = recalib
@@ -832,7 +830,7 @@ class BayesianCC(AggregativeCrispQuantifier):
:param mcmc_seed: random seed for the MCMC sampler (default 0)
"""
def __init__(self,
classifier: BaseEstimator,
classifier: BaseEstimator=None,
val_split: float = 0.75,
num_warmup: int = 500,
num_samples: int = 1_000,
@@ -849,7 +847,7 @@ class BayesianCC(AggregativeCrispQuantifier):
if _bayesian.DEPENDENCIES_INSTALLED is False:
raise ImportError("Auxiliary dependencies are required. Run `$ pip install quapy[bayes]` to install them.")
self.classifier = classifier
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.num_warmup = num_warmup
self.num_samples = num_samples
@@ -919,8 +917,8 @@ class HDy(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself), or an integer indicating the number of folds (default 5).
"""
def __init__(self, classifier: BaseEstimator, val_split=5):
self.classifier = classifier
def __init__(self, classifier: BaseEstimator=None, val_split=5):
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
@@ -995,8 +993,8 @@ class DyS(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
:param n_jobs: number of parallel workers.
"""
def __init__(self, classifier: BaseEstimator, val_split=5, n_bins=8, divergence: Union[str, Callable]= 'HD', tol=1e-05, n_jobs=None):
self.classifier = classifier
def __init__(self, classifier: BaseEstimator=None, val_split=5, n_bins=8, divergence: Union[str, Callable]= 'HD', tol=1e-05, n_jobs=None):
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.tol = tol
self.divergence = divergence
@@ -1060,8 +1058,8 @@ class SMM(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself), or an integer indicating the number of folds (default 5).
"""
def __init__(self, classifier: BaseEstimator, val_split=5):
self.classifier = classifier
def __init__(self, classifier: BaseEstimator=None, val_split=5):
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
@@ -1109,9 +1107,9 @@ class DMy(AggregativeSoftQuantifier):
:param n_jobs: number of parallel workers (default None)
"""
def __init__(self, classifier, val_split=5, nbins=8, divergence: Union[str, Callable]='HD',
def __init__(self, classifier: BaseEstimator=None, val_split=5, nbins=8, divergence: Union[str, Callable]='HD',
cdf=False, search='optim_minimize', n_jobs=None):
self.classifier = classifier
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.nbins = nbins
self.divergence = divergence

View File

@@ -328,7 +328,7 @@ class GridSearchQ(BaseQuantifier):
if self.raise_errors:
raise exception
else:
return ConfigStatus(params, status)
return ConfigStatus(params, status, msg=str(exception))
try:
with timeout(self.timeout):
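From the user's perspective, configurations explored with raise_errors=False that fail are now reported together with the text of the exception that caused them, rather than a bare Status.INVALID; a sketch (grid, protocol, and log format are illustrative):
> model = GridSearchQ(PACC(), param_grid, protocol=UPP(val),
>                     raise_errors=False, verbose=True).fit(train)
> # failing configurations are logged with a textual description of the error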