adding environment variables for N_JOBS, and adding a default classifier (sklearn's logistic regression) for when the classifier is not specified in aggregative quantifiers

This commit is contained in:
Alejandro Moreo Fernandez 2024-05-30 10:53:53 +02:00
parent 9ad36ef008
commit ad11b86168
9 changed files with 108 additions and 77 deletions

View File

@ -1,10 +1,26 @@
Change Log 0.1.9
----------------
- [TODO] add LeQua2024
- [TODO] add njobs to env
- [TODO] add basic examples
- [TODO] add default classifier to env
- [TODO] add default classifier to env
- Added a default classifier for aggregative quantifiers, which now can be instantiated without specifying
the classifier. The default classifier can be accessed in qp.environ['DEFAULT_CLS'] and is assigned to
sklearn.linear_model.LogisticRegression(max_iter=3000). If the classifier is not specified, then a clone
of said classifier is returned. E.g.:
> pacc = PACC()
is equivalent to:
> pacc = PACC(classifier=LogisticRegression(max_iter=3000))
- Improved error logging in model selection. In v0.1.8 only Status.INVALID was reported; in v0.1.9 it is
now accompanied by a textual description of the error
- The number of parallel workers can now be set via an environment variable by running, e.g.:
> N_JOBS=10 python3 your_script.py
which has the same effect as writing the following code at the beginning of your_script.py:
> import quapy as qp
> qp.environ["N_JOBS"] = 10
- Some examples have been added to the ./examples/ dir, which now contains numbered examples from basics (0)
to advanced topics (higher numbers)
- Moved the wiki documents to the ./docs/ folder so that they become editable via PR for the community

View File

@ -33,9 +33,10 @@ import quapy.functional as F # <- this module has some functional utilities, li
print(f'training prevalence = {F.strprev(train.prevalence())}')
# let us train one quantifier, for example, PACC using a sklearn's Logistic Regressor as the underlying classifier
classifier = LogisticRegression()
# classifier = LogisticRegression()
pacc = qp.method.aggregative.PACC(classifier)
# pacc = qp.method.aggregative.PACC(classifier)
pacc = qp.method.aggregative.PACC()
print(f'training {pacc}')
pacc.fit(train)

View File

@ -1,10 +1,7 @@
import quapy as qp
from method._kdey import KDEyML
from quapy.method.non_aggregative import DMx
from quapy.protocol import APP, UPP
from quapy.protocol import UPP
from quapy.method.aggregative import DMy
from sklearn.linear_model import LogisticRegression
from examples.comparing_gridsearch import OLD_GridSearchQ
import numpy as np
from time import time
@ -12,10 +9,15 @@ from time import time
In this example, we show how to perform model selection on a DistributionMatching quantifier.
"""
model = DMy(LogisticRegression())
model = DMy()
qp.environ['SAMPLE_SIZE'] = 100
qp.environ['N_JOBS'] = -1
print(f'running model selection with N_JOBS={qp.environ["N_JOBS"]}; '
f'to increase the number of jobs use:\n> N_JOBS=-1 python3 1.model_selection.py\n'
f'alternatively, you can set this variable within the script as:\n'
f'import quapy as qp\n'
f'qp.environ["N_JOBS"]=-1')
training, test = qp.datasets.fetch_UCIMulticlassDataset('letter').train_test

View File

@ -7,7 +7,7 @@ import numpy as np
from sklearn.linear_model import LogisticRegression
import quapy as qp
from quapy.method.aggregative import PACC, EMQ, KDEyML
from quapy.method.aggregative import PACC, EMQ
from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP
from pathlib import Path
@ -52,6 +52,7 @@ def load_timings(result_path):
df = pd.read_csv(result_path+'.csv', sep='\t')
return timings | df.pivot_table(index='Dataset', columns='Method', values='t_train').to_dict()
if __name__ == '__main__':
qp.environ['SAMPLE_SIZE'] = 500

View File

@ -1,15 +1,18 @@
"""QuaPy module for quantification"""
from sklearn.linear_model import LogisticRegression
from quapy.data import datasets
from . import error
from . import data
from . import functional
# from . import method
from . import method
from . import evaluation
from . import protocol
from . import plot
from . import util
from . import model_selection
from . import classification
import os
__version__ = '0.1.9'
@ -20,7 +23,8 @@ environ = {
'PAD_TOKEN': '[PAD]',
'PAD_INDEX': 1,
'SVMPERF_HOME': './svm_perf_quantification',
'N_JOBS': 1
'N_JOBS': int(os.getenv('N_JOBS', 1)),
'DEFAULT_CLS': LogisticRegression(max_iter=3000)
}
@ -48,3 +52,19 @@ def _get_sample_size(sample_size):
if sample_size is None:
raise ValueError('neither sample_size nor qp.environ["SAMPLE_SIZE"] have been specified')
return sample_size
def _get_classifier(classifier):
"""
If `classifier` is None, then it returns `environ['DEFAULT_CLS']`;
if otherwise, returns `classifier`.
:param classifier: sklearn's estimator or None
:return: sklearn's estimator
"""
if classifier is None:
from sklearn.base import clone
classifier = clone(environ['DEFAULT_CLS'])
if classifier is None:
raise ValueError('neither classifier nor qp.environ["DEFAULT_CLS"] have been specified')
return classifier

View File

@ -24,12 +24,14 @@ class KDEBase:
Checks that the bandwidth parameter is correct
:param bandwidth: either a string (see BANDWIDTH_METHOD) or a float
:return: nothing, but raises an exception for invalid values
:return: the bandwidth if the check is passed, or raises an exception for invalid values
"""
assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
if isinstance(bandwidth, float):
assert 0 < bandwidth < 1, "the bandwith for KDEy should be in (0,1), since this method models the unit simplex"
assert 0 < bandwidth < 1, \
"the bandwith for KDEy should be in (0,1), since this method models the unit simplex"
return bandwidth
def get_kde_function(self, X, bandwidth):
"""
@ -106,16 +108,13 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
Alternatively, this set can be specified at fit time by indicating the exact set of data
on which the predictions are to be generated.
:param bandwidth: float, the bandwidth of the Kernel
:param n_jobs: number of parallel workers
:param random_state: a seed to be set before fitting any base quantifier (default None)
"""
def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None, random_state=None):
self._check_bandwidth(bandwidth)
self.classifier = classifier
def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, random_state=None):
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.bandwidth = bandwidth
self.n_jobs = n_jobs
self.bandwidth = KDEBase._check_bandwidth(bandwidth)
self.random_state=random_state
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
@ -130,7 +129,7 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
:param posteriors: instances in the sample converted into posterior probabilities
:return: a vector of class prevalence estimates
"""
np.random.RandomState(self.random_state)
with qp.util.temp_seed(self.random_state):
epsilon = 1e-10
n_classes = len(self.mix_densities)
test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities]
@ -183,20 +182,17 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase):
Alternatively, this set can be specified at fit time by indicating the exact set of data
on which the predictions are to be generated.
:param bandwidth: float, the bandwidth of the Kernel
:param n_jobs: number of parallel workers
:param random_state: a seed to be set before fitting any base quantifier (default None)
:param montecarlo_trials: number of Monte Carlo trials (default 10000)
"""
def __init__(self, classifier: BaseEstimator, val_split=10, divergence: str='HD',
bandwidth=0.1, n_jobs=None, random_state=None, montecarlo_trials=10000):
def __init__(self, classifier: BaseEstimator=None, val_split=5, divergence: str='HD',
bandwidth=0.1, random_state=None, montecarlo_trials=10000):
self._check_bandwidth(bandwidth)
self.classifier = classifier
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.divergence = divergence
self.bandwidth = bandwidth
self.n_jobs = n_jobs
self.bandwidth = KDEBase._check_bandwidth(bandwidth)
self.random_state=random_state
self.montecarlo_trials = montecarlo_trials
@ -278,15 +274,12 @@ class KDEyCS(AggregativeSoftQuantifier):
Alternatively, this set can be specified at fit time by indicating the exact set of data
on which the predictions are to be generated.
:param bandwidth: float, the bandwidth of the Kernel
:param n_jobs: number of parallel workers
"""
def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None):
KDEBase._check_bandwidth(bandwidth)
self.classifier = classifier
def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1):
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.bandwidth = bandwidth
self.n_jobs = n_jobs
self.bandwidth = KDEBase._check_bandwidth(bandwidth)
def gram_matrix_mix_sum(self, X, Y=None):
# this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
@ -355,7 +348,7 @@ class KDEyCS(AggregativeSoftQuantifier):
# called \overline{r} in the paper
alpha_ratio = alpha * self.counts_inv
# recal that tr_te_sums already accounts for the constant terms (1/Li)*(1/M)
# recall that tr_te_sums already accounts for the constant terms (1/Li)*(1/M)
partA = -np.log((alpha_ratio @ tr_te_sums) * Minv)
partB = 0.5 * np.log(alpha_ratio @ tr_tr_sums @ alpha_ratio)
return partA + partB #+ partC

View File

@ -27,8 +27,8 @@ class ThresholdOptimization(BinaryAggregativeQuantifier):
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
def __init__(self, classifier: BaseEstimator, val_split=None, n_jobs=None):
self.classifier = classifier
def __init__(self, classifier: BaseEstimator=None, val_split=None, n_jobs=None):
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.n_jobs = qp._get_njobs(n_jobs)
@ -143,7 +143,7 @@ class T50(ThresholdOptimization):
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
def __init__(self, classifier: BaseEstimator, val_split=5):
def __init__(self, classifier: BaseEstimator=None, val_split=5):
super().__init__(classifier, val_split)
def condition(self, tpr, fpr) -> float:
@ -167,7 +167,7 @@ class MAX(ThresholdOptimization):
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
def __init__(self, classifier: BaseEstimator, val_split=5):
def __init__(self, classifier: BaseEstimator=None, val_split=5):
super().__init__(classifier, val_split)
def condition(self, tpr, fpr) -> float:
@ -192,7 +192,7 @@ class X(ThresholdOptimization):
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
def __init__(self, classifier: BaseEstimator, val_split=5):
def __init__(self, classifier: BaseEstimator=None, val_split=5):
super().__init__(classifier, val_split)
def condition(self, tpr, fpr) -> float:
@ -215,7 +215,7 @@ class MS(ThresholdOptimization):
`k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
def __init__(self, classifier: BaseEstimator, val_split=5):
def __init__(self, classifier: BaseEstimator=None, val_split=5):
super().__init__(classifier, val_split)
def condition(self, tpr, fpr) -> float:
@ -254,7 +254,7 @@ class MS2(MS):
`k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
def __init__(self, classifier: BaseEstimator, val_split=5):
def __init__(self, classifier: BaseEstimator=None, val_split=5):
super().__init__(classifier, val_split)
def discard(self, tpr, fpr) -> bool:

View File

@ -3,7 +3,6 @@ from copy import deepcopy
from typing import Callable, Literal, Union
import numpy as np
from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling
from scipy import optimize
from sklearn.base import BaseEstimator
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix
@ -12,7 +11,6 @@ from sklearn.model_selection import cross_val_predict
import quapy as qp
import quapy.functional as F
from quapy.functional import get_divergence
from quapy.classification.calibration import NBVSCalibration, BCTSCalibration, TSCalibration, VSCalibration
from quapy.classification.svmperf import SVMperf
from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier, BinaryQuantifier, OneVsAllGeneric
@ -343,8 +341,8 @@ class CC(AggregativeCrispQuantifier):
:param classifier: a sklearn's Estimator that generates a classifier
"""
def __init__(self, classifier: BaseEstimator):
self.classifier = classifier
def __init__(self, classifier: BaseEstimator=None):
self.classifier = qp._get_classifier(classifier)
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
"""
@ -373,8 +371,8 @@ class PCC(AggregativeSoftQuantifier):
:param classifier: a sklearn's Estimator that generates a classifier
"""
def __init__(self, classifier: BaseEstimator):
self.classifier = classifier
def __init__(self, classifier: BaseEstimator=None):
self.classifier = qp._get_classifier(classifier)
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
"""
@ -436,14 +434,14 @@ class ACC(AggregativeCrispQuantifier):
"""
def __init__(
self,
classifier: BaseEstimator,
classifier: BaseEstimator=None,
val_split=5,
solver: Literal['minimize', 'exact', 'exact-raise', 'exact-cc'] = 'minimize',
method: Literal['inversion', 'invariant-ratio'] = 'inversion',
norm: Literal['clip', 'mapsimplex', 'condsoftmax'] = 'clip',
n_jobs=None,
):
self.classifier = classifier
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.n_jobs = qp._get_njobs(n_jobs)
self.solver = solver
@ -571,14 +569,14 @@ class PACC(AggregativeSoftQuantifier):
"""
def __init__(
self,
classifier: BaseEstimator,
classifier: BaseEstimator=None,
val_split=5,
solver: Literal['minimize', 'exact', 'exact-raise', 'exact-cc'] = 'minimize',
method: Literal['inversion', 'invariant-ratio'] = 'inversion',
norm: Literal['clip', 'mapsimplex', 'condsoftmax'] = 'clip',
n_jobs=None
):
self.classifier = classifier
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.n_jobs = qp._get_njobs(n_jobs)
self.solver = solver
@ -668,8 +666,8 @@ class EMQ(AggregativeSoftQuantifier):
MAX_ITER = 1000
EPSILON = 1e-4
def __init__(self, classifier: BaseEstimator, val_split=None, exact_train_prev=True, recalib=None, n_jobs=None):
self.classifier = classifier
def __init__(self, classifier: BaseEstimator=None, val_split=None, exact_train_prev=True, recalib=None, n_jobs=None):
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.exact_train_prev = exact_train_prev
self.recalib = recalib
@ -832,7 +830,7 @@ class BayesianCC(AggregativeCrispQuantifier):
:param mcmc_seed: random seed for the MCMC sampler (default 0)
"""
def __init__(self,
classifier: BaseEstimator,
classifier: BaseEstimator=None,
val_split: float = 0.75,
num_warmup: int = 500,
num_samples: int = 1_000,
@ -849,7 +847,7 @@ class BayesianCC(AggregativeCrispQuantifier):
if _bayesian.DEPENDENCIES_INSTALLED is False:
raise ImportError("Auxiliary dependencies are required. Run `$ pip install quapy[bayes]` to install them.")
self.classifier = classifier
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.num_warmup = num_warmup
self.num_samples = num_samples
@ -919,8 +917,8 @@ class HDy(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself), or an integer indicating the number of folds (default 5).
"""
def __init__(self, classifier: BaseEstimator, val_split=5):
self.classifier = classifier
def __init__(self, classifier: BaseEstimator=None, val_split=5):
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
@ -995,8 +993,8 @@ class DyS(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
:param n_jobs: number of parallel workers.
"""
def __init__(self, classifier: BaseEstimator, val_split=5, n_bins=8, divergence: Union[str, Callable]= 'HD', tol=1e-05, n_jobs=None):
self.classifier = classifier
def __init__(self, classifier: BaseEstimator=None, val_split=5, n_bins=8, divergence: Union[str, Callable]= 'HD', tol=1e-05, n_jobs=None):
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.tol = tol
self.divergence = divergence
@ -1060,8 +1058,8 @@ class SMM(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself), or an integer indicating the number of folds (default 5).
"""
def __init__(self, classifier: BaseEstimator, val_split=5):
self.classifier = classifier
def __init__(self, classifier: BaseEstimator=None, val_split=5):
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
@ -1109,9 +1107,9 @@ class DMy(AggregativeSoftQuantifier):
:param n_jobs: number of parallel workers (default None)
"""
def __init__(self, classifier, val_split=5, nbins=8, divergence: Union[str, Callable]='HD',
def __init__(self, classifier: BaseEstimator=None, val_split=5, nbins=8, divergence: Union[str, Callable]='HD',
cdf=False, search='optim_minimize', n_jobs=None):
self.classifier = classifier
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.nbins = nbins
self.divergence = divergence

View File

@ -328,7 +328,7 @@ class GridSearchQ(BaseQuantifier):
if self.raise_errors:
raise exception
else:
return ConfigStatus(params, status)
return ConfigStatus(params, status, msg=str(exception))
try:
with timeout(self.timeout):