refactoring aggregation methods

This commit is contained in:
Alejandro Moreo Fernandez 2024-01-25 14:33:41 +01:00
parent efe385318f
commit 7ac834bd2c
9 changed files with 178 additions and 373 deletions

View File

@ -13,7 +13,7 @@ for facilitating the analysis and interpretation of the experimental results.
### Last updates: ### Last updates:
* Version 0.1.7 is released! major changes can be consulted [here](quapy/CHANGE_LOG.txt). * Version 0.1.8 is released! major changes can be consulted [here](quapy/CHANGE_LOG.txt).
* A detailed documentation is now available [here](https://hlt-isti.github.io/QuaPy/) * A detailed documentation is now available [here](https://hlt-isti.github.io/QuaPy/)
* The developer API documentation is available [here](https://hlt-isti.github.io/QuaPy/build/html/modules.html) * The developer API documentation is available [here](https://hlt-isti.github.io/QuaPy/build/html/modules.html)
@ -96,6 +96,9 @@ quantification methods based on structured output learning, HDy, QuaNet, quantif
* pandas, xlrd * pandas, xlrd
* matplotlib * matplotlib
## Contributing
If you want to contribute improvements to quapy, please open a pull request against the "devel" branch.
## Documentation ## Documentation

View File

@ -1,5 +1,5 @@
import quapy as qp import quapy as qp
from method.kdey import KDEyML from method._kdey import KDEyML
from quapy.method.non_aggregative import DMx from quapy.method.non_aggregative import DMx
from quapy.protocol import APP, UPP from quapy.protocol import APP, UPP
from quapy.method.aggregative import DMy from quapy.method.aggregative import DMy

View File

@ -4,15 +4,20 @@ Change Log 0.1.8
- Fixed ThresholdOptimization methods (X, T50, MAX, MS and MS2). Thanks to Tobias Schumacher and colleagues for pointing - Fixed ThresholdOptimization methods (X, T50, MAX, MS and MS2). Thanks to Tobias Schumacher and colleagues for pointing
this out in Appendix A of "Schumacher, T., Strohmaier, M., & Lemmerich, F. (2021). A comparative evaluation of this out in Appendix A of "Schumacher, T., Strohmaier, M., & Lemmerich, F. (2021). A comparative evaluation of
quantification methods. arXiv:2103.03223v3 [cs.LG]" quantification methods. arXiv:2103.03223v3 [cs.LG]"
- Added HDx and DistributionMatchingX to non-aggregative quantifiers (see also the new example "comparing_HDy_HDx.py") - Added HDx and DistributionMatchingX to non-aggregative quantifiers (see also the new example "comparing_HDy_HDx.py")
- New UCI multiclass datasets added (thanks to Pablo González). The 5 UCI multiclass datasets are those corresponding - New UCI multiclass datasets added (thanks to Pablo González). The 5 UCI multiclass datasets are those corresponding
to the following criteria: to the following criteria:
- >1000 instances - >1000 instances
- >2 classes - >2 classes
- classification datasets - classification datasets
- Python API available - Python API available
- New IFCB (plankton) dataset added. See fetch_IFCB. - New IFCB (plankton) dataset added. See fetch_IFCB.
- Added new evaluation measures NAE, NRAE - Added new evaluation measures NAE, NRAE
- Added new meta method "MedianEstimator"; an ensemble of binary base quantifiers that receives as input a dictionary - Added new meta method "MedianEstimator"; an ensemble of binary base quantifiers that receives as input a dictionary
of hyperparameters that it will explore exhaustively, fitting and generating predictions for each combination of of hyperparameters that it will explore exhaustively, fitting and generating predictions for each combination of
hyperparameters, and that returns, as the prevalence estimates, the median across all predictions. hyperparameters, and that returns, as the prevalence estimates, the median across all predictions.

View File

@ -17,6 +17,9 @@ AGGREGATIVE_METHODS = {
aggregative.MAX, aggregative.MAX,
aggregative.MS, aggregative.MS,
aggregative.MS2, aggregative.MS2,
aggregative.KDEyML,
aggregative.KDEyCS,
aggregative.KDEyHD,
} }

View File

@ -12,6 +12,9 @@ from sklearn.metrics.pairwise import rbf_kernel
class KDEBase: class KDEBase:
"""
Common ancestor for KDE-based methods. Implements some common routines.
"""
BANDWIDTH_METHOD = ['scott', 'silverman'] BANDWIDTH_METHOD = ['scott', 'silverman']
@ -156,7 +159,6 @@ class KDEyCS(AggregativeSoftQuantifier):
assert all(sorted(np.unique(y)) == np.arange(n)), \ assert all(sorted(np.unique(y)) == np.arange(n)), \
'label name gaps not allowed in current implementation' 'label name gaps not allowed in current implementation'
# counts_inv keeps track of the relative weight of each datapoint within its class # counts_inv keeps track of the relative weight of each datapoint within its class
# (i.e., the weight in its KDE model) # (i.e., the weight in its KDE model)
counts_inv = 1 / (data.counts()) counts_inv = 1 / (data.counts())
@ -190,7 +192,6 @@ class KDEyCS(AggregativeSoftQuantifier):
Minv = (1/M) # t in the paper Minv = (1/M) # t in the paper
n = Ptr.shape[1] n = Ptr.shape[1]
# becomes a constant that does not affect the optimization, no need to compute it # becomes a constant that does not affect the optimization, no need to compute it
# partC = 0.5*np.log(self.gram_matrix_mix_sum(Pte) * Kinv * Kinv) # partC = 0.5*np.log(self.gram_matrix_mix_sum(Pte) * Kinv * Kinv)

View File

@ -50,6 +50,16 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
'model selection. Rather pass the LabelledCollection at fit time') 'model selection. Rather pass the LabelledCollection at fit time')
self.val_split_ = val_split self.val_split_ = val_split
def _check_init_parameters(self):
"""
Implements any check to be performed on the parameters of the init method before undertaking
the training of the quantifier. This allows execution to stop early when the
parameters are not valid.
:return: Nothing. May raise an exception.
"""
pass
def fit(self, data: LabelledCollection, fit_classifier=True, val_split=None): def fit(self, data: LabelledCollection, fit_classifier=True, val_split=None):
""" """
Trains the aggregative quantifier. This comes down to training a classifier and an aggregation function. Trains the aggregative quantifier. This comes down to training a classifier and an aggregation function.
@ -59,6 +69,7 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
learner has been trained outside the quantifier. learner has been trained outside the quantifier.
:return: self :return: self
""" """
self._check_init_parameters()
classif_predictions = self.classifier_fit_predict(data, fit_classifier, predict_on=val_split) classif_predictions = self.classifier_fit_predict(data, fit_classifier, predict_on=val_split)
self.aggregation_fit(classif_predictions, data) self.aggregation_fit(classif_predictions, data)
return self return self
@ -113,8 +124,9 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
raise ValueError(f'invalid value {predict_on} in fit. ' raise ValueError(f'invalid value {predict_on} in fit. '
f'Specify a integer >1 for kFCV estimation.') f'Specify a integer >1 for kFCV estimation.')
else: else:
n_jobs = self.n_jobs if hasattr(self, 'n_jobs') else qp._get_njobs(None)
predictions = cross_val_predict( predictions = cross_val_predict(
self.classifier, *data.Xy, cv=predict_on, n_jobs=self.n_jobs, method=self._classifier_method()) self.classifier, *data.Xy, cv=predict_on, n_jobs=n_jobs, method=self._classifier_method())
predictions = LabelledCollection(predictions, data.y, classes=data.classes_) predictions = LabelledCollection(predictions, data.y, classes=data.classes_)
self.classifier.fit(*data.Xy) self.classifier.fit(*data.Xy)
else: else:
@ -291,8 +303,6 @@ class BinaryAggregativeQuantifier(AggregativeQuantifier, BinaryQuantifier):
return super().fit(data, fit_classifier, val_split) return super().fit(data, fit_classifier, val_split)
# Methods # Methods
# ------------------------------------ # ------------------------------------
class CC(AggregativeCrispQuantifier): class CC(AggregativeCrispQuantifier):
@ -333,18 +343,28 @@ class ACC(AggregativeCrispQuantifier):
:param classifier: a sklearn's Estimator that generates a classifier :param classifier: a sklearn's Estimator that generates a classifier
:param val_split: specifies the data used for generating classifier predictions. This specification :param val_split: specifies the data used for generating classifier predictions. This specification
can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
be extracted from the training set (default 0.4); or as an integer, indicating that the predictions be extracted from the training set; or as an integer (default 5), indicating that the predictions
are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
for `k`); or as a collection defining the specific set of data to use for validation. for `k`); or as a collection defining the specific set of data to use for validation.
Alternatively, this set can be specified at fit time by indicating the exact set of data Alternatively, this set can be specified at fit time by indicating the exact set of data
on which the predictions are to be generated. on which the predictions are to be generated.
:param n_jobs: number of parallel workers :param n_jobs: number of parallel workers
:param solver: indicates the method to be used for obtaining the final estimates. The default choice
is 'exact', which comes down to solving the system of linear equations `Ax=B` where `A` is a
matrix containing the class-conditional probabilities of the predictions (e.g., the tpr and fpr in
binary) and `B` is the vector of prevalence values estimated via CC, as $x=A^{-1}B$. This solution
might not exist for degenerated classifiers, in which case the method defaults to classify and count
(i.e., does not attempt any adjustment).
Another option is to search for the prevalence vector that minimizes the loss |Ax-B|. The latter is
achieved by indicating solver='minimize'.
""" """
def __init__(self, classifier: BaseEstimator, val_split=5, n_jobs=None): def __init__(self, classifier: BaseEstimator, val_split=5, n_jobs=None, solver='exact'):
self.classifier = classifier self.classifier = classifier
self.val_split = val_split self.val_split = val_split
self.n_jobs = qp._get_njobs(n_jobs) self.n_jobs = qp._get_njobs(n_jobs)
assert solver in ['exact', 'minimize'], "unknown solver; valid ones are 'exact', 'minimize'"
self.solver = solver
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
""" """
@ -358,7 +378,7 @@ class ACC(AggregativeCrispQuantifier):
@classmethod @classmethod
def getPteCondEstim(cls, classes, y, y_): def getPteCondEstim(cls, classes, y, y_):
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a # estimate the matrix with entry (i,j) being the estimate of P(hat_yi|yj), that is, the probability that a
# document that belongs to yj ends up being classified as belonging to yi # document that belongs to yj ends up being classified as belonging to yi
conf = confusion_matrix(y, y_, labels=classes).T conf = confusion_matrix(y, y_, labels=classes).T
conf = conf.astype(float) conf = conf.astype(float)
@ -372,10 +392,10 @@ class ACC(AggregativeCrispQuantifier):
def aggregate(self, classif_predictions): def aggregate(self, classif_predictions):
prevs_estim = self.cc.aggregate(classif_predictions) prevs_estim = self.cc.aggregate(classif_predictions)
return ACC.solve_adjustment(self.Pte_cond_estim_, prevs_estim) return ACC.solve_adjustment(self.Pte_cond_estim_, prevs_estim, solver=self.solver)
@classmethod @classmethod
def solve_adjustment(cls, PteCondEstim, prevs_estim): def solve_adjustment(cls, PteCondEstim, prevs_estim, solver='exact'):
""" """
Solves the linear system :math:`Ax = B` with :math:`A` = `PteCondEstim` and :math:`B` = `prevs_estim` Solves the linear system :math:`Ax = B` with :math:`A` = `PteCondEstim` and :math:`B` = `prevs_estim`
@ -383,16 +403,24 @@ class ACC(AggregativeCrispQuantifier):
of :math:`P(y_i|y_j)`, that is, the probability that an instance that belongs to :math:`y_j` ends up being of :math:`P(y_i|y_j)`, that is, the probability that an instance that belongs to :math:`y_j` ends up being
classified as belonging to :math:`y_i` classified as belonging to :math:`y_i`
:param prevs_estim: a `np.ndarray` of shape `(n_classes,)` with the class prevalence estimates :param prevs_estim: a `np.ndarray` of shape `(n_classes,)` with the class prevalence estimates
:param solver: indicates the method to use for solving the system of linear equations. Valid options are
'exact' (tries to solve the system --may fail if the misclassification matrix has rank < n_classes) or
'minimize' (minimizes a norm --always exists).
:return: an adjusted `np.ndarray` of shape `(n_classes,)` with the corrected class prevalence estimates :return: an adjusted `np.ndarray` of shape `(n_classes,)` with the corrected class prevalence estimates
""" """
A = PteCondEstim A = PteCondEstim
B = prevs_estim B = prevs_estim
if solver == 'exact':
try: try:
adjusted_prevs = np.linalg.solve(A, B) adjusted_prevs = np.linalg.solve(A, B)
adjusted_prevs = np.clip(adjusted_prevs, 0, 1) adjusted_prevs = np.clip(adjusted_prevs, 0, 1)
adjusted_prevs /= adjusted_prevs.sum() adjusted_prevs /= adjusted_prevs.sum()
except np.linalg.LinAlgError: except np.linalg.LinAlgError:
adjusted_prevs = prevs_estim # no way to adjust them! adjusted_prevs = prevs_estim # no way to adjust them!
elif solver == 'minimize':
def loss(prev):
return np.linalg.norm(A@prev - B)
return F.optim_minimize(loss, n_classes=A.shape[0])
return adjusted_prevs return adjusted_prevs
@ -427,7 +455,7 @@ class PACC(AggregativeSoftQuantifier):
:param classifier: a sklearn's Estimator that generates a classifier :param classifier: a sklearn's Estimator that generates a classifier
:param val_split: specifies the data used for generating classifier predictions. This specification :param val_split: specifies the data used for generating classifier predictions. This specification
can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
be extracted from the training set (default 0.4); or as an integer, indicating that the predictions be extracted from the training set; or as an integer (default 5), indicating that the predictions
are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
for `k`). Alternatively, this set can be specified at fit time by indicating the exact set of data for `k`). Alternatively, this set can be specified at fit time by indicating the exact set of data
on which the predictions are to be generated. on which the predictions are to be generated.
@ -455,7 +483,7 @@ class PACC(AggregativeSoftQuantifier):
@classmethod @classmethod
def getPteCondEstim(cls, classes, y, y_): def getPteCondEstim(cls, classes, y, y_):
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a # estimate the matrix with entry (i,j) being the estimate of P(hat_yi|yj), that is, the probability that a
# document that belongs to yj ends up being classified as belonging to yi # document that belongs to yj ends up being classified as belonging to yi
n_classes = len(classes) n_classes = len(classes)
confusion = np.eye(n_classes) confusion = np.eye(n_classes)
@ -475,17 +503,100 @@ class EMQ(AggregativeSoftQuantifier):
probabilities generated by a probabilistic classifier and the class prevalence estimates obtained via probabilities generated by a probabilistic classifier and the class prevalence estimates obtained via
maximum-likelihood estimation, in a mutually recursive way, until convergence. maximum-likelihood estimation, in a mutually recursive way, until convergence.
This implementation also gives access to the heuristics proposed by `Alexandari et al. paper
<http://proceedings.mlr.press/v119/alexandari20a.html>`_. These heuristics consist of using, as the training
prevalence, an estimate of it obtained via k-fold cross validation (instead of the true training prevalence),
and to recalibrate the posterior probabilities of the classifier.
:param classifier: a sklearn's Estimator that generates a classifier :param classifier: a sklearn's Estimator that generates a classifier
:param val_split: specifies the data used for generating classifier predictions. This specification
can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
be extracted from the training set; or as an integer, indicating that the predictions
are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
for `k`, default 5); or as a collection defining the specific set of data to use for validation.
Alternatively, this set can be specified at fit time by indicating the exact set of data
on which the predictions are to be generated. This hyperparameter is only meant to be used when the
heuristics are to be applied, i.e., if a recalibration is required. The default value is None (meaning
the recalibration is not required). In case this hyperparameter is set to a value other than None, but
the recalibration is not required (recalib=None), a warning message will be raised.
:param exact_train_prev: set to True (default) for using the true training prevalence as the initial observation;
set to False for computing the training prevalence as an estimate of it, i.e., as the expected
value of the posterior probabilities of the training instances.
:param recalib: a string indicating the method of recalibration.
Available choices include "nbvs" (No-Bias Vector Scaling), "bcts" (Bias-Corrected Temperature Scaling),
"ts" (Temperature Scaling), and "vs" (Vector Scaling). Default is None (no recalibration).
:param n_jobs: number of parallel workers. Only used for recalibrating the classifier if `val_split` is set to
an integer `k` --the number of folds.
""" """
MAX_ITER = 1000 MAX_ITER = 1000
EPSILON = 1e-4 EPSILON = 1e-4
def __init__(self, classifier: BaseEstimator): def __init__(self, classifier: BaseEstimator, val_split=None, exact_train_prev=True, recalib=None, n_jobs=None):
self.classifier = classifier self.classifier = classifier
self.val_split = val_split
self.exact_train_prev = exact_train_prev
self.recalib = recalib
self.n_jobs = n_jobs
@classmethod
def EMQ_BCTS(cls, classifier: BaseEstimator, n_jobs=None):
"""
Constructs an instance of EMQ using the best configuration found in the `Alexandari et al. paper
<http://proceedings.mlr.press/v119/alexandari20a.html>`_, i.e., one that relies on Bias-Corrected Temperature
Scaling (BCTS) as a recalibration function, and that uses an estimate of the training prevalence instead of
the true training prevalence.
:param classifier: a sklearn's Estimator that generates a classifier
:param n_jobs: number of parallel workers.
:return: An instance of EMQ with BCTS
"""
return EMQ(classifier, val_split=5, exact_train_prev=False, recalib='bcts', n_jobs=n_jobs)
def _check_init_parameters(self):
if self.val_split is not None:
if self.exact_train_prev and self.recalib is None:
raise RuntimeWarning(f'The parameter {self.val_split=} was specified for EMQ, while the parameters '
f'{self.exact_train_prev=} and {self.recalib=}. This has no effect and causes an unnecessary '
f'overload.')
def classify(self, instances):
"""
Provides the posterior probabilities for the given instances. If the classifier was required
to be recalibrated, then these posteriors are recalibrated accordingly.
:param instances: array-like of shape `(n_instances, n_dimensions,)`
:return: np.ndarray of shape `(n_instances, n_classes,)` with posterior probabilities
"""
posteriors = self.classifier.predict_proba(instances)
if hasattr(self, 'calibration_function') and self.calibration_function is not None:
posteriors = self.calibration_function(posteriors)
return posteriors
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
if self.recalib is not None:
P, y = classif_predictions.Xy
if self.recalib == 'nbvs':
calibrator = NoBiasVectorScaling()
elif self.recalib == 'bcts':
calibrator = TempScaling(bias_positions='all')
elif self.recalib == 'ts':
calibrator = TempScaling()
elif self.recalib == 'vs':
calibrator = VectorScaling()
else:
raise ValueError('invalid param argument for recalibration method; available ones are '
'"nbvs", "bcts", "ts", and "vs".')
self.calibration_function = calibrator(P, np.eye(data.n_classes)[y], posterior_supplied=True)
if self.exact_train_prev:
self.train_prevalence = data.prevalence() self.train_prevalence = data.prevalence()
else:
train_posteriors = classif_predictions.X
if self.recalib is not None:
train_posteriors = self.calibration_function(train_posteriors)
self.train_prevalence = F.prevalence_from_probabilities(train_posteriors)
def aggregate(self, classif_posteriors, epsilon=EPSILON): def aggregate(self, classif_posteriors, epsilon=EPSILON):
priors, posteriors = self.EM(self.train_prevalence, classif_posteriors, epsilon) priors, posteriors = self.EM(self.train_prevalence, classif_posteriors, epsilon)
@ -542,93 +653,6 @@ class EMQ(AggregativeSoftQuantifier):
return qs, ps return qs, ps
class EMQrecalib(AggregativeSoftQuantifier):
"""
`Expectation Maximization for Quantification <https://ieeexplore.ieee.org/abstract/document/6789744>`_ (EMQ),
aka `Saerens-Latinne-Decaestecker` (SLD) algorithm, with the heuristics proposed by
`Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_.
These heuristics consist of using, as the training prevalence, an estimate of it obtained via k-fold cross
validation (instead of the true training prevalence), and to recalibrate the posterior probabilities of
the classifier.
:param classifier: a sklearn's Estimator that generates a classifier
:param val_split: specifies the data used for generating classifier predictions. This specification
can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
be extracted from the training set (default 0.4); or as an integer, indicating that the predictions
are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
for `k`, default 5); or as a collection defining the specific set of data to use for validation.
Alternatively, this set can be specified at fit time by indicating the exact set of data
on which the predictions are to be generated.
:param exact_train_prev: set to True (default) for using, as the initial observation, the true training prevalence;
or set to False for computing the training prevalence as an estimate of it, i.e., as the expected
value of the posterior probabilities of the training instances
:param recalib: a string indicating the method of recalibration.
Available choices include "nbvs" (No-Bias Vector Scaling), "bcts" (Bias-Corrected Temperature Scaling,
default), "ts" (Temperature Scaling), and "vs" (Vector Scaling).
:param n_jobs: number of parallel workers
"""
MAX_ITER = 1000
EPSILON = 1e-4
def __init__(self, classifier: BaseEstimator, val_split=5, exact_train_prev=False, recalib='bcts', n_jobs=None):
self.classifier = classifier
self.val_split = val_split
self.exact_train_prev = exact_train_prev
self.recalib = recalib
self.n_jobs = n_jobs
def classify(self, instances):
"""
Provides the posterior probabilities for the given instances. If the classifier is
recalibrated, then these posteriors will be recalibrated accordingly.
:param instances: array-like of shape `(n_instances, n_dimensions,)`
:return: np.ndarray of shape `(n_instances, n_classes,)` with posterior probabilities
"""
posteriors = self.classifier.predict_proba(instances)
if hasattr(self, 'calibration_function') and self.calibration_function is not None:
posteriors = self.calibration_function(posteriors)
return posteriors
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
if self.recalib is not None:
P, y = classif_predictions.Xy
if self.recalib == 'nbvs':
calibrator = NoBiasVectorScaling()
elif self.recalib == 'bcts':
calibrator = TempScaling(bias_positions='all')
elif self.recalib == 'ts':
calibrator = TempScaling()
elif self.recalib == 'vs':
calibrator = VectorScaling()
else:
raise ValueError('invalid param argument for recalibration method; available ones are '
'"nbvs", "bcts", "ts", and "vs".')
self.calibration_function = calibrator(P, np.eye(data.n_classes)[y], posterior_supplied=True)
if self.exact_train_prev:
self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
else:
if self.recalib is not None:
train_posteriors = self.classify(data.X)
else:
train_posteriors = classif_predictions.X
self.train_prevalence = np.mean(train_posteriors, axis=0)
def aggregate(self, classif_posteriors, epsilon=EPSILON):
priors, posteriors = EMQ.EM(self.train_prevalence, classif_posteriors, epsilon)
return priors
def predict_proba(self, instances, epsilon=EPSILON):
classif_posteriors = self.classify(instances)
priors, posteriors = EMQ.EM(self.train_prevalence, classif_posteriors, epsilon)
return posteriors
class HDy(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): class HDy(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
""" """
`Hellinger Distance y <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDy). `Hellinger Distance y <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDy).
@ -722,14 +746,16 @@ class DyS(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
:param divergence: a str indicating the name of divergence (currently supported ones are "HD" or "topsoe"), or a :param divergence: a str indicating the name of divergence (currently supported ones are "HD" or "topsoe"), or a
callable function that computes the divergence between two distributions (two equally sized arrays). callable function that computes the divergence between two distributions (two equally sized arrays).
:param tol: a float with the tolerance for the ternary search algorithm. :param tol: a float with the tolerance for the ternary search algorithm.
:param n_jobs: number of parallel workers.
""" """
def __init__(self, classifier: BaseEstimator, val_split=5, n_bins=8, divergence: Union[str, Callable]= 'HD', tol=1e-05): def __init__(self, classifier: BaseEstimator, val_split=5, n_bins=8, divergence: Union[str, Callable]= 'HD', tol=1e-05, n_jobs=None):
self.classifier = classifier self.classifier = classifier
self.val_split = val_split self.val_split = val_split
self.tol = tol self.tol = tol
self.divergence = divergence self.divergence = divergence
self.n_bins = n_bins self.n_bins = n_bins
self.n_jobs = n_jobs
def _ternary_search(self, f, left, right, tol): def _ternary_search(self, f, left, right, tol):
""" """
@ -1058,259 +1084,6 @@ def newSVMRAE(svmperf_base=None, C=1):
return newELM(svmperf_base, loss='mrae', C=C) return newELM(svmperf_base, loss='mrae', C=C)
class ThresholdOptimization(BinaryAggregativeQuantifier):
"""
Abstract class of Threshold Optimization variants for :class:`ACC` as proposed by
`Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
`Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_.
The goal is to bring improved stability to the denominator of the adjustment.
The different variants are based on different heuristics for choosing a decision threshold
that would allow for more true positives and many more false positives, on the grounds this
would deliver larger denominators.
:param classifier: a sklearn's Estimator that generates a classifier
:param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
misclassification rates are to be estimated.
This parameter can be indicated as a real value (between 0 and 1), representing a proportion of
validation data, or as an integer, indicating that the misclassification rates should be estimated via
`k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
def __init__(self, classifier: BaseEstimator, val_split=5, n_jobs=None):
self.classifier = classifier
self.val_split = val_split
self.n_jobs = qp._get_njobs(n_jobs)
@abstractmethod
def condition(self, tpr, fpr) -> float:
"""
Implements the criterion according to which the threshold should be selected.
This function should return the (float) score to be minimized.
:param tpr: float, true positive rate
:param fpr: float, false positive rate
:return: float, a score for the given `tpr` and `fpr`
"""
...
def discard(self, tpr, fpr) -> bool:
"""
Indicates whether a combination of tpr and fpr should be discarded
:param tpr: float, true positive rate
:param fpr: float, false positive rate
:return: true if the combination is to be discarded, false otherwise
"""
return (tpr - fpr) == 0
def _eval_candidate_thresholds(self, decision_scores, y):
"""
Evaluates the `tpr` and `fpr` obtained at different decision thresholds and ranks them
according to their score. The scoring function is implemented in method `condition`.
:param decision_scores: array-like with the classification scores
:param y: true labels for the validation set (or for the training set via `k`-fold cross validation)
:return: an array of candidate `[tpr, fpr, threshold]` triples sorted by increasing score according to `condition` (best candidate first)
"""
candidate_thresholds = np.unique(decision_scores)
candidates = []
scores = []
for candidate_threshold in candidate_thresholds:
y_ = self.classes_[1 * (decision_scores >= candidate_threshold)]
TP, FP, FN, TN = self._compute_table(y, y_)
tpr = self._compute_tpr(TP, FN)
fpr = self._compute_fpr(FP, TN)
if not self.discard(tpr, fpr):
candidate_score = self.condition(tpr, fpr)
candidates.append([tpr, fpr, candidate_threshold])
scores.append(candidate_score)
if len(candidates) == 0:
# if no candidate gives rise to a valid combination of tpr and fpr, this method defaults to the standard
# classify & count; this is akin to assign tpr=1, fpr=0, threshold=0
tpr, fpr, threshold = 1, 0, 0
candidates.append([tpr, fpr, threshold])
scores.append(0)
candidates = np.asarray(candidates)
candidates = candidates[np.argsort(scores)] # sort candidates by candidate_score
return candidates
def aggregate_with_threshold(self, classif_predictions, tprs, fprs, thresholds):
# This function performs the adjusted count for given tpr, fpr, and threshold.
# Note that, due to broadcasting, tprs, fprs, and thresholds could be arrays of length > 1
prevs_estims = np.mean(classif_predictions[:, None] >= thresholds, axis=0)
prevs_estims = (prevs_estims - fprs) / (tprs - fprs)
prevs_estims = F.as_binary_prevalence(prevs_estims, clip_if_necessary=True)
return prevs_estims.squeeze()
def _compute_table(self, y, y_):
TP = np.logical_and(y == y_, y == self.pos_label).sum()
FP = np.logical_and(y != y_, y == self.neg_label).sum()
FN = np.logical_and(y != y_, y == self.pos_label).sum()
TN = np.logical_and(y == y_, y == self.neg_label).sum()
return TP, FP, FN, TN
def _compute_tpr(self, TP, FP):
if TP + FP == 0:
return 1
return TP / (TP + FP)
def _compute_fpr(self, FP, TN):
if FP + TN == 0:
return 0
return FP / (FP + TN)
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
decision_scores, y = classif_predictions.Xy
# the standard behavior is to keep the best threshold only
self.tpr, self.fpr, self.threshold = self._eval_candidate_thresholds(decision_scores, y)[0]
return self
def aggregate(self, classif_predictions: np.ndarray):
    """
    Computes the adjusted count using the single best threshold found during fitting.

    :param classif_predictions: array with the classifier outputs for the instances to quantify
    :return: whatever `aggregate_with_threshold` returns for the stored `tpr`, `fpr`,
        and `threshold`
    """
    tpr, fpr, threshold = self.tpr, self.fpr, self.threshold
    return self.aggregate_with_threshold(classif_predictions, tpr, fpr, threshold)
class T50(ThresholdOptimization):
    """
    Threshold Optimization variant for :class:`ACC`, proposed by
    `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
    `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_, which selects
    the threshold whose `tpr` is closest to 0.5.
    The goal is to bring improved stability to the denominator of the adjustment.

    :param classifier: a sklearn's Estimator that generates a classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
        misclassification rates are to be estimated.
        This parameter can be given as a real value (between 0 and 1), representing a proportion of
        validation data; as an integer, indicating that the misclassification rates should be estimated via
        `k`-fold cross validation (the integer is the number of folds `k`, default 5); or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    """

    def __init__(self, classifier: BaseEstimator, val_split=5):
        super().__init__(classifier, val_split)

    def condition(self, tpr, fpr) -> float:
        """
        Scores a candidate threshold (lower is better) as the distance between `tpr` and 0.5.

        :param tpr: true positive rate of the candidate
        :param fpr: false positive rate of the candidate (not used)
        :return: `|tpr - 0.5|`
        """
        distance_to_half = tpr - 0.5
        return abs(distance_to_half)
class MAX(ThresholdOptimization):
    """
    Threshold Optimization variant for :class:`ACC`, proposed by
    `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
    `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_, which selects
    the threshold that maximizes `tpr-fpr`.
    The goal is to bring improved stability to the denominator of the adjustment.

    :param classifier: a sklearn's Estimator that generates a classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
        misclassification rates are to be estimated.
        This parameter can be given as a real value (between 0 and 1), representing a proportion of
        validation data; as an integer, indicating that the misclassification rates should be estimated via
        `k`-fold cross validation (the integer is the number of folds `k`, default 5); or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    """

    def __init__(self, classifier: BaseEstimator, val_split=5):
        super().__init__(classifier, val_split)

    def condition(self, tpr, fpr) -> float:
        """
        Scores a candidate threshold (lower is better).

        :param tpr: true positive rate of the candidate
        :param fpr: false positive rate of the candidate
        :return: `fpr - tpr`
        """
        # maximizing (tpr - fpr) is the same as minimizing its negation, (fpr - tpr)
        negated_gap = fpr - tpr
        return negated_gap
class X(ThresholdOptimization):
    """
    Threshold Optimization variant for :class:`ACC`, proposed by
    `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
    `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_, which selects
    the threshold that yields `tpr=1-fpr`.
    The goal is to bring improved stability to the denominator of the adjustment.

    :param classifier: a sklearn's Estimator that generates a classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
        misclassification rates are to be estimated.
        This parameter can be given as a real value (between 0 and 1), representing a proportion of
        validation data; as an integer, indicating that the misclassification rates should be estimated via
        `k`-fold cross validation (the integer is the number of folds `k`, default 5); or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    """

    def __init__(self, classifier: BaseEstimator, val_split=5):
        super().__init__(classifier, val_split)

    def condition(self, tpr, fpr) -> float:
        """
        Scores a candidate threshold (lower is better) by how far it is from satisfying
        `tpr = 1 - fpr`.

        :param tpr: true positive rate of the candidate
        :param fpr: false positive rate of the candidate
        :return: `|1 - (tpr + fpr)|`
        """
        rates_sum = tpr + fpr
        return abs(1 - rates_sum)
class MS(ThresholdOptimization):
    """
    Median Sweep. Threshold Optimization variant for :class:`ACC`, proposed by
    `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
    `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_, which generates
    class prevalence estimates for all decision thresholds and returns the median of them all.
    The goal is to bring improved stability to the denominator of the adjustment.

    :param classifier: a sklearn's Estimator that generates a classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
        misclassification rates are to be estimated.
        This parameter can be given as a real value (between 0 and 1), representing a proportion of
        validation data; as an integer, indicating that the misclassification rates should be estimated via
        `k`-fold cross validation (the integer is the number of folds `k`, default 5); or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    """

    def __init__(self, classifier: BaseEstimator, val_split=5):
        super().__init__(classifier, val_split)

    def condition(self, tpr, fpr) -> float:
        """
        Assigns the same score to every candidate threshold, since this method does not rank them.

        :param tpr: true positive rate of the candidate (not used)
        :param fpr: false positive rate of the candidate (not used)
        :return: the constant 1
        """
        return 1

    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
        """
        Fits the aggregation function by storing every candidate threshold along with its rates.

        :param classif_predictions: a collection whose `Xy` property yields the decision scores
            together with the true labels
        :param data: not used by this method
        :return: self
        """
        decision_scores, y = classif_predictions.Xy
        # unlike the base behavior, all candidates are kept (columns: tpr, fpr, threshold)
        all_candidates = self._eval_candidate_thresholds(decision_scores, y)
        self.tprs, self.fprs, self.thresholds = (
            all_candidates[:, 0],
            all_candidates[:, 1],
            all_candidates[:, 2],
        )
        return self

    def aggregate(self, classif_predictions: np.ndarray):
        """
        Computes the adjusted count for all the stored thresholds and combines them via the median.

        :param classif_predictions: array with the classifier outputs for the instances to quantify
        :return: the median (along the first axis) of the estimates when these come as a 2-D array;
            otherwise, the estimate as returned by `aggregate_with_threshold`
        """
        estimates = self.aggregate_with_threshold(
            classif_predictions, self.tprs, self.fprs, self.thresholds
        )
        # a 2-D result presumably holds one estimate per threshold; collapse it with the median
        return np.median(estimates, axis=0) if estimates.ndim == 2 else estimates
class MS2(MS):
    """
    Median Sweep 2. Threshold Optimization variant for :class:`ACC`, proposed by
    `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
    `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_, which generates
    class prevalence estimates for all decision thresholds and returns the median of those
    obtained for the cases in which `tpr-fpr>0.25`.
    The goal is to bring improved stability to the denominator of the adjustment.

    :param classifier: a sklearn's Estimator that generates a classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
        misclassification rates are to be estimated.
        This parameter can be given as a real value (between 0 and 1), representing a proportion of
        validation data; as an integer, indicating that the misclassification rates should be estimated via
        `k`-fold cross validation (the integer is the number of folds `k`, default 5); or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    """

    def __init__(self, classifier: BaseEstimator, val_split=5):
        super().__init__(classifier, val_split)

    def discard(self, tpr, fpr) -> bool:
        """
        Tells whether a candidate threshold must be left out of the sweep.

        :param tpr: true positive rate of the candidate
        :param fpr: false positive rate of the candidate
        :return: True if `tpr - fpr` is not greater than 0.25, False otherwise
        """
        gap = tpr - fpr
        return gap <= 0.25
class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier): class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier):
""" """
Allows any binary quantifier to perform quantification on single-label datasets. Allows any binary quantifier to perform quantification on single-label datasets.
@ -1476,6 +1249,26 @@ class AggregativeMedianEstimator(BinaryQuantifier):
) )
return np.median(prev_preds, axis=0) return np.median(prev_preds, axis=0)
#---------------------------------------------------------------
# imports
#---------------------------------------------------------------
# The names below are bound in this module's namespace so that classes implemented in the
# private submodules can be accessed from here (e.g., as attributes of this module).
# NOTE(review): these imports sit at the end of the module, presumably to avoid circular
# imports with the submodules -- confirm before moving them to the top of the file.
from . import _threshold_optim
# threshold-optimization quantifiers, implemented in `_threshold_optim`
T50 = _threshold_optim.T50
MAX = _threshold_optim.MAX
X = _threshold_optim.X
MS = _threshold_optim.MS
MS2 = _threshold_optim.MS2
from . import _kdey
# KDEy quantifier variants, implemented in `_kdey`
KDEyML = _kdey.KDEyML
KDEyHD = _kdey.KDEyHD
KDEyCS = _kdey.KDEyCS
#--------------------------------------------------------------- #---------------------------------------------------------------
# aliases # aliases
#--------------------------------------------------------------- #---------------------------------------------------------------

View File

@ -15,13 +15,13 @@ from quapy.method.base import BaseQuantifier, BinaryQuantifier
from quapy.method.aggregative import CC, ACC, PACC, HDy, EMQ, AggregativeQuantifier from quapy.method.aggregative import CC, ACC, PACC, HDy, EMQ, AggregativeQuantifier
try: try:
from . import neural from . import _neural
except ModuleNotFoundError: except ModuleNotFoundError:
neural = None _neural = None
if neural: if _neural:
QuaNet = neural.QuaNetTrainer QuaNet = _neural.QuaNetTrainer
else: else:
QuaNet = "QuaNet is not available due to missing torch package" QuaNet = "QuaNet is not available due to missing torch package"

View File

@ -3,6 +3,7 @@ import pytest
from sklearn.linear_model import LogisticRegression from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC from sklearn.svm import LinearSVC
import method.aggregative
import quapy as qp import quapy as qp
from quapy.model_selection import GridSearchQ from quapy.model_selection import GridSearchQ
from quapy.method.base import BinaryQuantifier from quapy.method.base import BinaryQuantifier
@ -13,8 +14,8 @@ from quapy.protocol import APP
from quapy.method.aggregative import DMy from quapy.method.aggregative import DMy
from quapy.method.meta import MedianEstimator from quapy.method.meta import MedianEstimator
datasets = [pytest.param(qp.datasets.fetch_twitter('hcr', pickle=True), id='hcr'), # datasets = [pytest.param(qp.datasets.fetch_twitter('hcr', pickle=True), id='hcr'),
pytest.param(qp.datasets.fetch_UCIDataset('ionosphere'), id='ionosphere')] # pytest.param(qp.datasets.fetch_UCIDataset('ionosphere'), id='ionosphere')]
tinydatasets = [pytest.param(qp.datasets.fetch_twitter('hcr', pickle=True).reduce(), id='tiny_hcr'), tinydatasets = [pytest.param(qp.datasets.fetch_twitter('hcr', pickle=True).reduce(), id='tiny_hcr'),
pytest.param(qp.datasets.fetch_UCIDataset('ionosphere').reduce(), id='tiny_ionosphere')] pytest.param(qp.datasets.fetch_UCIDataset('ionosphere').reduce(), id='tiny_ionosphere')]
@ -22,7 +23,7 @@ tinydatasets = [pytest.param(qp.datasets.fetch_twitter('hcr', pickle=True).reduc
learners = [LogisticRegression, LinearSVC] learners = [LogisticRegression, LinearSVC]
@pytest.mark.parametrize('dataset', datasets) @pytest.mark.parametrize('dataset', tinydatasets)
@pytest.mark.parametrize('aggregative_method', AGGREGATIVE_METHODS) @pytest.mark.parametrize('aggregative_method', AGGREGATIVE_METHODS)
@pytest.mark.parametrize('learner', learners) @pytest.mark.parametrize('learner', learners)
def test_aggregative_methods(dataset: Dataset, aggregative_method, learner): def test_aggregative_methods(dataset: Dataset, aggregative_method, learner):
@ -42,7 +43,7 @@ def test_aggregative_methods(dataset: Dataset, aggregative_method, learner):
assert type(error) == np.float64 assert type(error) == np.float64
@pytest.mark.parametrize('dataset', datasets) @pytest.mark.parametrize('dataset', tinydatasets)
@pytest.mark.parametrize('non_aggregative_method', NON_AGGREGATIVE_METHODS) @pytest.mark.parametrize('non_aggregative_method', NON_AGGREGATIVE_METHODS)
def test_non_aggregative_methods(dataset: Dataset, non_aggregative_method): def test_non_aggregative_methods(dataset: Dataset, non_aggregative_method):
model = non_aggregative_method() model = non_aggregative_method()
@ -61,7 +62,7 @@ def test_non_aggregative_methods(dataset: Dataset, non_aggregative_method):
assert type(error) == np.float64 assert type(error) == np.float64
@pytest.mark.parametrize('base_method', AGGREGATIVE_METHODS) @pytest.mark.parametrize('base_method', [method.aggregative.ACC, method.aggregative.PACC])
@pytest.mark.parametrize('learner', [LogisticRegression]) @pytest.mark.parametrize('learner', [LogisticRegression])
@pytest.mark.parametrize('dataset', tinydatasets) @pytest.mark.parametrize('dataset', tinydatasets)
@pytest.mark.parametrize('policy', Ensemble.VALID_POLICIES) @pytest.mark.parametrize('policy', Ensemble.VALID_POLICIES)
@ -93,7 +94,6 @@ def test_quanet_method():
print('skipping QuaNet test due to missing torch package') print('skipping QuaNet test due to missing torch package')
return return
qp.environ['SAMPLE_SIZE'] = 100 qp.environ['SAMPLE_SIZE'] = 100
# load the kindle dataset as text, and convert words to numerical indexes # load the kindle dataset as text, and convert words to numerical indexes