bayesian cc now inherits from the new abstract class WithConfidenceABC, just like AggregativeBootstrap

This commit is contained in:
Alejandro Moreo Fernandez 2024-11-29 18:15:09 +01:00
parent a0c84c5510
commit 2728dfbaa6
6 changed files with 158 additions and 125 deletions

View File

@ -1,6 +1,5 @@
- [TODO] adapt BayesianCC to WithConfidence interface - [TODO] document confidence in manuals
- [TODO] document confidence - [TODO] Test the return_type="index" in protocols and finish the "distributing_samples.py" example
- [TODO] Test the return_type="index" in protocols and finish the "distributin_samples.py" example
- [TODO] Add EDy (an implementation is available at quantificationlib) - [TODO] Add EDy (an implementation is available at quantificationlib)
- [TODO] add ensemble methods SC-MQ, MC-SQ, MC-MQ - [TODO] add ensemble methods SC-MQ, MC-SQ, MC-MQ
- [TODO] add HistNetQ - [TODO] add HistNetQ

View File

@ -29,7 +29,8 @@ import quapy as qp
from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import RandomForestClassifier
from quapy.method.aggregative import BayesianCC, ACC, PACC from quapy.method.aggregative import ACC, PACC
from quapy.method.confidence import BayesianCC
from quapy.data import LabelledCollection, Dataset from quapy.data import LabelledCollection, Dataset

View File

@ -1,3 +1,4 @@
from quapy.method.confidence import BayesianCC
from quapy.method.confidence import AggregativeBootstrap from quapy.method.confidence import AggregativeBootstrap
from quapy.method.aggregative import PACC from quapy.method.aggregative import PACC
import quapy.functional as F import quapy.functional as F
@ -23,7 +24,8 @@ train, test = data.train_test
# by simply wrapping an aggregative quantifier within the AggregativeBootstrap class, we can obtain confidence # by simply wrapping an aggregative quantifier within the AggregativeBootstrap class, we can obtain confidence
# intervals around the point estimate, in this case, at 95% of confidence # intervals around the point estimate, in this case, at 95% of confidence
pacc = AggregativeBootstrap(PACC(), confidence_level=0.95) pacc = AggregativeBootstrap(PACC(), n_test_samples=500, confidence_level=0.95)
with qp.util.temp_seed(0): with qp.util.temp_seed(0):
# we train the quantifier the usual way # we train the quantifier the usual way
@ -73,6 +75,8 @@ There are different ways for constructing confidence regions implemented in QuaP
- confidence ellipse in the Centered-Log Ratio (CLR) space: creates an ellipse in the CLR space (this should be - confidence ellipse in the Centered-Log Ratio (CLR) space: creates an ellipse in the CLR space (this should be
convenient for taking into account the inner structure of the probability simplex) convenient for taking into account the inner structure of the probability simplex)
use: AggregativeBootstrap(PACC(), confidence_level=0.95, method='ellipse-clr') use: AggregativeBootstrap(PACC(), confidence_level=0.95, method='ellipse-clr')
Other methods that return confidence regions in QuaPy include the BayesianCC method.
""" """

View File

@ -1,3 +1,4 @@
import method.confidence
from . import base from . import base
from . import aggregative from . import aggregative
from . import non_aggregative from . import non_aggregative
@ -22,7 +23,7 @@ AGGREGATIVE_METHODS = {
aggregative.KDEyML, aggregative.KDEyML,
aggregative.KDEyCS, aggregative.KDEyCS,
aggregative.KDEyHD, aggregative.KDEyHD,
aggregative.BayesianCC method.confidence.BayesianCC
} }
BINARY_METHODS = { BINARY_METHODS = {
@ -45,7 +46,7 @@ MULTICLASS_METHODS = {
aggregative.KDEyML, aggregative.KDEyML,
aggregative.KDEyCS, aggregative.KDEyCS,
aggregative.KDEyHD, aggregative.KDEyHD,
aggregative.BayesianCC method.confidence.BayesianCC
} }
NON_AGGREGATIVE_METHODS = { NON_AGGREGATIVE_METHODS = {

View File

@ -14,8 +14,6 @@ from quapy.functional import get_divergence
from quapy.classification.svmperf import SVMperf from quapy.classification.svmperf import SVMperf
from quapy.data import LabelledCollection from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier, BinaryQuantifier, OneVsAllGeneric from quapy.method.base import BaseQuantifier, BinaryQuantifier, OneVsAllGeneric
from quapy.method import _bayesian
# Abstract classes # Abstract classes
@ -808,99 +806,6 @@ class EMQ(AggregativeSoftQuantifier):
return qs, ps return qs, ps
class BayesianCC(AggregativeCrispQuantifier):
"""
`Bayesian quantification <https://arxiv.org/abs/2302.09159>`_ method,
which is a variant of :class:`ACC` that calculates the posterior probability distribution
over the prevalence vectors, rather than providing a point estimate obtained
by matrix inversion.
Can be used to diagnose degeneracy in the predictions visible when the confusion
matrix has high condition number or to quantify uncertainty around the point estimate.
This method relies on extra dependencies, which have to be installed via:
`$ pip install quapy[bayes]`
:param classifier: a sklearn's Estimator that generates a classifier
:param val_split: a float in (0, 1) indicating the proportion of the training data to be used,
as a stratified held-out validation set, for generating classifier predictions.
:param num_warmup: number of warmup iterations for the MCMC sampler (default 500)
:param num_samples: number of samples to draw from the posterior (default 1000)
:param mcmc_seed: random seed for the MCMC sampler (default 0)
"""
def __init__(self,
classifier: BaseEstimator=None,
val_split: float = 0.75,
num_warmup: int = 500,
num_samples: int = 1_000,
mcmc_seed: int = 0):
if num_warmup <= 0:
raise ValueError(f'parameter {num_warmup=} must be a positive integer')
if num_samples <= 0:
raise ValueError(f'parameter {num_samples=} must be a positive integer')
if (not isinstance(val_split, float)) or val_split <= 0 or val_split >= 1:
raise ValueError(f'val_split must be a float in (0, 1), got {val_split}')
if _bayesian.DEPENDENCIES_INSTALLED is False:
raise ImportError("Auxiliary dependencies are required. Run `$ pip install quapy[bayes]` to install them.")
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.num_warmup = num_warmup
self.num_samples = num_samples
self.mcmc_seed = mcmc_seed
# Array of shape (n_classes, n_predicted_classes,) where entry (y, c) is the number of instances
# labeled as class y and predicted as class c.
# By default, this array is set to None and later defined as part of the `aggregation_fit` phase
self._n_and_c_labeled = None
# Dictionary with posterior samples, set when `aggregate` is provided.
self._samples = None
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
"""
Estimates the misclassification rates.
:param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing,
as instances, the label predictions issued by the classifier and, as labels, the true labels
:param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
"""
pred_labels, true_labels = classif_predictions.Xy
self._n_and_c_labeled = confusion_matrix(y_true=true_labels, y_pred=pred_labels, labels=self.classifier.classes_).astype(float)
def sample_from_posterior(self, classif_predictions):
if self._n_and_c_labeled is None:
raise ValueError("aggregation_fit must be called before sample_from_posterior")
n_c_unlabeled = F.counts_from_labels(classif_predictions, self.classifier.classes_).astype(float)
self._samples = _bayesian.sample_posterior(
n_c_unlabeled=n_c_unlabeled,
n_y_and_c_labeled=self._n_and_c_labeled,
num_warmup=self.num_warmup,
num_samples=self.num_samples,
seed=self.mcmc_seed,
)
return self._samples
def get_prevalence_samples(self):
if self._samples is None:
raise ValueError("sample_from_posterior must be called before get_prevalence_samples")
return self._samples[_bayesian.P_TEST_Y]
def get_conditional_probability_samples(self):
if self._samples is None:
raise ValueError("sample_from_posterior must be called before get_conditional_probability_samples")
return self._samples[_bayesian.P_C_COND_Y]
def aggregate(self, classif_predictions):
samples = self.sample_from_posterior(classif_predictions)[_bayesian.P_TEST_Y]
return np.asarray(samples.mean(axis=0), dtype=float)
class HDy(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): class HDy(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
""" """
`Hellinger Distance y <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDy). `Hellinger Distance y <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDy).

View File

@ -1,6 +1,11 @@
import numpy as np import numpy as np
from sklearn.base import BaseEstimator
from sklearn.metrics import confusion_matrix
import quapy as qp import quapy as qp
import quapy.functional as F import quapy.functional as F
from quapy.method import _bayesian
from quapy.method.aggregative import AggregativeCrispQuantifier
from quapy.data import LabelledCollection from quapy.data import LabelledCollection
from quapy.method.aggregative import AggregativeQuantifier from quapy.method.aggregative import AggregativeQuantifier
from scipy.stats import chi2 from scipy.stats import chi2
@ -80,6 +85,7 @@ class WithConfidenceABC(ABC):
""" """
Abstract class for confidence regions. Abstract class for confidence regions.
""" """
METHODS = ['intervals', 'ellipse', 'ellipse-clr']
@abstractmethod @abstractmethod
def quantify_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC): def quantify_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
@ -94,6 +100,30 @@ class WithConfidenceABC(ABC):
""" """
... ...
@classmethod
def construct_region(cls, prev_estims, confidence_level=0.95, method='intervals'):
    """
    Builds the confidence region that corresponds to a set of prevalence estimations.

    :param prev_estims: np.ndarray of shape (n_estims, n_classes)
    :param confidence_level: float, the confidence level for the region (default 0.95)
    :param method: str, the type of region to build: `intervals` (default) for confidence
        intervals, `ellipse` for an ellipse in the probability simplex, or `ellipse-clr`
        for an ellipse in the Centered-Log Ratio (CLR) unconstrained space.
    :return: an instance of `ConfidenceIntervals`, `ConfidenceEllipseSimplex`, or
        `ConfidenceEllipseCLR`, depending on `method`
    :raises NotImplementedError: if `method` is not one of the three values above
    """
    # one guard clause per supported region type; anything else falls through to the error
    if method == 'intervals':
        return ConfidenceIntervals(prev_estims, confidence_level=confidence_level)
    if method == 'ellipse':
        return ConfidenceEllipseSimplex(prev_estims, confidence_level=confidence_level)
    if method == 'ellipse-clr':
        return ConfidenceEllipseCLR(prev_estims, confidence_level=confidence_level)
    raise NotImplementedError(f'unknown method {method}')
def simplex_volume(n): def simplex_volume(n):
""" """
@ -239,7 +269,10 @@ class ConfidenceIntervals(ConfidenceRegionABC):
X = np.asarray(X) X = np.asarray(X)
self.means_ = X.mean(axis=0) self.means_ = X.mean(axis=0)
self.I_low, self.I_high = np.percentile(X, q=[2.5, 97.5], axis=0) alpha = 1-confidence_level
low_perc = (alpha/2.)*100
high_perc = (1-alpha/2.)*100
self.I_low, self.I_high = np.percentile(X, q=[low_perc, high_perc], axis=0)
def point_estimate(self): def point_estimate(self):
""" """
@ -312,20 +345,18 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
:para n_test_samples: int, the number of test resamplings (defaults to 500, set to > 1 to activate a :para n_test_samples: int, the number of test resamplings (defaults to 500, set to > 1 to activate a
population-based bootstrap approach) population-based bootstrap approach)
:param confidence_level: float, the confidence level for the confidence region (default 0.95) :param confidence_level: float, the confidence level for the confidence region (default 0.95)
:param method: string, set to `intervals` for constructing confidence intervals (default), or to :param region: string, set to `intervals` for constructing confidence intervals (default), or to
`ellipse` for constructing an ellipse in the probability simplex, or to `ellipse-clr` for `ellipse` for constructing an ellipse in the probability simplex, or to `ellipse-clr` for
constructing an ellipse in the Centered-Log Ratio (CLR) unconstrained space. constructing an ellipse in the Centered-Log Ratio (CLR) unconstrained space.
:param random_state: int for replicating samples, None (default) for non-replicable samples :param random_state: int for replicating samples, None (default) for non-replicable samples
""" """
METHODS = ['intervals', 'ellipse', 'ellipse-clr']
def __init__(self, def __init__(self,
quantifier: AggregativeQuantifier, quantifier: AggregativeQuantifier,
n_train_samples=1, n_train_samples=1,
n_test_samples=500, n_test_samples=500,
confidence_level=0.95, confidence_level=0.95,
method='intervals', region='intervals',
random_state=None): random_state=None):
assert isinstance(quantifier, AggregativeQuantifier), \ assert isinstance(quantifier, AggregativeQuantifier), \
@ -336,30 +367,14 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
f'{n_test_samples=} must be >= 1' f'{n_test_samples=} must be >= 1'
assert n_test_samples>1 or n_train_samples>1, \ assert n_test_samples>1 or n_train_samples>1, \
f'either {n_test_samples=} or {n_train_samples=} must be >1' f'either {n_test_samples=} or {n_train_samples=} must be >1'
assert method in self.METHODS, \
f'unknown method; valid ones are {self.METHODS}'
self.quantifier = quantifier self.quantifier = quantifier
self.n_train_samples = n_train_samples self.n_train_samples = n_train_samples
self.n_test_samples = n_test_samples self.n_test_samples = n_test_samples
self.confidence_level = confidence_level self.confidence_level = confidence_level
self.method = method self.region = region
self.random_state = random_state self.random_state = random_state
def _return_conf(self, prevs, confidence_level):
region = None
if self.method == 'intervals':
region = ConfidenceIntervals(prevs, confidence_level=confidence_level)
elif self.method == 'ellipse':
region = ConfidenceEllipseSimplex(prevs, confidence_level=confidence_level)
elif self.method == 'ellipse-clr':
region = ConfidenceEllipseCLR(prevs, confidence_level=confidence_level)
if region is None:
raise NotImplementedError(f'unknown method {self.method}')
return region
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
self.quantifiers = [] self.quantifiers = []
if self.n_train_samples==1: if self.n_train_samples==1:
@ -395,7 +410,7 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
prev_i = quantifier.aggregate(sample_i) prev_i = quantifier.aggregate(sample_i)
prevs.append(prev_i) prevs.append(prev_i)
conf = self._return_conf(prevs, confidence_level) conf = WithConfidenceABC.construct_region(prevs, confidence_level, method=self.region)
prev_estim = conf.point_estimate() prev_estim = conf.point_estimate()
return prev_estim, conf return prev_estim, conf
@ -416,3 +431,111 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
def _classifier_method(self): def _classifier_method(self):
return self.quantifier._classifier_method() return self.quantifier._classifier_method()
class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
    """
    `Bayesian quantification <https://arxiv.org/abs/2302.09159>`_ method,
    which is a variant of :class:`ACC` that calculates the posterior probability distribution
    over the prevalence vectors, rather than providing a point estimate obtained
    by matrix inversion.

    Can be used to diagnose degeneracy in the predictions visible when the confusion
    matrix has high condition number or to quantify uncertainty around the point estimate.

    This method relies on extra dependencies, which have to be installed via:
    `$ pip install quapy[bayes]`

    :param classifier: a sklearn's Estimator that generates a classifier
    :param val_split: specifies the data used for generating the classifier predictions
        (default 5). The value is stored as given and is not validated by this class.
        NOTE(review): an integer presumably indicates the number of folds of a k-fold
        cross-validation, as in other aggregative quantifiers -- confirm.
    :param num_warmup: number of warmup iterations for the MCMC sampler (default 500)
    :param num_samples: number of samples to draw from the posterior (default 1000)
    :param mcmc_seed: random seed for the MCMC sampler (default 0)
    :param confidence_level: float in [0,1] to construct a confidence region around the point estimate
        (default 0.95); used by :meth:`quantify_conf` whenever no explicit level is passed to it
    :param region: string, set to `intervals` for constructing confidence intervals (default), or to
        `ellipse` for constructing an ellipse in the probability simplex, or to `ellipse-clr` for
        constructing an ellipse in the Centered-Log Ratio (CLR) unconstrained space.
    """

    def __init__(self,
                 classifier: BaseEstimator=None,
                 val_split: int = 5,
                 num_warmup: int = 500,
                 num_samples: int = 1_000,
                 mcmc_seed: int = 0,
                 confidence_level: float = 0.95,
                 region: str = 'intervals'):

        if num_warmup <= 0:
            raise ValueError(f'parameter {num_warmup=} must be a positive integer')
        if num_samples <= 0:
            raise ValueError(f'parameter {num_samples=} must be a positive integer')
        # NOTE: val_split is not validated here anymore; the former "float in (0, 1)" check
        # is incompatible with the current default (the integer 5)

        if _bayesian.DEPENDENCIES_INSTALLED is False:
            raise ImportError("Auxiliary dependencies are required. Run `$ pip install quapy[bayes]` to install them.")

        self.classifier = qp._get_classifier(classifier)
        self.val_split = val_split
        self.num_warmup = num_warmup
        self.num_samples = num_samples
        self.mcmc_seed = mcmc_seed
        self.confidence_level = confidence_level
        self.region = region

        # Array of shape (n_classes, n_predicted_classes,) where entry (y, c) is the number of instances
        # labeled as class y and predicted as class c.
        # By default, this array is set to None and later defined as part of the `aggregation_fit` phase
        self._n_and_c_labeled = None

        # Dictionary with posterior samples, set when `sample_from_posterior` is called
        # (which `aggregate` does internally).
        self._samples = None

    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
        """
        Estimates the misclassification rates.

        :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing,
            as instances, the label predictions issued by the classifier and, as labels, the true labels
        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
        """
        pred_labels, true_labels = classif_predictions.Xy
        self._n_and_c_labeled = confusion_matrix(
            y_true=true_labels,
            y_pred=pred_labels,
            labels=self.classifier.classes_
        ).astype(float)

    def sample_from_posterior(self, classif_predictions):
        """
        Draws samples from the posterior given the label predictions for the unlabeled data.
        The samples are cached in `self._samples` (overwriting any previous ones) and returned.

        :param classif_predictions: the label predictions issued by the classifier for the test instances
        :return: the dictionary of posterior samples generated by `_bayesian.sample_posterior`
        :raises ValueError: if `aggregation_fit` has not been called before
        """
        if self._n_and_c_labeled is None:
            raise ValueError("aggregation_fit must be called before sample_from_posterior")

        # number of test instances predicted as belonging to each class
        n_c_unlabeled = F.counts_from_labels(classif_predictions, self.classifier.classes_).astype(float)

        self._samples = _bayesian.sample_posterior(
            n_c_unlabeled=n_c_unlabeled,
            n_y_and_c_labeled=self._n_and_c_labeled,
            num_warmup=self.num_warmup,
            num_samples=self.num_samples,
            seed=self.mcmc_seed,
        )
        return self._samples

    def get_prevalence_samples(self):
        """
        Returns the posterior samples stored under the key `_bayesian.P_TEST_Y`
        by the last call to `sample_from_posterior`.

        :return: the samples of the test prevalence
            (presumably of shape (num_samples, n_classes) -- TODO confirm)
        :raises ValueError: if `sample_from_posterior` has not been called before
        """
        if self._samples is None:
            raise ValueError("sample_from_posterior must be called before get_prevalence_samples")
        return self._samples[_bayesian.P_TEST_Y]

    def get_conditional_probability_samples(self):
        """
        Returns the posterior samples stored under the key `_bayesian.P_C_COND_Y`
        by the last call to `sample_from_posterior`.

        :return: the samples of the conditional probabilities
        :raises ValueError: if `sample_from_posterior` has not been called before
        """
        if self._samples is None:
            raise ValueError("sample_from_posterior must be called before get_conditional_probability_samples")
        return self._samples[_bayesian.P_C_COND_Y]

    def aggregate(self, classif_predictions):
        """
        Computes the point estimate of the prevalence values as the mean (along the first axis)
        of the prevalence samples drawn from the posterior.

        :param classif_predictions: the label predictions issued by the classifier for the test instances
        :return: np.ndarray of floats with the mean of the posterior prevalence samples
        """
        samples = self.sample_from_posterior(classif_predictions)[_bayesian.P_TEST_Y]
        return np.asarray(samples.mean(axis=0), dtype=float)

    def quantify_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
        """
        Returns the point estimate of the prevalence values together with a confidence region
        built from the posterior prevalence samples.

        :param instances: the test instances to quantify
        :param confidence_level: float, the confidence level for the region; if None (default),
            the `confidence_level` indicated in the constructor is used
        :return: a tuple (`point_estimate`, `region`), where `region` is built via
            `WithConfidenceABC.construct_region` using the method indicated in `self.region`
        """
        # honour the per-call confidence level; fall back to the one set at construction time
        if confidence_level is None:
            confidence_level = self.confidence_level

        classif_predictions = self.classify(instances)
        point_estimate = self.aggregate(classif_predictions)
        samples = self.get_prevalence_samples()  # available after calling "aggregate" function
        region = WithConfidenceABC.construct_region(samples, confidence_level=confidence_level, method=self.region)
        return point_estimate, region