documented confidence.py

2024-11-29 13:46:46 +01:00 · 2024-11-29 13:46:46 +01:00 · a0c84c5510
parent ce4c0006d5
commit a0c84c5510
1 changed files with 160 additions and 33 deletions
--- a/quapy/method/confidence.py
+++ b/quapy/method/confidence.py
@ -1,38 +1,75 @@
 from functools import cached_property
 import numpy as np
 import quapy as qp
 import quapy.functional as F
 from quapy.data import LabelledCollection
 from quapy.method.aggregative import AggregativeQuantifier
 from scipy.stats import chi2
 from scipy.special import gamma
 from sklearn.utils import resample
 from abc import ABC, abstractmethod
 from scipy.special import softmax, factorial
 import copy
 from functools import lru_cache
-
+"""
 This module provides implementation of different types of confidence regions, and the implementation of Bootstrap
 for AggregativeQuantifiers.
 """
 class ConfidenceRegionABC(ABC):
    """
    Abstract class of confidence regions
    """
    @abstractmethod
    def point_estimate(self) -> np.ndarray:
        """
        Returns the point estimate corresponding to a set of bootstrap estimates.
        :return: np.ndarray
        """
        ...
-    def ndim(self):
+    def ndim(self) -> int:
        """
        Number of dimensions of the region. This number corresponds to the total number of classes. The dimensionality
        of the simplex is therefore ndim-1
        :return: int
        """
        return len(self.point_estimate())
    @abstractmethod
-    def coverage(self, true_value):
+    def coverage(self, true_value) -> float:
        """
        Checks whether a value, or a sets of values, are contained in the confidence region. The method computes the
        fraction of these that are contained in the region, if more than one value is passed. If only one value is
        passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.
        :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
        :return: float in [0,1]
        """
        ...
    @lru_cache
    def simplex_portion(self):
        """
        Computes the fraction of the simplex which is covered by the region. This is not the volume of the region
        itself (which could lie outside the boundaries of the simplex), but the actual fraction of the simplex
        contained in the region. A default implementation, based on Monte Carlo approximation, is provided.
        :return: float, the fraction of the simplex covered by the region
        """
        return self.montecarlo_proportion()
    @lru_cache
    def montecarlo_proportion(self, n_trials=10_000):
        """
        Estimates, via a Monte Carlo approach, the fraction of the simplex covered by the region. This is carried
        out by returning the fraction of the `n_trials` points, uniformly drawn at random from the simplex, that
        are included in the region. The value is only computed once when multiple calls are made.
        :return: float in [0,1]
        """
        with qp.util.temp_seed(0):
            uniform_simplex = F.uniform_simplex_sampling(n_classes=self.ndim(), size=n_trials)
        proportion = np.clip(self.coverage(uniform_simplex), 0., 1.)
@ -40,12 +77,32 @@ class ConfidenceRegionABC(ABC):
 class WithConfidenceABC(ABC):
    """
    Abstract class for confidence regions.
    """
    @abstractmethod
    def quantify_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
        """
        Adds the method `quantify_conf` to the interface. This method returns not only the point-estimate, but
        also the confidence region around it.
        :param instances: a np.ndarray of shape (n_instances, n_features,)
        :confidence_level: float in (0, 1)
        :return: a tuple (`point_estimate`, `conf_region`), where `point_estimate` is a np.ndarray of shape
            (n_classes,) and  `conf_region` is an object from :class:`ConfidenceRegionABC`
        """
        ...
 def simplex_volume(n):
    """
    Computes the volume of the n-dimensional simplex. For n classes, the corresponding volume
    is :meth:`simplex_volume(n-1)` since the simplex has one degree of freedom less.
    :param n: int, the dimensionality of the simplex
    :return: float, the volume of the n-dimensional simplex
    """
    return 1 / factorial(n)
@ -54,17 +111,16 @@ def within_ellipse_prop(values, mean, prec_matrix, chi2_critical):
    Checks the proportion of values that belong to the ellipse with center `mean` and precision matrix `prec_matrix`
    at a distance `chi2_critical`.
-    :param values: a np.ndarray with shape (ndim,) or (n_values,ndim,)
+    :param values: a np.ndarray of shape (n_dim,) or (n_values, n_dim,)
-    :param mean: a np.ndarray with the mean of the sample
+    :param mean: a np.ndarray of shape (n_dim,) with the center of the ellipse
    :param prec_matrix: a np.ndarray with the precision matrix (inverse of the
-        covariance matrix) of the sample. If this inverse cannot be computed
+        covariance matrix) of the ellipse. If this inverse cannot be computed
        then None must be passed
-    :param chi2_critical: the chi2 critical value
+    :param chi2_critical: float, the chi2 critical value
-    :return: the fraction of values that are contained in the ellipse
+    :return: float in [0,1], the fraction of values that are contained in the ellipse
-        defined by the mean, the precision matrix, and the chi2_critical.
+        defined by the mean (center), the precision matrix (shape), and the chi2_critical value (distance).
-        If values is only one value, then either 0 (not contained) or
+        If `values` is only one value, then either 0. (not contained) or 1. (contained) is returned.
        1 (contained) is returned.
    """
    if prec_matrix is None:
        return 0.
@ -84,6 +140,12 @@ def within_ellipse_prop(values, mean, prec_matrix, chi2_critical):
 class ConfidenceEllipseSimplex(ConfidenceRegionABC):
    """
    Instantiates a Confidence Ellipse in the probability simplex.
    :param X: np.ndarray of shape (n_bootstrap_samples, n_classes)
    :param confidence_level: float, the confidence level (default 0.95)
    """
    def __init__(self, X, confidence_level=0.95):
@ -107,20 +169,32 @@ class ConfidenceEllipseSimplex(ConfidenceRegionABC):
        self.chi2_critical_ = chi2.ppf(confidence_level, df=self.ddof)
    def point_estimate(self):
        """
        Returns the point estimate, the center of the ellipse.
        :return: np.ndarray of shape (n_classes,)
        """
        return self.mean_
    def coverage(self, true_value):
        """
-        true_value can be an array (n_dimensions,) or a matrix (n_vectors, n_dimensions,)
+        Checks whether a value, or a sets of values, are contained in the confidence region. The method computes the
-        confidence_level None means that the confidence_level is taken from the __init__
+        fraction of these that are contained in the region, if more than one value is passed. If only one value is
-        returns true or false depending on whether true_value is in the ellipse or not,
+        passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.
-            or returns the proportion of true_values that are within the ellipse if more
+
-            than one are passed
+        :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
        :return: float in [0,1]
        """
        return within_ellipse_prop(true_value, self.mean_, self.precision_matrix_, self.chi2_critical_)
 class ConfidenceEllipseCLR(ConfidenceRegionABC):
    """
    Instantiates a Confidence Ellipse in the Centered-Log Ratio (CLR) space.
    :param X: np.ndarray of shape (n_bootstrap_samples, n_classes)
    :param confidence_level: float, the confidence level (default 0.95)
    """
    def __init__(self, X, confidence_level=0.95):
        self.clr = CLRtransformation()
@ -129,26 +203,36 @@ class ConfidenceEllipseCLR(ConfidenceRegionABC):
        self.conf_region_clr = ConfidenceEllipseSimplex(Z, confidence_level=confidence_level)
    def point_estimate(self):
-        # Z_mean = self.conf_region_clr.mean()
+        """
-        # return self.clr.inverse(Z_mean)
+        Returns the point estimate, the center of the ellipse.
-        # the inverse of the CLR does not coincide with the clean mean because the geometric mean
+
-        # requires smoothing the prevalence vectors and this affects the softmax (inverse)
+        :return: np.ndarray of shape (n_classes,)
        """
        # The inverse of the CLR does not coincide with the true mean, because the geometric mean
        # requires smoothing the prevalence vectors and this affects the softmax (inverse);
        # return self.clr.inverse(self.mean_) # <- does not coincide
        return self.mean_
    def coverage(self, true_value):
        """
-        true_value can be an array (n_dimensions,) or a matrix (n_vectors, n_dimensions,)
+        Checks whether a value, or a sets of values, are contained in the confidence region. The method computes the
-        confidence_level None means that the confidence_level is taken from the __init__
+        fraction of these that are contained in the region, if more than one value is passed. If only one value is
-        returns true or false depending on whether true_value is in the ellipse or not,
+        passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.
-            or returns the proportion of true_values that are within the ellipse if more
+
-            than one are passed
+        :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
        :return: float in [0,1]
        """
        transformed_values = self.clr(true_value)
        return self.conf_region_clr.coverage(transformed_values)
 class ConfidenceIntervals(ConfidenceRegionABC):
    """
    Instantiates a region based on (independent) Confidence Intervals.
    :param X: np.ndarray of shape (n_bootstrap_samples, n_classes)
    :param confidence_level: float, the confidence level (default 0.95)
    """
    def __init__(self, X, confidence_level=0.95):
        assert 0 < confidence_level < 1, f'{confidence_level=} must be in range(0,1)'
@ -158,14 +242,21 @@ class ConfidenceIntervals(ConfidenceRegionABC):
        self.I_low, self.I_high = np.percentile(X, q=[2.5, 97.5], axis=0)
    def point_estimate(self):
        """
        Returns the point estimate, the class-wise average of the bootstrapped estimates
        :return: np.ndarray of shape (n_classes,)
        """
        return self.means_
    def coverage(self, true_value):
        """
-        true_value can be an array (n_dimensions,) or a matrix (n_vectors, n_dimensions,)
+        Checks whether a value, or a sets of values, are contained in the confidence region. The method computes the
-        returns true or false depending on whether true_value is in the ellipse or not,
+        fraction of these that are contained in the region, if more than one value is passed. If only one value is
-            or returns the proportion of true_values that are within the ellipse if more
+        passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.
-            than one are passed
+
        :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
        :return: float in [0,1]
        """
        within_intervals = np.logical_and(self.I_low <= true_value, true_value <= self.I_high)
        within_all_intervals = np.all(within_intervals, axis=-1, keepdims=True)
@ -176,20 +267,56 @@ class ConfidenceIntervals(ConfidenceRegionABC):
 class CLRtransformation:
    """
-    Centered log-ratio
+    Centered log-ratio, from component analysis
    """
    def __call__(self, X, epsilon=1e-6):
        """
        Applies the CLR function to X thus mapping the instances, which are contained in `\\mathcal{R}^{n}` but
        actually lie on a `\\mathcal{R}^{n-1}` simplex, onto an unrestricted space in :math:`\\mathcal{R}^{n}`
        :param X: np.ndarray of (n_instances, n_dimensions) to be transformed
        :param epsilon: small float for prevalence smoothing
        :return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points
        """
        X = np.asarray(X)
        X = qp.error.smooth(X, epsilon)
        G = np.exp(np.mean(np.log(X), axis=-1, keepdims=True))  # geometric mean
        return np.log(X / G)
    def inverse(self, X):
        """
        Inverse function. However, clr.inverse(clr(X)) does not exactly coincide with X due to smoothing.
        :param X: np.ndarray of (n_instances, n_dimensions) to be transformed
        :return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points
        """
        return softmax(X, axis=-1)
 class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
    """
    Aggregative Bootstrap allows any AggregativeQuantifier to get confidence regions around
    point-estimates of class prevalence values. This method implements some optimizations for
    speeding up the computations, which are only possible due to the two phases of the aggregative
    quantifiers.
    During training, the bootstrap repetitions are only carried out over pre-classified training instances,
    after the classifier has been trained (only once), in order to train a series of aggregation
    functions (model-based approach).
    During inference, the bootstrap repetitions are applied to the pre-classified test instances.
    :param quantifier: an aggregative quantifier
    :para n_train_samples: int, the number of training resamplings (defaults to 1, set to > 1 to activate a
        model-based bootstrap approach)
    :para n_test_samples: int, the number of test resamplings (defaults to 500, set to > 1 to activate a
        population-based bootstrap approach)
    :param confidence_level: float, the confidence level for the confidence region (default 0.95)
    :param method: string, set to `intervals` for constructing confidence intervals (default), or to
        `ellipse` for constructing an ellipse in the probability simplex, or to `ellipse-clr` for
        constructing an ellipse in the Centered-Log Ratio (CLR) unconstrained space.
    :param random_state: int for replicating samples, None (default) for non-replicable samples
    """
    METHODS = ['intervals', 'ellipse', 'ellipse-clr']