From a0c84c551031326711b586c0d22c6717c594329a Mon Sep 17 00:00:00 2001
From: Alejandro Moreo <alejandro.moreo@isti.cnr.it>
Date: Fri, 29 Nov 2024 13:46:46 +0100
Subject: [PATCH] documented confidence.py

---
 quapy/method/confidence.py | 193 ++++++++++++++++++++++++++++++-------
 1 file changed, 160 insertions(+), 33 deletions(-)

diff --git a/quapy/method/confidence.py b/quapy/method/confidence.py
index aa9336f..66d8d8c 100644
--- a/quapy/method/confidence.py
+++ b/quapy/method/confidence.py
@@ -1,38 +1,75 @@
-from functools import cached_property
 import numpy as np
 import quapy as qp
 import quapy.functional as F
 from quapy.data import LabelledCollection
 from quapy.method.aggregative import AggregativeQuantifier
 from scipy.stats import chi2
-from scipy.special import gamma
 from sklearn.utils import resample
 from abc import ABC, abstractmethod
 from scipy.special import softmax, factorial
 import copy
 from functools import lru_cache
 
-
+"""
+This module provides implementations of different types of confidence regions, and the implementation of Bootstrap
+for AggregativeQuantifiers.
+"""
 
 class ConfidenceRegionABC(ABC):
+    """
+    Abstract class of confidence regions
+    """
 
     @abstractmethod
     def point_estimate(self) -> np.ndarray:
+        """
+        Returns the point estimate corresponding to a set of bootstrap estimates.
+
+        :return: np.ndarray
+        """
         ...
 
-    def ndim(self):
+    def ndim(self) -> int:
+        """
+        Number of dimensions of the region. This number corresponds to the total number of classes. The dimensionality
+        of the simplex is therefore ndim-1
+
+        :return: int
+        """
         return len(self.point_estimate())
 
     @abstractmethod
-    def coverage(self, true_value):
+    def coverage(self, true_value) -> float:
+        """
+        Checks whether a value, or a set of values, is contained in the confidence region. The method computes the
+        fraction of these that are contained in the region, if more than one value is passed. If only one value is
+        passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.
+
+        :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
+        :return: float in [0,1]
+        """
         ...
 
     @lru_cache
     def simplex_portion(self):
+        """
+        Computes the fraction of the simplex which is covered by the region. This is not the volume of the region
+        itself (which could lie outside the boundaries of the simplex), but the actual fraction of the simplex
+        contained in the region. A default implementation, based on Monte Carlo approximation, is provided.
+
+        :return: float, the fraction of the simplex covered by the region
+        """
         return self.montecarlo_proportion()
 
     @lru_cache
     def montecarlo_proportion(self, n_trials=10_000):
+        """
+        Estimates, via a Monte Carlo approach, the fraction of the simplex covered by the region. This is carried
+        out by returning the fraction of the `n_trials` points, uniformly drawn at random from the simplex, that
+        are included in the region. The value is only computed once when multiple calls are made.
+
+        :return: float in [0,1]
+        """
         with qp.util.temp_seed(0):
             uniform_simplex = F.uniform_simplex_sampling(n_classes=self.ndim(), size=n_trials)
         proportion = np.clip(self.coverage(uniform_simplex), 0., 1.)
@@ -40,12 +77,32 @@ class ConfidenceRegionABC(ABC):
 
 
 class WithConfidenceABC(ABC):
+    """
+    Abstract class for quantifiers that return a confidence region along with the point estimate.
+    """
+
     @abstractmethod
     def quantify_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
+        """
+        Adds the method `quantify_conf` to the interface. This method returns not only the point-estimate, but
+        also the confidence region around it.
+
+        :param instances: a np.ndarray of shape (n_instances, n_features,)
+        :param confidence_level: float in (0, 1)
+        :return: a tuple (`point_estimate`, `conf_region`), where `point_estimate` is a np.ndarray of shape
+            (n_classes,) and  `conf_region` is an object from :class:`ConfidenceRegionABC`
+        """
         ...
 
 
 def simplex_volume(n):
+    """
+    Computes the volume of the n-dimensional simplex. For n classes, the corresponding volume
+    is ``simplex_volume(n-1)`` since the simplex has one degree of freedom less.
+
+    :param n: int, the dimensionality of the simplex
+    :return: float, the volume of the n-dimensional simplex
+    """
     return 1 / factorial(n)
 
 
@@ -54,17 +111,16 @@ def within_ellipse_prop(values, mean, prec_matrix, chi2_critical):
     Checks the proportion of values that belong to the ellipse with center `mean` and precision matrix `prec_matrix`
     at a distance `chi2_critical`.
 
-    :param values: a np.ndarray with shape (ndim,) or (n_values,ndim,)
-    :param mean: a np.ndarray with the mean of the sample
+    :param values: a np.ndarray of shape (n_dim,) or (n_values, n_dim,)
+    :param mean: a np.ndarray of shape (n_dim,) with the center of the ellipse
     :param prec_matrix: a np.ndarray with the precision matrix (inverse of the
-        covariance matrix) of the sample. If this inverse cannot be computed
+        covariance matrix) of the ellipse. If this inverse cannot be computed
         then None must be passed
-    :param chi2_critical: the chi2 critical value
+    :param chi2_critical: float, the chi2 critical value
 
-    :return: the fraction of values that are contained in the ellipse
-        defined by the mean, the precision matrix, and the chi2_critical.
-        If values is only one value, then either 0 (not contained) or
-        1 (contained) is returned.
+    :return: float in [0,1], the fraction of values that are contained in the ellipse
+        defined by the mean (center), the precision matrix (shape), and the chi2_critical value (distance).
+        If `values` is only one value, then either 0. (not contained) or 1. (contained) is returned.
     """
     if prec_matrix is None:
         return 0.
@@ -84,6 +140,12 @@ def within_ellipse_prop(values, mean, prec_matrix, chi2_critical):
 
 
 class ConfidenceEllipseSimplex(ConfidenceRegionABC):
+    """
+    Instantiates a Confidence Ellipse in the probability simplex.
+
+    :param X: np.ndarray of shape (n_bootstrap_samples, n_classes)
+    :param confidence_level: float, the confidence level (default 0.95)
+    """
 
     def __init__(self, X, confidence_level=0.95):
 
@@ -107,20 +169,32 @@ class ConfidenceEllipseSimplex(ConfidenceRegionABC):
         self.chi2_critical_ = chi2.ppf(confidence_level, df=self.ddof)
 
     def point_estimate(self):
+        """
+        Returns the point estimate, the center of the ellipse.
+
+        :return: np.ndarray of shape (n_classes,)
+        """
         return self.mean_
 
     def coverage(self, true_value):
         """
-        true_value can be an array (n_dimensions,) or a matrix (n_vectors, n_dimensions,)
-        confidence_level None means that the confidence_level is taken from the __init__
-        returns true or false depending on whether true_value is in the ellipse or not,
-            or returns the proportion of true_values that are within the ellipse if more
-            than one are passed
+        Checks whether a value, or a set of values, is contained in the confidence region. The method computes the
+        fraction of these that are contained in the region, if more than one value is passed. If only one value is
+        passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.
+
+        :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
+        :return: float in [0,1]
         """
         return within_ellipse_prop(true_value, self.mean_, self.precision_matrix_, self.chi2_critical_)
 
 
 class ConfidenceEllipseCLR(ConfidenceRegionABC):
+    """
+    Instantiates a Confidence Ellipse in the Centered-Log Ratio (CLR) space.
+
+    :param X: np.ndarray of shape (n_bootstrap_samples, n_classes)
+    :param confidence_level: float, the confidence level (default 0.95)
+    """
 
     def __init__(self, X, confidence_level=0.95):
         self.clr = CLRtransformation()
@@ -129,26 +203,36 @@ class ConfidenceEllipseCLR(ConfidenceRegionABC):
         self.conf_region_clr = ConfidenceEllipseSimplex(Z, confidence_level=confidence_level)
 
     def point_estimate(self):
-        # Z_mean = self.conf_region_clr.mean()
-        # return self.clr.inverse(Z_mean)
-        # the inverse of the CLR does not coincide with the clean mean because the geometric mean
-        # requires smoothing the prevalence vectors and this affects the softmax (inverse)
+        """
+        Returns the point estimate, the center of the ellipse.
+
+        :return: np.ndarray of shape (n_classes,)
+        """
+        # The inverse of the CLR does not coincide with the true mean, because the geometric mean
+        # requires smoothing the prevalence vectors and this affects the softmax (inverse);
+        # return self.clr.inverse(self.mean_) # <- does not coincide
         return self.mean_
 
     def coverage(self, true_value):
         """
-        true_value can be an array (n_dimensions,) or a matrix (n_vectors, n_dimensions,)
-        confidence_level None means that the confidence_level is taken from the __init__
-        returns true or false depending on whether true_value is in the ellipse or not,
-            or returns the proportion of true_values that are within the ellipse if more
-            than one are passed
+        Checks whether a value, or a set of values, is contained in the confidence region. The method computes the
+        fraction of these that are contained in the region, if more than one value is passed. If only one value is
+        passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.
+
+        :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
+        :return: float in [0,1]
         """
         transformed_values = self.clr(true_value)
         return self.conf_region_clr.coverage(transformed_values)
 
 
 class ConfidenceIntervals(ConfidenceRegionABC):
+    """
+    Instantiates a region based on (independent) Confidence Intervals.
 
+    :param X: np.ndarray of shape (n_bootstrap_samples, n_classes)
+    :param confidence_level: float, the confidence level (default 0.95)
+    """
     def __init__(self, X, confidence_level=0.95):
         assert 0 < confidence_level < 1, f'{confidence_level=} must be in range(0,1)'
 
@@ -158,14 +242,21 @@ class ConfidenceIntervals(ConfidenceRegionABC):
         self.I_low, self.I_high = np.percentile(X, q=[2.5, 97.5], axis=0)
 
     def point_estimate(self):
+        """
+        Returns the point estimate, the class-wise average of the bootstrapped estimates
+
+        :return: np.ndarray of shape (n_classes,)
+        """
         return self.means_
 
     def coverage(self, true_value):
         """
-        true_value can be an array (n_dimensions,) or a matrix (n_vectors, n_dimensions,)
-        returns true or false depending on whether true_value is in the ellipse or not,
-            or returns the proportion of true_values that are within the ellipse if more
-            than one are passed
+        Checks whether a value, or a set of values, is contained in the confidence region. The method computes the
+        fraction of these that are contained in the region, if more than one value is passed. If only one value is
+        passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.
+
+        :param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
+        :return: float in [0,1]
         """
         within_intervals = np.logical_and(self.I_low <= true_value, true_value <= self.I_high)
         within_all_intervals = np.all(within_intervals, axis=-1, keepdims=True)
@@ -176,20 +267,56 @@ class ConfidenceIntervals(ConfidenceRegionABC):
 
 class CLRtransformation:
     """
-    Centered log-ratio
+    Centered log-ratio (CLR) transformation, from compositional data analysis
     """
-
     def __call__(self, X, epsilon=1e-6):
+        """
+        Applies the CLR function to X thus mapping the instances, which are contained in :math:`\\mathcal{R}^{n}` but
+        actually lie on a :math:`\\mathcal{R}^{n-1}` simplex, onto an unrestricted space in :math:`\\mathcal{R}^{n}`
+
+        :param X: np.ndarray of (n_instances, n_dimensions) to be transformed
+        :param epsilon: small float for prevalence smoothing
+        :return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points
+        """
         X = np.asarray(X)
         X = qp.error.smooth(X, epsilon)
         G = np.exp(np.mean(np.log(X), axis=-1, keepdims=True))  # geometric mean
         return np.log(X / G)
 
     def inverse(self, X):
+        """
+        Inverse function. However, clr.inverse(clr(X)) does not exactly coincide with X due to smoothing.
+
+        :param X: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points to be mapped back
+        :return: np.ndarray of (n_instances, n_dimensions), the corresponding points in the probability simplex
+        """
         return softmax(X, axis=-1)
 
 
 class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
+    """
+    Aggregative Bootstrap allows any AggregativeQuantifier to get confidence regions around
+    point-estimates of class prevalence values. This method implements some optimizations for
+    speeding up the computations, which are only possible due to the two phases of the aggregative
+    quantifiers.
+
+    During training, the bootstrap repetitions are only carried out over pre-classified training instances,
+    after the classifier has been trained (only once), in order to train a series of aggregation
+    functions (model-based approach).
+
+    During inference, the bootstrap repetitions are applied to the pre-classified test instances.
+
+    :param quantifier: an aggregative quantifier
+    :param n_train_samples: int, the number of training resamplings (defaults to 1, set to > 1 to activate a
+        model-based bootstrap approach)
+    :param n_test_samples: int, the number of test resamplings (defaults to 500, set to > 1 to activate a
+        population-based bootstrap approach)
+    :param confidence_level: float, the confidence level for the confidence region (default 0.95)
+    :param method: string, set to `intervals` for constructing confidence intervals (default), or to
+        `ellipse` for constructing an ellipse in the probability simplex, or to `ellipse-clr` for
+        constructing an ellipse in the Centered-Log Ratio (CLR) unconstrained space.
+    :param random_state: int for replicating samples, None (default) for non-replicable samples
+    """
 
     METHODS = ['intervals', 'ellipse', 'ellipse-clr']