documented confidence.py
This commit is contained in:
parent
ce4c0006d5
commit
a0c84c5510
|
|
@ -1,38 +1,75 @@
|
||||||
from functools import cached_property
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import quapy as qp
|
import quapy as qp
|
||||||
import quapy.functional as F
|
import quapy.functional as F
|
||||||
from quapy.data import LabelledCollection
|
from quapy.data import LabelledCollection
|
||||||
from quapy.method.aggregative import AggregativeQuantifier
|
from quapy.method.aggregative import AggregativeQuantifier
|
||||||
from scipy.stats import chi2
|
from scipy.stats import chi2
|
||||||
from scipy.special import gamma
|
|
||||||
from sklearn.utils import resample
|
from sklearn.utils import resample
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from scipy.special import softmax, factorial
|
from scipy.special import softmax, factorial
|
||||||
import copy
|
import copy
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
|
|
||||||
|
"""
|
||||||
|
This module provides implementations of different types of confidence regions, as well as an implementation of Bootstrap
|
||||||
|
for AggregativeQuantifiers.
|
||||||
|
"""
|
||||||
|
|
||||||
class ConfidenceRegionABC(ABC):
|
class ConfidenceRegionABC(ABC):
|
||||||
|
"""
|
||||||
|
Abstract class of confidence regions
|
||||||
|
"""
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def point_estimate(self) -> np.ndarray:
|
def point_estimate(self) -> np.ndarray:
|
||||||
|
"""
|
||||||
|
Returns the point estimate corresponding to a set of bootstrap estimates.
|
||||||
|
|
||||||
|
:return: np.ndarray
|
||||||
|
"""
|
||||||
...
|
...
|
||||||
|
|
||||||
def ndim(self):
|
def ndim(self) -> int:
|
||||||
|
"""
|
||||||
|
Number of dimensions of the region. This number corresponds to the total number of classes. The dimensionality
|
||||||
|
of the simplex is therefore ndim-1
|
||||||
|
|
||||||
|
:return: int
|
||||||
|
"""
|
||||||
return len(self.point_estimate())
|
return len(self.point_estimate())
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def coverage(self, true_value):
|
def coverage(self, true_value) -> float:
|
||||||
|
"""
|
||||||
|
Checks whether a value, or a set of values, is contained in the confidence region. The method computes the
|
||||||
|
fraction of these that are contained in the region, if more than one value is passed. If only one value is
|
||||||
|
passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.
|
||||||
|
|
||||||
|
:param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
|
||||||
|
:return: float in [0,1]
|
||||||
|
"""
|
||||||
...
|
...
|
||||||
|
|
||||||
@lru_cache
|
@lru_cache
|
||||||
def simplex_portion(self):
|
def simplex_portion(self):
|
||||||
|
"""
|
||||||
|
Computes the fraction of the simplex which is covered by the region. This is not the volume of the region
|
||||||
|
itself (which could lie outside the boundaries of the simplex), but the actual fraction of the simplex
|
||||||
|
contained in the region. A default implementation, based on Monte Carlo approximation, is provided.
|
||||||
|
|
||||||
|
:return: float, the fraction of the simplex covered by the region
|
||||||
|
"""
|
||||||
return self.montecarlo_proportion()
|
return self.montecarlo_proportion()
|
||||||
|
|
||||||
@lru_cache
|
@lru_cache
|
||||||
def montecarlo_proportion(self, n_trials=10_000):
|
def montecarlo_proportion(self, n_trials=10_000):
|
||||||
|
"""
|
||||||
|
Estimates, via a Monte Carlo approach, the fraction of the simplex covered by the region. This is carried
|
||||||
|
out by returning the fraction of the `n_trials` points, uniformly drawn at random from the simplex, that
|
||||||
|
are included in the region. The value is only computed once when multiple calls are made.
|
||||||
|
|
||||||
|
:return: float in [0,1]
|
||||||
|
"""
|
||||||
with qp.util.temp_seed(0):
|
with qp.util.temp_seed(0):
|
||||||
uniform_simplex = F.uniform_simplex_sampling(n_classes=self.ndim(), size=n_trials)
|
uniform_simplex = F.uniform_simplex_sampling(n_classes=self.ndim(), size=n_trials)
|
||||||
proportion = np.clip(self.coverage(uniform_simplex), 0., 1.)
|
proportion = np.clip(self.coverage(uniform_simplex), 0., 1.)
|
||||||
|
|
@ -40,12 +77,32 @@ class ConfidenceRegionABC(ABC):
|
||||||
|
|
||||||
|
|
||||||
class WithConfidenceABC(ABC):
|
class WithConfidenceABC(ABC):
|
||||||
|
"""
|
||||||
|
Abstract class for quantifiers that return, along with the point estimate, a confidence region around it.
|
||||||
|
"""
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def quantify_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
|
def quantify_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC):
|
||||||
|
"""
|
||||||
|
Adds the method `quantify_conf` to the interface. This method returns not only the point-estimate, but
|
||||||
|
also the confidence region around it.
|
||||||
|
|
||||||
|
:param instances: a np.ndarray of shape (n_instances, n_features,)
|
||||||
|
:param confidence_level: float in (0, 1)
|
||||||
|
:return: a tuple (`point_estimate`, `conf_region`), where `point_estimate` is a np.ndarray of shape
|
||||||
|
(n_classes,) and `conf_region` is an object from :class:`ConfidenceRegionABC`
|
||||||
|
"""
|
||||||
...
|
...
|
||||||
|
|
||||||
|
|
||||||
def simplex_volume(n):
|
def simplex_volume(n):
|
||||||
|
"""
|
||||||
|
Computes the volume of the n-dimensional simplex. For n classes, the corresponding volume
|
||||||
|
is :meth:`simplex_volume(n-1)` since the simplex has one degree of freedom less.
|
||||||
|
|
||||||
|
:param n: int, the dimensionality of the simplex
|
||||||
|
:return: float, the volume of the n-dimensional simplex
|
||||||
|
"""
|
||||||
return 1 / factorial(n)
|
return 1 / factorial(n)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -54,17 +111,16 @@ def within_ellipse_prop(values, mean, prec_matrix, chi2_critical):
|
||||||
Checks the proportion of values that belong to the ellipse with center `mean` and precision matrix `prec_matrix`
|
Checks the proportion of values that belong to the ellipse with center `mean` and precision matrix `prec_matrix`
|
||||||
at a distance `chi2_critical`.
|
at a distance `chi2_critical`.
|
||||||
|
|
||||||
:param values: a np.ndarray with shape (ndim,) or (n_values,ndim,)
|
:param values: a np.ndarray of shape (n_dim,) or (n_values, n_dim,)
|
||||||
:param mean: a np.ndarray with the mean of the sample
|
:param mean: a np.ndarray of shape (n_dim,) with the center of the ellipse
|
||||||
:param prec_matrix: a np.ndarray with the precision matrix (inverse of the
|
:param prec_matrix: a np.ndarray with the precision matrix (inverse of the
|
||||||
covariance matrix) of the sample. If this inverse cannot be computed
|
covariance matrix) of the ellipse. If this inverse cannot be computed
|
||||||
then None must be passed
|
then None must be passed
|
||||||
:param chi2_critical: the chi2 critical value
|
:param chi2_critical: float, the chi2 critical value
|
||||||
|
|
||||||
:return: the fraction of values that are contained in the ellipse
|
:return: float in [0,1], the fraction of values that are contained in the ellipse
|
||||||
defined by the mean, the precision matrix, and the chi2_critical.
|
defined by the mean (center), the precision matrix (shape), and the chi2_critical value (distance).
|
||||||
If values is only one value, then either 0 (not contained) or
|
If `values` is only one value, then either 0. (not contained) or 1. (contained) is returned.
|
||||||
1 (contained) is returned.
|
|
||||||
"""
|
"""
|
||||||
if prec_matrix is None:
|
if prec_matrix is None:
|
||||||
return 0.
|
return 0.
|
||||||
|
|
@ -84,6 +140,12 @@ def within_ellipse_prop(values, mean, prec_matrix, chi2_critical):
|
||||||
|
|
||||||
|
|
||||||
class ConfidenceEllipseSimplex(ConfidenceRegionABC):
|
class ConfidenceEllipseSimplex(ConfidenceRegionABC):
|
||||||
|
"""
|
||||||
|
Instantiates a Confidence Ellipse in the probability simplex.
|
||||||
|
|
||||||
|
:param X: np.ndarray of shape (n_bootstrap_samples, n_classes)
|
||||||
|
:param confidence_level: float, the confidence level (default 0.95)
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, X, confidence_level=0.95):
|
def __init__(self, X, confidence_level=0.95):
|
||||||
|
|
||||||
|
|
@ -107,20 +169,32 @@ class ConfidenceEllipseSimplex(ConfidenceRegionABC):
|
||||||
self.chi2_critical_ = chi2.ppf(confidence_level, df=self.ddof)
|
self.chi2_critical_ = chi2.ppf(confidence_level, df=self.ddof)
|
||||||
|
|
||||||
def point_estimate(self):
|
def point_estimate(self):
|
||||||
|
"""
|
||||||
|
Returns the point estimate, the center of the ellipse.
|
||||||
|
|
||||||
|
:return: np.ndarray of shape (n_classes,)
|
||||||
|
"""
|
||||||
return self.mean_
|
return self.mean_
|
||||||
|
|
||||||
def coverage(self, true_value):
|
def coverage(self, true_value):
|
||||||
"""
|
"""
|
||||||
true_value can be an array (n_dimensions,) or a matrix (n_vectors, n_dimensions,)
|
Checks whether a value, or a set of values, is contained in the confidence region. The method computes the
|
||||||
confidence_level None means that the confidence_level is taken from the __init__
|
fraction of these that are contained in the region, if more than one value is passed. If only one value is
|
||||||
returns true or false depending on whether true_value is in the ellipse or not,
|
passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.
|
||||||
or returns the proportion of true_values that are within the ellipse if more
|
|
||||||
than one are passed
|
:param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
|
||||||
|
:return: float in [0,1]
|
||||||
"""
|
"""
|
||||||
return within_ellipse_prop(true_value, self.mean_, self.precision_matrix_, self.chi2_critical_)
|
return within_ellipse_prop(true_value, self.mean_, self.precision_matrix_, self.chi2_critical_)
|
||||||
|
|
||||||
|
|
||||||
class ConfidenceEllipseCLR(ConfidenceRegionABC):
|
class ConfidenceEllipseCLR(ConfidenceRegionABC):
|
||||||
|
"""
|
||||||
|
Instantiates a Confidence Ellipse in the Centered-Log Ratio (CLR) space.
|
||||||
|
|
||||||
|
:param X: np.ndarray of shape (n_bootstrap_samples, n_classes)
|
||||||
|
:param confidence_level: float, the confidence level (default 0.95)
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, X, confidence_level=0.95):
|
def __init__(self, X, confidence_level=0.95):
|
||||||
self.clr = CLRtransformation()
|
self.clr = CLRtransformation()
|
||||||
|
|
@ -129,26 +203,36 @@ class ConfidenceEllipseCLR(ConfidenceRegionABC):
|
||||||
self.conf_region_clr = ConfidenceEllipseSimplex(Z, confidence_level=confidence_level)
|
self.conf_region_clr = ConfidenceEllipseSimplex(Z, confidence_level=confidence_level)
|
||||||
|
|
||||||
def point_estimate(self):
|
def point_estimate(self):
|
||||||
# Z_mean = self.conf_region_clr.mean()
|
"""
|
||||||
# return self.clr.inverse(Z_mean)
|
Returns the point estimate, the center of the ellipse.
|
||||||
# the inverse of the CLR does not coincide with the clean mean because the geometric mean
|
|
||||||
# requires smoothing the prevalence vectors and this affects the softmax (inverse)
|
:return: np.ndarray of shape (n_classes,)
|
||||||
|
"""
|
||||||
|
# The inverse of the CLR does not coincide with the true mean, because the geometric mean
|
||||||
|
# requires smoothing the prevalence vectors and this affects the softmax (inverse);
|
||||||
|
# return self.clr.inverse(self.mean_) # <- does not coincide
|
||||||
return self.mean_
|
return self.mean_
|
||||||
|
|
||||||
def coverage(self, true_value):
|
def coverage(self, true_value):
|
||||||
"""
|
"""
|
||||||
true_value can be an array (n_dimensions,) or a matrix (n_vectors, n_dimensions,)
|
Checks whether a value, or a set of values, is contained in the confidence region. The method computes the
|
||||||
confidence_level None means that the confidence_level is taken from the __init__
|
fraction of these that are contained in the region, if more than one value is passed. If only one value is
|
||||||
returns true or false depending on whether true_value is in the ellipse or not,
|
passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.
|
||||||
or returns the proportion of true_values that are within the ellipse if more
|
|
||||||
than one are passed
|
:param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
|
||||||
|
:return: float in [0,1]
|
||||||
"""
|
"""
|
||||||
transformed_values = self.clr(true_value)
|
transformed_values = self.clr(true_value)
|
||||||
return self.conf_region_clr.coverage(transformed_values)
|
return self.conf_region_clr.coverage(transformed_values)
|
||||||
|
|
||||||
|
|
||||||
class ConfidenceIntervals(ConfidenceRegionABC):
|
class ConfidenceIntervals(ConfidenceRegionABC):
|
||||||
|
"""
|
||||||
|
Instantiates a region based on (independent) Confidence Intervals.
|
||||||
|
|
||||||
|
:param X: np.ndarray of shape (n_bootstrap_samples, n_classes)
|
||||||
|
:param confidence_level: float, the confidence level (default 0.95)
|
||||||
|
"""
|
||||||
def __init__(self, X, confidence_level=0.95):
|
def __init__(self, X, confidence_level=0.95):
|
||||||
assert 0 < confidence_level < 1, f'{confidence_level=} must be in range(0,1)'
|
assert 0 < confidence_level < 1, f'{confidence_level=} must be in range(0,1)'
|
||||||
|
|
||||||
|
|
@ -158,14 +242,21 @@ class ConfidenceIntervals(ConfidenceRegionABC):
|
||||||
self.I_low, self.I_high = np.percentile(X, q=[2.5, 97.5], axis=0)
|
self.I_low, self.I_high = np.percentile(X, q=[2.5, 97.5], axis=0)
|
||||||
|
|
||||||
def point_estimate(self):
|
def point_estimate(self):
|
||||||
|
"""
|
||||||
|
Returns the point estimate, the class-wise average of the bootstrapped estimates
|
||||||
|
|
||||||
|
:return: np.ndarray of shape (n_classes,)
|
||||||
|
"""
|
||||||
return self.means_
|
return self.means_
|
||||||
|
|
||||||
def coverage(self, true_value):
|
def coverage(self, true_value):
|
||||||
"""
|
"""
|
||||||
true_value can be an array (n_dimensions,) or a matrix (n_vectors, n_dimensions,)
|
Checks whether a value, or a set of values, is contained in the confidence region. The method computes the
|
||||||
returns true or false depending on whether true_value is in the ellipse or not,
|
fraction of these that are contained in the region, if more than one value is passed. If only one value is
|
||||||
or returns the proportion of true_values that are within the ellipse if more
|
passed, then it either returns 1.0 or 0.0, for indicating the value is in the region or not, respectively.
|
||||||
than one are passed
|
|
||||||
|
:param true_value: a np.ndarray of shape (n_classes,) or shape (n_values, n_classes,)
|
||||||
|
:return: float in [0,1]
|
||||||
"""
|
"""
|
||||||
within_intervals = np.logical_and(self.I_low <= true_value, true_value <= self.I_high)
|
within_intervals = np.logical_and(self.I_low <= true_value, true_value <= self.I_high)
|
||||||
within_all_intervals = np.all(within_intervals, axis=-1, keepdims=True)
|
within_all_intervals = np.all(within_intervals, axis=-1, keepdims=True)
|
||||||
|
|
@ -176,20 +267,56 @@ class ConfidenceIntervals(ConfidenceRegionABC):
|
||||||
|
|
||||||
class CLRtransformation:
|
class CLRtransformation:
|
||||||
"""
|
"""
|
||||||
Centered log-ratio
|
Centered log-ratio (CLR) transformation, from compositional data analysis
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __call__(self, X, epsilon=1e-6):
|
def __call__(self, X, epsilon=1e-6):
|
||||||
|
"""
|
||||||
|
Applies the CLR function to X, thus mapping the instances, which are contained in :math:`\\mathcal{R}^{n}` but
|
||||||
|
actually lie on an :math:`(n-1)`-dimensional simplex, onto an unrestricted space in :math:`\\mathcal{R}^{n}`
|
||||||
|
|
||||||
|
:param X: np.ndarray of (n_instances, n_dimensions) to be transformed
|
||||||
|
:param epsilon: small float for prevalence smoothing
|
||||||
|
:return: np.ndarray of (n_instances, n_dimensions), the CLR-transformed points
|
||||||
|
"""
|
||||||
X = np.asarray(X)
|
X = np.asarray(X)
|
||||||
X = qp.error.smooth(X, epsilon)
|
X = qp.error.smooth(X, epsilon)
|
||||||
G = np.exp(np.mean(np.log(X), axis=-1, keepdims=True)) # geometric mean
|
G = np.exp(np.mean(np.log(X), axis=-1, keepdims=True)) # geometric mean
|
||||||
return np.log(X / G)
|
return np.log(X / G)
|
||||||
|
|
||||||
def inverse(self, X):
|
def inverse(self, X):
|
||||||
|
"""
|
||||||
|
Inverse function. However, clr.inverse(clr(X)) does not exactly coincide with X due to smoothing.
|
||||||
|
|
||||||
|
:param X: np.ndarray of (n_instances, n_dimensions) to be transformed
|
||||||
|
:return: np.ndarray of (n_instances, n_dimensions), the points mapped back onto the probability simplex (via softmax)
|
||||||
|
"""
|
||||||
return softmax(X, axis=-1)
|
return softmax(X, axis=-1)
|
||||||
|
|
||||||
|
|
||||||
class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
|
class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
|
||||||
|
"""
|
||||||
|
Aggregative Bootstrap allows any AggregativeQuantifier to get confidence regions around
|
||||||
|
point-estimates of class prevalence values. This method implements some optimizations for
|
||||||
|
speeding up the computations, which are only possible due to the two phases of the aggregative
|
||||||
|
quantifiers.
|
||||||
|
|
||||||
|
During training, the bootstrap repetitions are only carried out over pre-classified training instances,
|
||||||
|
after the classifier has been trained (only once), in order to train a series of aggregation
|
||||||
|
functions (model-based approach).
|
||||||
|
|
||||||
|
During inference, the bootstrap repetitions are applied to the pre-classified test instances.
|
||||||
|
|
||||||
|
:param quantifier: an aggregative quantifier
|
||||||
|
:param n_train_samples: int, the number of training resamplings (defaults to 1, set to > 1 to activate a
|
||||||
|
model-based bootstrap approach)
|
||||||
|
:param n_test_samples: int, the number of test resamplings (defaults to 500, set to > 1 to activate a
|
||||||
|
population-based bootstrap approach)
|
||||||
|
:param confidence_level: float, the confidence level for the confidence region (default 0.95)
|
||||||
|
:param method: string, set to `intervals` for constructing confidence intervals (default), or to
|
||||||
|
`ellipse` for constructing an ellipse in the probability simplex, or to `ellipse-clr` for
|
||||||
|
constructing an ellipse in the Centered-Log Ratio (CLR) unconstrained space.
|
||||||
|
:param random_state: int for replicating samples, None (default) for non-replicable samples
|
||||||
|
"""
|
||||||
|
|
||||||
METHODS = ['intervals', 'ellipse', 'ellipse-clr']
|
METHODS = ['intervals', 'ellipse', 'ellipse-clr']
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue