1
0
Fork 0
QuaPy/quapy/functional.py

123 lines
4.7 KiB
Python

from collections import defaultdict
import numpy as np
import itertools
def artificial_prevalence_sampling(dimensions, n_prevalences=21, repeat=1, return_constrained_dim=False):
s = np.linspace(0., 1., n_prevalences, endpoint=True)
s = [s] * (dimensions - 1)
prevs = [p for p in itertools.product(*s, repeat=1) if sum(p)<=1]
if return_constrained_dim:
prevs = [p+(1-sum(p),) for p in prevs]
prevs = np.asarray(prevs).reshape(len(prevs), -1)
if repeat>1:
prevs = np.repeat(prevs, repeat, axis=0)
return prevs
def prevalence_linspace(n_prevalences=21, repeat=1, smooth_limits_epsilon=0.01):
"""
Produces a uniformly separated values of prevalence. By default, produces an array 21 prevalences, with step 0.05
and with the limits smoothed, i.e.:
[0.01, 0.05, 0.10, 0.15, ..., 0.90, 0.95, 0.99]
:param n_prevalences: the number of prevalence values to sample from the [0,1] interval (default 21)
:param repeat: number of times each prevalence is to be repeated (defaults to 1)
:param smooth_limits_epsilon: the quantity to add and subtract to the limits 0 and 1
:return: an array of uniformly separated prevalence values
"""
p = np.linspace(0., 1., num=n_prevalences, endpoint=True)
p[0] += smooth_limits_epsilon
p[-1] -= smooth_limits_epsilon
if p[0] > p[1]:
raise ValueError(f'the smoothing in the limits is greater than the prevalence step')
if repeat > 1:
p = np.repeat(p, repeat)
return p
def prevalence_from_labels(labels, n_classes):
unique, counts = np.unique(labels, return_counts=True)
by_class = defaultdict(lambda:0, dict(zip(unique, counts)))
prevalences = np.asarray([by_class[ci] for ci in range(n_classes)], dtype=np.float)
prevalences /= prevalences.sum()
return prevalences
def prevalence_from_probabilities(posteriors, binarize: bool = False):
if binarize:
predictions = np.argmax(posteriors, axis=-1)
return prevalence_from_labels(predictions, n_classes=posteriors.shape[1])
else:
prevalences = posteriors.mean(axis=0)
prevalences /= prevalences.sum()
return prevalences
def strprev(prevalences, prec=3):
return '['+ ', '.join([f'{p:.{prec}f}' for p in prevalences]) + ']'
def adjusted_quantification(prevalence_estim, tpr, fpr, clip=True):
den = tpr - fpr
if den == 0:
den += 1e-8
adjusted = (prevalence_estim - fpr) / den
if clip:
adjusted = np.clip(adjusted, 0., 1.)
return adjusted
def normalize_prevalence(prevalences):
assert prevalences.ndim==1, 'unexpected shape'
accum = prevalences.sum()
if accum > 0:
return prevalences / accum
else:
# if all classifiers are trivial rejectors
return np.ones_like(prevalences) / prevalences.size
def num_prevalence_combinations(nclasses:int, nprevpoints:int, nrepeats:int):
"""
Computes the number of prevalence combinations in the nclasses-dimensional simplex if nprevpoints equally distant
prevalences are generated and nrepeats repetitions are requested
:param nclasses: number of classes
:param nprevpoints: number of prevalence points.
:param nrepeats: number of repetitions for each prevalence combination
:return: The number of possible combinations. For example, if nclasses=2, nprevpoints=5, nrepeats=1, then the number
of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0.75,0.25], and [1.0,0.0]
"""
__cache={}
def __f(nc,np):
if (nc,np) in __cache:
return __cache[(nc,np)]
if nc==1:
return 1
else:
x = sum([__f(nc-1, np-i) for i in range(np)])
__cache[(nc,np)] = x
return x
return __f(nclasses, nprevpoints) * nrepeats
def get_nprevpoints_approximation(nclasses, nrepeats, combinations_budget):
"""
Searches for the largest number of (equidistant) prevalence points to define for each of the nclasses classe so that
the number of valid prevalences generated as combinations of prevalence points (points in a nclasses-dimensional
simplex) do not exceed combinations_budget.
:param nclasses: number of classes
:param nrepeats: number of repetitions for each prevalence combination
:param combinations_budget: maximum number of combinatios allowed
:return: the largest number of prevalence points that generate less than combinations_budget valid prevalences
"""
assert nclasses>0 and nrepeats>0 and combinations_budget>0, 'parameters must be positive integers'
nprevpoints = 1
while True:
combinations = num_prevalence_combinations(nclasses, nprevpoints, nrepeats)
if combinations > combinations_budget:
return nprevpoints-1
else:
nprevpoints+=1