forked from moreo/QuaPy
adding DistributionMatchingX, the covariate-space counterpart of DistributionMatching
This commit is contained in: parent 76cf784844, commit c3cf0e2d49
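For orientation, a minimal usage sketch of the new quantifier, assembled from the constructor, fit, and quantify signatures added in this commit. The import path and the synthetic data are assumptions (the diff does not show file names, only that DistributionMatchingX is added next to HDx); the rest follows the added code.

import numpy as np
from quapy.data import LabelledCollection
from quapy.method.non_aggregative import DistributionMatchingX  # assumed module path

# synthetic binary data: two Gaussian blobs over 5 covariates
rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, size=(500, 5)), rng.normal(1, 1, size=(500, 5))])
y = np.array([0] * 500 + [1] * 500)
train = LabelledCollection(X, y)

dmx = DistributionMatchingX(nbins=8, divergence='HD', cdf=False)
dmx.fit(train)

# a test sample with ~30% positives; quantify returns a class-prevalence vector
X_test = np.vstack([rng.normal(0, 1, size=(700, 5)), rng.normal(1, 1, size=(300, 5))])
print(dmx.quantify(X_test))  # expected to be roughly [0.7, 0.3]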
@@ -21,7 +21,7 @@ See <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ for
 qp.environ['SAMPLE_SIZE']=100


-df = pd.DataFrame(columns=('method', 'dataset', 'MAE', 'MRAE', 'tr-time', 'te-time'))
+df = pd.DataFrame(columns=['method', 'dataset', 'MAE', 'MRAE', 'tr-time', 'te-time'])


 for dataset_name in tqdm(qp.datasets.UCI_DATASETS, total=len(qp.datasets.UCI_DATASETS)):
@@ -1,5 +1,7 @@
 import itertools
 from collections import defaultdict
+from typing import Union, Callable
+
 import scipy
 import numpy as np

@@ -276,3 +278,16 @@ def check_prevalence_vector(p, raise_exception=False, toleranze=1e-08):
         return False
     return True

+
+def get_divergence(divergence: Union[str, Callable]):
+    if isinstance(divergence, str):
+        if divergence=='HD':
+            return HellingerDistance
+        elif divergence=='topsoe':
+            return TopsoeDistance
+        else:
+            raise ValueError(f'unknown divergence {divergence}')
+    elif callable(divergence):
+        return divergence
+    else:
+        raise ValueError(f'argument "divergence" not understood; use a str or a callable function')
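get_divergence only resolves a name to one of the divergence functions already defined in this module, or passes a callable through unchanged. A small sketch of how the resolved function is used on two binned, normalized distributions (illustrative, with this commit applied; the custom callable is an arbitrary example, not part of the commit):

import numpy as np
import quapy.functional as F

p = np.asarray([0.2, 0.5, 0.3])
q = np.asarray([0.3, 0.4, 0.3])

hd = F.get_divergence('HD')  # resolves to F.HellingerDistance
print(hd(p, q))

tv = F.get_divergence(lambda a, b: 0.5 * np.abs(a - b).sum())  # custom callables are returned as-is
print(tv(p, q))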
@@ -9,6 +9,7 @@ from sklearn.metrics import confusion_matrix
 from sklearn.model_selection import cross_val_predict
 import quapy as qp
 import quapy.functional as F
+from functional import get_divergence
 from quapy.classification.calibration import NBVSCalibration, BCTSCalibration, TSCalibration, VSCalibration
 from quapy.classification.svmperf import SVMperf
 from quapy.data import LabelledCollection
@@ -605,20 +606,6 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
         return np.asarray([1 - class1_prev, class1_prev])


-def _get_divergence(divergence: Union[str, Callable]):
-    if isinstance(divergence, str):
-        if divergence=='HD':
-            return F.HellingerDistance
-        elif divergence=='topsoe':
-            return F.TopsoeDistance
-        else:
-            raise ValueError(f'unknown divergence {divergence}')
-    elif callable(divergence):
-        return divergence
-    else:
-        raise ValueError(f'argument "divergence" not understood; use a str or a callable function')
-
-
 class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier):
     """
     `DyS framework <https://ojs.aaai.org/index.php/AAAI/article/view/4376>`_ (DyS).
@@ -676,7 +663,7 @@ class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier):
         Px = classif_posteriors[:, 1]  # takes only the P(y=+1|x)

         Px_test = np.histogram(Px, bins=self.n_bins, range=(0, 1), density=True)[0]
-        divergence = _get_divergence(self.divergence)
+        divergence = get_divergence(self.divergence)

         def distribution_distance(prev):
             Px_train = prev * self.Pxy1_density + (1 - prev) * self.Pxy0_density
@@ -727,8 +714,9 @@ class SMM(AggregativeProbabilisticQuantifier, BinaryQuantifier):

 class DistributionMatching(AggregativeProbabilisticQuantifier):
     """
-    Generic Distribution Matching quantifier for binary or multiclass quantification.
-    This implementation takes the number of bins, the divergence, and the possibility to work on CDF as hyperparameters.
+    Generic Distribution Matching quantifier for binary or multiclass quantification based on the space of posterior
+    probabilities. This implementation takes the number of bins, the divergence, and the possibility to work on CDF
+    as hyperparameters.

     :param classifier: a `sklearn`'s Estimator that generates a probabilistic classifier
     :param val_split: indicates the proportion of data to be used as a stratified held-out validation set to model the
@@ -741,7 +729,7 @@ class DistributionMatching(AggregativeProbabilisticQuantifier):
     :param divergence: a string representing a divergence measure (currently, "HD" and "topsoe" are implemented)
         or a callable function taking two ndarrays of the same dimension as input (default "HD", meaning Hellinger
        Distance)
-    :param cdf: whether or not to use CDF instead of PDF (default False)
+    :param cdf: whether to use CDF instead of PDF (default False)
     :param n_jobs: number of parallel workers (default None)
     """

@@ -773,8 +761,8 @@ class DistributionMatching(AggregativeProbabilisticQuantifier):
         """
         Trains the classifier (if requested) and generates the validation distributions out of the training data.
         The validation distributions have shape `(n, ch, nbins)`, with `n` the number of classes, `ch` the number of
-        channels, and `nbins` the number of bins. In particular, let `V` be the validation distributions; `di=V[i]`
-        are the distributions obtained from training data labelled with class `i`; `dij = di[j]` is the discrete
+        channels, and `nbins` the number of bins. In particular, let `V` be the validation distributions; then `di=V[i]`
+        are the distributions obtained from training data labelled with class `i`; while `dij = di[j]` is the discrete
         distribution of posterior probabilities `P(Y=j|X=x)` for training data labelled with class `i`, and `dij[k]`
         is the fraction of instances with a value in the `k`-th bin.

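To make the `(n, ch, nbins)` layout concrete, here is a small sketch (names and data are illustrative, not from the library) of how such a tensor relates to classifier posteriors in a 3-class problem:

import numpy as np

n_classes, nbins = 3, 8
posteriors = np.random.dirichlet(np.ones(n_classes), size=1000)  # rows of P(Y=j|x)
y = np.random.randint(n_classes, size=1000)                      # true labels

V = np.stack([
    np.stack([np.histogram(posteriors[y == i][:, j], bins=nbins, range=(0, 1), density=True)[0]
              for j in range(n_classes)])
    for i in range(n_classes)
])
print(V.shape)  # (3, 3, 8): V[i, j] is the histogram of P(Y=j|x) among class-i instances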
@@ -810,7 +798,7 @@ class DistributionMatching(AggregativeProbabilisticQuantifier):
         :return: a vector of class prevalence estimates
         """
         test_distribution = self.__get_distributions(posteriors)
-        divergence = _get_divergence(self.divergence)
+        divergence = get_divergence(self.divergence)
         n_classes, n_channels, nbins = self.validation_distribution.shape
         def match(prev):
             prev = np.expand_dims(prev, axis=0)
@@ -1,8 +1,14 @@
+from typing import Union, Callable
+
 import numpy as np
+from scipy import optimize
+
+from functional import get_divergence

 from quapy.data import LabelledCollection
 from quapy.method.base import BaseQuantifier, BinaryQuantifier
 import quapy.functional as F


 class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
     """
     The `Maximum Likelihood Prevalence Estimation` (MLPE) method is a lazy method that assumes there is no prior
@@ -35,6 +41,7 @@ class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
         return self.estimated_prevalence


+
 class HDx(BinaryQuantifier):
     """
     `Hellinger Distance x <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDx).
@@ -49,19 +56,11 @@ class HDx(BinaryQuantifier):
     def __init__(self):
         self.feat_ranges = None

-    def get_features_range(self, X):
-        feat_ranges = []
-        ncols = X.shape[1]
-        for col_idx in range(ncols):
-            feature = X[:,col_idx]
-            feat_ranges.append((np.min(feature), np.max(feature)))
-        return feat_ranges
-
     def covariate_histograms(self, X, nbins):
         assert self.feat_ranges is not None, 'quantify called before fit'

         histograms = []
-        for col_idx in range(self.ncols):
+        for col_idx in range(self.nfeats):
             feature = X[:,col_idx]
             feat_range = self.feat_ranges[col_idx]
             histograms.append(np.histogram(feature, bins=nbins, range=feat_range, density=True)[0])
@@ -79,8 +78,8 @@ class HDx(BinaryQuantifier):
         self._check_binary(data, self.__class__.__name__)
         X, y = data.Xy

-        self.ncols = X.shape[1]
-        self.feat_ranges = self.get_features_range(X)
+        self.nfeats = X.shape[1]
+        self.feat_ranges = _get_features_range(X)

         # pre-compute the representation for positive and negative examples
         self.bins = np.linspace(10, 110, 11, dtype=int)  # [10, 20, 30, ..., 100, 110]
@@ -93,7 +92,7 @@ class HDx(BinaryQuantifier):
         # and the final estimated a priori probability was taken as the median of these 11 estimates."
         # (González-Castro, et al., 2013).

-        assert X.shape[1] == self.ncols, f'wrong shape in quantify; expected {self.ncols}, found {X.shape[1]}'
+        assert X.shape[1] == self.nfeats, f'wrong shape in quantify; expected {self.nfeats}, found {X.shape[1]}'

         prev_estimations = []
         for nbins in self.bins:
|
||||||
prev_selected, min_dist = None, None
|
prev_selected, min_dist = None, None
|
||||||
for prev in F.prevalence_linspace(n_prevalences=100, repeats=1, smooth_limits_epsilon=0.0):
|
for prev in F.prevalence_linspace(n_prevalences=100, repeats=1, smooth_limits_epsilon=0.0):
|
||||||
Hx = prev * H1 + (1 - prev) * H0
|
Hx = prev * H1 + (1 - prev) * H0
|
||||||
hdx = np.mean([F.HellingerDistance(Hx[:,col], Ht[:,col]) for col in range(self.ncols)])
|
hdx = np.mean([F.HellingerDistance(Hx[:,col], Ht[:,col]) for col in range(self.nfeats)])
|
||||||
|
|
||||||
if prev_selected is None or hdx < min_dist:
|
if prev_selected is None or hdx < min_dist:
|
||||||
prev_selected, min_dist = prev, hdx
|
prev_selected, min_dist = prev, hdx
|
||||||
prev_estimations.append(prev_selected)
|
prev_estimations.append(prev_selected)
|
||||||
|
|
||||||
class1_prev = np.median(prev_estimations)
|
class1_prev = np.median(prev_estimations)
|
||||||
return np.asarray([1 - class1_prev, class1_prev])
|
return np.asarray([1 - class1_prev, class1_prev])
|
||||||
|
|
||||||
|
|
||||||
|
class DistributionMatchingX(BaseQuantifier):
|
||||||
|
"""
|
||||||
|
Generic Distribution Matching quantifier for binary or multiclass quantification based on the space of covariates.
|
||||||
|
This implementation takes the number of bins, the divergence, and the possibility to work on CDF as hyperparameters.
|
||||||
|
|
||||||
|
:param nbins: number of bins used to discretize the distributions (default 8)
|
||||||
|
:param divergence: a string representing a divergence measure (currently, "HD" and "topsoe" are implemented)
|
||||||
|
or a callable function taking two ndarrays of the same dimension as input (default "HD", meaning Hellinger
|
||||||
|
Distance)
|
||||||
|
:param cdf: whether to use CDF instead of PDF (default False)
|
||||||
|
:param n_jobs: number of parallel workers (default None)
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, nbins=8, divergence: Union[str, Callable]='HD', cdf=False, n_jobs=None):
|
||||||
|
self.nbins = nbins
|
||||||
|
self.divergence = divergence
|
||||||
|
self.cdf = cdf
|
||||||
|
self.n_jobs = n_jobs
|
||||||
|
|
||||||
|
def __get_distributions(self, X):
|
||||||
|
histograms = []
|
||||||
|
for feat_idx in range(self.nfeats):
|
||||||
|
hist = np.histogram(X[:, feat_idx], bins=self.nbins, density=True, range=self.feat_ranges[feat_idx])[0]
|
||||||
|
histograms.append(hist)
|
||||||
|
|
||||||
|
distributions = np.vstack(histograms)
|
||||||
|
if self.cdf:
|
||||||
|
distributions = np.cumsum(distributions, axis=1)
|
||||||
|
return distributions
|
||||||
|
|
||||||
|
def fit(self, data: LabelledCollection):
|
||||||
|
"""
|
||||||
|
Generates the validation distributions out of the training data (covariates).
|
||||||
|
The validation distributions have shape `(n, nfeats, nbins)`, with `n` the number of classes, `nfeats`
|
||||||
|
the number of features, and `nbins` the number of bins.
|
||||||
|
In particular, let `V` be the validation distributions; then `di=V[i]` are the distributions obtained from
|
||||||
|
training data labelled with class `i`; while `dij = di[j]` is the discrete distribution for feature j in
|
||||||
|
training data labelled with class `i`, and `dij[k]` is the fraction of instances with a value in the `k`-th bin.
|
||||||
|
|
||||||
|
:param data: the training set
|
||||||
|
"""
|
||||||
|
X, y = data.Xy
|
||||||
|
|
||||||
|
self.nfeats = X.shape[1]
|
||||||
|
self.feat_ranges = _get_features_range(X)
|
||||||
|
|
||||||
|
self.validation_distribution = np.asarray(
|
||||||
|
[self.__get_distributions(X[y==cat]) for cat in range(data.n_classes)]
|
||||||
|
)
|
||||||
|
|
||||||
|
return self
|
||||||
|
|
||||||
|
def quantify(self, instances):
|
||||||
|
"""
|
||||||
|
Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution
|
||||||
|
(the mixture) that best matches the test distribution, in terms of the divergence measure of choice.
|
||||||
|
The matching is computed as the average dissimilarity (in terms of the dissimilarity measure of choice)
|
||||||
|
between all feature-specific discrete distributions.
|
||||||
|
|
||||||
|
:param instances: instances in the sample
|
||||||
|
:return: a vector of class prevalence estimates
|
||||||
|
"""
|
||||||
|
|
||||||
|
assert instances.shape[1] == self.nfeats, f'wrong shape; expected {self.nfeats}, found {instances.shape[1]}'
|
||||||
|
|
||||||
|
test_distribution = self.__get_distributions(instances)
|
||||||
|
divergence = get_divergence(self.divergence)
|
||||||
|
n_classes, n_feats, nbins = self.validation_distribution.shape
|
||||||
|
def match(prev):
|
||||||
|
prev = np.expand_dims(prev, axis=0)
|
||||||
|
mixture_distribution = (prev @ self.validation_distribution.reshape(n_classes,-1)).reshape(n_feats, -1)
|
||||||
|
divs = [divergence(test_distribution[feat], mixture_distribution[feat]) for feat in range(n_feats)]
|
||||||
|
return np.mean(divs)
|
||||||
|
|
||||||
|
# the initial point is set as the uniform distribution
|
||||||
|
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
|
||||||
|
|
||||||
|
# solutions are bounded to those contained in the unit-simplex
|
||||||
|
bounds = tuple((0, 1) for x in range(n_classes)) # values in [0,1]
|
||||||
|
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
|
||||||
|
r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
|
||||||
|
return r.x
|
||||||
|
|
||||||
|
|
||||||
|
def _get_features_range(X):
|
||||||
|
feat_ranges = []
|
||||||
|
ncols = X.shape[1]
|
||||||
|
for col_idx in range(ncols):
|
||||||
|
feature = X[:,col_idx]
|
||||||
|
feat_ranges.append((np.min(feature), np.max(feature)))
|
||||||
|
return feat_ranges
|
|
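The match objective above mixes the per-class validation distributions with the candidate prevalence vector through a reshape-and-matmul trick. A tiny numeric sketch (illustrative only, not part of the commit) showing it is equivalent to an explicit per-class mixture:

import numpy as np

n_classes, n_feats, nbins = 2, 3, 4
V = np.random.rand(n_classes, n_feats, nbins)  # stands in for self.validation_distribution
prev = np.asarray([0.3, 0.7])                  # candidate prevalence vector

mix_flat = (prev[np.newaxis, :] @ V.reshape(n_classes, -1)).reshape(n_feats, nbins)
mix_explicit = np.einsum('i,ijk->jk', prev, V)  # sum_i prev[i] * V[i]
assert np.allclose(mix_flat, mix_explicit)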
@@ -236,7 +236,7 @@ class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
             raise RuntimeError(
                 f"Abort: the number of samples that will be generated by {self.__class__.__name__} ({n}) "
                 f"exceeds the maximum number of allowed samples ({sanity_check = }). Set 'sanity_check' to "
-                f"None for bypassing this check, or to a higher number.")
+                f"None, or to a higher number, for bypassing this check.")

         self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
