added DistributionMatching, a generic distribution-matching method for multiclass quantification problems that takes the divergence and the number of bins as hyperparameters

This commit is contained in:
Alejandro Moreo Fernandez 2023-01-31 15:08:58 +01:00
parent f9a199d859
commit ceb88792c5
2 changed files with 136 additions and 38 deletions

View File

@@ -25,9 +25,9 @@
- cross_val_predict (for quantification) added to model_selection: would be nice to allow the user to specify a
test protocol, or None for bypassing it?
- I think Pablo added DyS, Topsoe distance and binary search.
- DyS, Topsoe distance and binary search (thanks to Pablo González)
- I think Pablo added multi-thread reproducibility.
- Multi-thread reproducibility via seeding (thanks to Pablo González)
- Bugfix: adding two labelled collections (with +) now checks for consistency in the classes
@@ -40,8 +40,16 @@
- the internal classifier of aggregative methods is now called "classifier" instead of "learner"
- when optimizing the hyperparameters of an aggregative quantifier, the classifier's specific hyperparameters
should be marked with a "classifier__" prefix (just like in scikit-learn), while the quantifier's specific
hyperparameters are named directly. For example, PCC(LogisticRegression()) quantifier has
should be marked with a "classifier__" prefix (just like in scikit-learn with estimators), while the quantifier's
specific hyperparameters are named directly. For example, PCC(LogisticRegression()) quantifier has hyperparameters
"classifier__C", "classifier__class_weight", etc., instead of "C" and "class_weight" as in v0.1.6.
- hyperparameter combinations leading to inconsistent runs now raise a ValueError exception, while combinations
leading to internal errors of surrogate functions are reported and skipped, without stopping the grid search.
- DistributionMatching method added. This is a general framework for distribution-matching methods that caters for
multiclass quantification. That is to say, one can obtain a multiclass variant of the (originally binary) HDy
method aligned with Firat's formulation (see the usage sketch below).
Things to fix:
- calibration with recalibration methods has to be fixed for exact_train_prev in EMQ (conflicts with clone, deepcopy, etc.)
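A minimal usage sketch tying together the hyperparameter naming scheme and the new DistributionMatching method (editorial illustration, not part of the commit; the dataset 'hcr' and all parameter values are arbitrary example choices):

import quapy as qp
from quapy.method.aggregative import DistributionMatching
from sklearn.linear_model import LogisticRegression

# any multiclass dataset serves; the twitter 'hcr' dataset (3 classes) is one example
data = qp.datasets.fetch_twitter('hcr')
train, test = data.training, data.test

# Hellinger Distance over 8 bins per class-posterior channel
dm = DistributionMatching(LogisticRegression(), nbins=8, divergence='HD')
dm.fit(train)
estim_prevalence = dm.quantify(test.instances)

# classifier-specific hyperparameters take the "classifier__" prefix,
# quantifier-specific ones (e.g., nbins) are named directly
param_grid = {'classifier__C': [0.1, 1, 10], 'nbins': [4, 8, 16]}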

View File

@@ -3,6 +3,7 @@ from copy import deepcopy
from typing import Callable, Union
import numpy as np
from joblib import Parallel, delayed
from scipy import optimize
from sklearn.base import BaseEstimator, clone
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix
@@ -10,8 +11,7 @@ from sklearn.model_selection import StratifiedKFold, cross_val_predict
from tqdm import tqdm
import quapy as qp
import quapy.functional as F
from classification.calibration import RecalibratedProbabilisticClassifier, NBVSCalibration, BCTSCalibration, TSCalibration, \
VSCalibration
from classification.calibration import NBVSCalibration, BCTSCalibration, TSCalibration, VSCalibration
from quapy.classification.svmperf import SVMperf
from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier, BinaryQuantifier
@@ -91,25 +91,6 @@ class AggregativeQuantifier(BaseQuantifier):
"""
...
# def get_params(self, deep=True):
# """
# Return the current parameters of the quantifier.
#
# :param deep: for compatibility with sklearn
# :return: a dictionary of param-value pairs
# """
#
# return self.learner.get_params()
# def set_params(self, **parameters):
# """
# Set the parameters of the quantifier.
#
# :param parameters: dictionary of param-value pairs
# """
#
# self.learner.set_params(**parameters)
@property
def classes_(self):
"""
@@ -690,16 +671,16 @@ class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier):
:param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out
validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself).
:param n_bins: an int with the number of bins to use to compute the histograms.
:param distance: an str with a distance already included in the librar (HD or topsoe), of a function
that computes the distance between two distributions.
:param divergence: a str indicating the name of the divergence (currently supported ones are "HD" or "topsoe"), or a
callable function that computes the divergence between two distributions (two equally sized arrays).
:param tol: a float with the tolerance for the ternary search algorithm.
"""
def __init__(self, classifier: BaseEstimator, val_split=0.4, n_bins=8, distance: Union[str, Callable]='HD', tol=1e-05):
def __init__(self, classifier: BaseEstimator, val_split=0.4, n_bins=8, divergence: Union[str, Callable]='HD', tol=1e-05):
self.classifier = classifier
self.val_split = val_split
self.tol = tol
self.distance = distance
self.divergence = divergence
self.n_bins = n_bins
def _ternary_search(self, f, left, right, tol):
@@ -718,14 +699,6 @@ class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier):
# Left and right are the current bounds; the maximum is between them
return (left + right) / 2
def _compute_distance(self, Px_train, Px_test, distance: Union[str, Callable]='HD'):
if distance == 'HD':
return F.HellingerDistance(Px_train, Px_test)
elif distance == 'topsoe':
return F.TopsoeDistance(Px_train, Px_test)
else:
return distance(Px_train, Px_test)
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
if val_split is None:
val_split = self.val_split
@@ -744,10 +717,11 @@
Px = classif_posteriors[:, 1] # takes only the P(y=+1|x)
Px_test = np.histogram(Px, bins=self.n_bins, range=(0, 1), density=True)[0]
divergence = _get_divergence(self.divergence)
def distribution_distance(prev):
Px_train = prev * self.Pxy1_density + (1 - prev) * self.Pxy0_density
return self._compute_distance(Px_train,Px_test,self.distance)
return divergence(Px_train, Px_test)
class1_prev = self._ternary_search(f=distribution_distance, left=0, right=1, tol=self.tol)
return np.asarray([1 - class1_prev, class1_prev])
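# Note (editorial sketch, not part of the commit): DyS models the binned test
# distribution of P(y=1|x) as a two-component mixture of the class-conditional
# training distributions,
#     Px_train(prev) = prev * Pxy1_density + (1 - prev) * Pxy0_density,
# and ternary-searches prev in [0,1] for the minimum divergence, which assumes
# the divergence is unimodal in prev.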
@@ -791,6 +765,122 @@ class SMM(AggregativeProbabilisticQuantifier, BinaryQuantifier):
return np.asarray([1 - class1_prev, class1_prev])
def _get_divergence(divergence: Union[str, Callable]):
if isinstance(divergence, str):
if divergence=='HD':
return F.HellingerDistance
elif divergence=='topsoe':
return F.TopsoeDistance
else:
raise ValueError(f'unknown divergence {divergence}')
elif callable(divergence):
return divergence
else:
raise ValueError(f'argument "divergence" not understood; use a str or a callable function')
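# Example (editorial sketch, not part of the commit): besides the built-in
# "HD" and "topsoe" names, any callable over two equally sized arrays can be
# plugged in, e.g. a total variation distance:
#     tv = lambda p, q: 0.5 * np.abs(p - q).sum()
#     dm = DistributionMatching(LogisticRegression(), divergence=tv)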
class DistributionMatching(AggregativeProbabilisticQuantifier):
"""
Generic Distribution Matching quantifier for binary or multiclass quantification.
This implementation takes the number of bins, the divergence, and whether to work on CDFs instead of PDFs as hyperparameters.
:param classifier: a scikit-learn estimator that generates a probabilistic classifier
:param val_split: indicates the proportion of data to be used as a stratified held-out validation set to model the
validation distribution.
This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
validation data, or as an integer, indicating that the validation distribution should be estimated via
`k`-fold cross validation (this integer stands for the number of folds `k`), or as a
:class:`quapy.data.base.LabelledCollection` (the split itself).
:param nbins: number of bins used to discretize the distributions (default 8)
:param divergence: a string representing a divergence measure (currently, "HD" and "topsoe" are implemented)
or a callable function taking two ndarrays of the same dimension as input (default "HD", meaning Hellinger
Distance)
:param cdf: whether or not to use CDF instead of PDF (default False)
:param n_jobs: number of parallel workers (default None)
"""
def __init__(self, classifier, val_split=0.4, nbins=8, divergence: Union[str, Callable]='HD', cdf=False, n_jobs=None):
self.classifier = classifier
self.val_split = val_split
self.nbins = nbins
self.divergence = divergence
self.cdf = cdf
self.n_jobs = n_jobs
def __get_distributions(self, posteriors):
histograms = []
post_dims = posteriors.shape[1]
if post_dims == 2:
# in binary quantification we can use only one class, since the other one is its complement
post_dims = 1
for dim in range(post_dims):
hist = np.histogram(posteriors[:, dim], bins=self.nbins, range=(0, 1))[0]
histograms.append(hist)
counts = np.vstack(histograms)
distributions = counts/counts.sum(axis=1)[:,np.newaxis]
if self.cdf:
distributions = np.cumsum(distributions, axis=1)
return distributions
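# Shape walk-through (editorial sketch, not part of the commit): for m
# instances in a 3-class problem, `posteriors` has shape (m, 3); one
# nbins-histogram is computed per class-posterior channel, giving a (3, nbins)
# matrix whose rows sum to 1. In the binary case a single channel (column 0)
# suffices, since the two posteriors are complementary. With cdf=True the rows
# are replaced by their cumulative sums.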
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
"""
Trains the classifier (if requested) and generates the validation distributions out of the training data.
The validation distributions have shape `(n, ch, nbins)`, with `n` the number of classes, `ch` the number of
channels, and `nbins` the number of bins. In particular, let `V` be the validation distributions; `di=V[i]`
are the distributions obtained from training data labelled with class `i`; `dij = di[j]` is the discrete
distribution of posterior probabilities `P(Y=j|X=x)` for training data labelled with class `i`, and `dij[k]`
is the fraction of instances with a value in the `k`-th bin.
:param data: the training set
:param fit_classifier: set to False to bypass the training (the classifier is assumed to be already fit)
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
to estimate the parameters
"""
if val_split is None:
val_split = self.val_split
self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
)
self.validation_distribution = np.asarray(
[self.__get_distributions(posteriors[y==cat]) for cat in range(data.n_classes)]
)
return self
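# e.g. (editorial note) for 3 classes with 8 bins, validation_distribution has
# shape (3, 3, 8): validation_distribution[i, j] is the binned distribution of
# P(Y=j|X) among validation instances whose true class is i.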
def aggregate(self, posteriors: np.ndarray):
"""
Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution
(the mixture) that best matches the test distribution, in terms of the divergence measure of choice.
In the multiclass case, with `n` the number of classes, the test and mixture distributions contain
`n` channels (proper distributions of binned posterior probabilities), on which the divergence is computed
independently. The matching is computed as an average of the divergence across all channels.
:param posteriors: posterior probabilities of the instances in the sample
:return: a vector of class prevalence estimates
"""
test_distribution = self.__get_distributions(posteriors)
divergence = _get_divergence(self.divergence)
n_classes, n_channels, nbins = self.validation_distribution.shape
def match(prev):
prev = np.expand_dims(prev, axis=0)
mixture_distribution = (prev @ self.validation_distribution.reshape(n_classes,-1)).reshape(n_channels, -1)
divs = [divergence(test_distribution[ch], mixture_distribution[ch]) for ch in range(n_channels)]
return np.mean(divs)
# the initial point is set as the uniform distribution
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
# solutions are bounded to those contained in the unit-simplex
bounds = tuple((0, 1) for x in range(n_classes)) # values in [0,1]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
return r.x
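# Worked sketch of `match` (editorial, not part of the commit): with V =
# validation_distribution of shape (n_classes, n_channels, nbins), a candidate
# prevalence vector prev yields, per channel ch, the mixture
#     M[ch] = sum_i prev[i] * V[i, ch]
# (this is what the reshaped matrix product above computes); the objective is
# the mean divergence between M[ch] and the test histogram of channel ch,
# minimized by SLSQP under the simplex constraints (bounds + sum-to-1).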
class ELM(AggregativeQuantifier, BinaryQuantifier):
"""