added DistributionMatching method, a generic model for distribution matching for multiclass quantification problems that takes the divergence and number of bins as hyperparameters
This commit is contained in:
parent
f9a199d859
commit
ceb88792c5
|
@ -25,9 +25,9 @@
|
||||||
- cross_val_predict (for quantification) added to model_selection: would be nice to allow the user specifies a
|
- cross_val_predict (for quantification) added to model_selection: would be nice to allow the user specifies a
|
||||||
test protocol maybe, or None for bypassing it?
|
test protocol maybe, or None for bypassing it?
|
||||||
|
|
||||||
- I think Pablo added DyS, Topsoe distance and binary search.
|
- DyS, Topsoe distance and binary search (thanks to Pablo González)
|
||||||
|
|
||||||
- I think Pablo added multi-thread reproducibility.
|
- Multi-thread reproducibility via seeding (thanks to Pablo González)
|
||||||
|
|
||||||
- Bugfix: adding two labelled collections (with +) now checks for consistency in the classes
|
- Bugfix: adding two labelled collections (with +) now checks for consistency in the classes
|
||||||
|
|
||||||
|
@ -40,8 +40,16 @@
|
||||||
- the internal classifier of aggregative methods is now called "classifier" instead of "learner"
|
- the internal classifier of aggregative methods is now called "classifier" instead of "learner"
|
||||||
|
|
||||||
- when optimizing the hyperparameters of an aggregative quantifier, the classifier's specific hyperparameters
|
- when optimizing the hyperparameters of an aggregative quantifier, the classifier's specific hyperparameters
|
||||||
should be marked with a "classifier__" prefix (just like in scikit-learn), while the quantifier's specific
|
should be marked with a "classifier__" prefix (just like in scikit-learn with estimators), while the quantifier's
|
||||||
hyperparameters are named directly. For example, PCC(LogisticRegression()) quantifier has
|
specific hyperparameters are named directly. For example, PCC(LogisticRegression()) quantifier has hyperparameters
|
||||||
|
"classifier__C", "classifier__class_weight", etc., instead of "C" and "class_weight" as in v0.1.6.
|
||||||
|
|
||||||
|
- hyperparameters yielding to inconsistent runs raise a ValueError exception, while hyperparameter combinations
|
||||||
|
yielding to internal errors of surrogate functions are reported and skipped, without stopping the grid search.
|
||||||
|
|
||||||
|
- DistributionMatching methods added. This is a general framework for distribution matching methods that catters for
|
||||||
|
multiclass quantification. That is to say, one could get a multiclass variant of the (originally binary) HDy
|
||||||
|
method aligned with the Firat's formulation.
|
||||||
|
|
||||||
Things to fix:
|
Things to fix:
|
||||||
- calibration with recalibration methods has to be fixed for exact_train_prev in EMQ (conflicts with clone, deepcopy, etc.)
|
- calibration with recalibration methods has to be fixed for exact_train_prev in EMQ (conflicts with clone, deepcopy, etc.)
|
||||||
|
|
|
@ -3,6 +3,7 @@ from copy import deepcopy
|
||||||
from typing import Callable, Union
|
from typing import Callable, Union
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from joblib import Parallel, delayed
|
from joblib import Parallel, delayed
|
||||||
|
from scipy import optimize
|
||||||
from sklearn.base import BaseEstimator, clone
|
from sklearn.base import BaseEstimator, clone
|
||||||
from sklearn.calibration import CalibratedClassifierCV
|
from sklearn.calibration import CalibratedClassifierCV
|
||||||
from sklearn.metrics import confusion_matrix
|
from sklearn.metrics import confusion_matrix
|
||||||
|
@ -10,8 +11,7 @@ from sklearn.model_selection import StratifiedKFold, cross_val_predict
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
import quapy as qp
|
import quapy as qp
|
||||||
import quapy.functional as F
|
import quapy.functional as F
|
||||||
from classification.calibration import RecalibratedProbabilisticClassifier, NBVSCalibration, BCTSCalibration, TSCalibration, \
|
from classification.calibration import NBVSCalibration, BCTSCalibration, TSCalibration, VSCalibration
|
||||||
VSCalibration
|
|
||||||
from quapy.classification.svmperf import SVMperf
|
from quapy.classification.svmperf import SVMperf
|
||||||
from quapy.data import LabelledCollection
|
from quapy.data import LabelledCollection
|
||||||
from quapy.method.base import BaseQuantifier, BinaryQuantifier
|
from quapy.method.base import BaseQuantifier, BinaryQuantifier
|
||||||
|
@ -91,25 +91,6 @@ class AggregativeQuantifier(BaseQuantifier):
|
||||||
"""
|
"""
|
||||||
...
|
...
|
||||||
|
|
||||||
# def get_params(self, deep=True):
|
|
||||||
# """
|
|
||||||
# Return the current parameters of the quantifier.
|
|
||||||
#
|
|
||||||
# :param deep: for compatibility with sklearn
|
|
||||||
# :return: a dictionary of param-value pairs
|
|
||||||
# """
|
|
||||||
#
|
|
||||||
# return self.learner.get_params()
|
|
||||||
|
|
||||||
# def set_params(self, **parameters):
|
|
||||||
# """
|
|
||||||
# Set the parameters of the quantifier.
|
|
||||||
#
|
|
||||||
# :param parameters: dictionary of param-value pairs
|
|
||||||
# """
|
|
||||||
#
|
|
||||||
# self.learner.set_params(**parameters)
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def classes_(self):
|
def classes_(self):
|
||||||
"""
|
"""
|
||||||
|
@ -690,16 +671,16 @@ class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier):
|
||||||
:param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out
|
:param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out
|
||||||
validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself).
|
validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself).
|
||||||
:param n_bins: an int with the number of bins to use to compute the histograms.
|
:param n_bins: an int with the number of bins to use to compute the histograms.
|
||||||
:param distance: an str with a distance already included in the librar (HD or topsoe), of a function
|
:param divergence: a str indicating the name of divergence (currently supported ones are "HD" or "topsoe"), or a
|
||||||
that computes the distance between two distributions.
|
callable function computes the divergence between two distributions (two equally sized arrays).
|
||||||
:param tol: a float with the tolerance for the ternary search algorithm.
|
:param tol: a float with the tolerance for the ternary search algorithm.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, classifier: BaseEstimator, val_split=0.4, n_bins=8, distance: Union[str, Callable]='HD', tol=1e-05):
|
def __init__(self, classifier: BaseEstimator, val_split=0.4, n_bins=8, divergence: Union[str, Callable]= 'HD', tol=1e-05):
|
||||||
self.classifier = classifier
|
self.classifier = classifier
|
||||||
self.val_split = val_split
|
self.val_split = val_split
|
||||||
self.tol = tol
|
self.tol = tol
|
||||||
self.distance = distance
|
self.divergence = divergence
|
||||||
self.n_bins = n_bins
|
self.n_bins = n_bins
|
||||||
|
|
||||||
def _ternary_search(self, f, left, right, tol):
|
def _ternary_search(self, f, left, right, tol):
|
||||||
|
@ -718,14 +699,6 @@ class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier):
|
||||||
# Left and right are the current bounds; the maximum is between them
|
# Left and right are the current bounds; the maximum is between them
|
||||||
return (left + right) / 2
|
return (left + right) / 2
|
||||||
|
|
||||||
def _compute_distance(self, Px_train, Px_test, distance: Union[str, Callable]='HD'):
|
|
||||||
if distance == 'HD':
|
|
||||||
return F.HellingerDistance(Px_train, Px_test)
|
|
||||||
elif distance == 'topsoe':
|
|
||||||
return F.TopsoeDistance(Px_train, Px_test)
|
|
||||||
else:
|
|
||||||
return distance(Px_train, Px_test)
|
|
||||||
|
|
||||||
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
|
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
|
||||||
if val_split is None:
|
if val_split is None:
|
||||||
val_split = self.val_split
|
val_split = self.val_split
|
||||||
|
@ -744,10 +717,11 @@ class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier):
|
||||||
Px = classif_posteriors[:, 1] # takes only the P(y=+1|x)
|
Px = classif_posteriors[:, 1] # takes only the P(y=+1|x)
|
||||||
|
|
||||||
Px_test = np.histogram(Px, bins=self.n_bins, range=(0, 1), density=True)[0]
|
Px_test = np.histogram(Px, bins=self.n_bins, range=(0, 1), density=True)[0]
|
||||||
|
divergence = _get_divergence(self.divergence)
|
||||||
|
|
||||||
def distribution_distance(prev):
|
def distribution_distance(prev):
|
||||||
Px_train = prev * self.Pxy1_density + (1 - prev) * self.Pxy0_density
|
Px_train = prev * self.Pxy1_density + (1 - prev) * self.Pxy0_density
|
||||||
return self._compute_distance(Px_train,Px_test,self.distance)
|
return divergence(Px_train, Px_test)
|
||||||
|
|
||||||
class1_prev = self._ternary_search(f=distribution_distance, left=0, right=1, tol=self.tol)
|
class1_prev = self._ternary_search(f=distribution_distance, left=0, right=1, tol=self.tol)
|
||||||
return np.asarray([1 - class1_prev, class1_prev])
|
return np.asarray([1 - class1_prev, class1_prev])
|
||||||
|
@ -791,6 +765,122 @@ class SMM(AggregativeProbabilisticQuantifier, BinaryQuantifier):
|
||||||
|
|
||||||
return np.asarray([1 - class1_prev, class1_prev])
|
return np.asarray([1 - class1_prev, class1_prev])
|
||||||
|
|
||||||
|
def _get_divergence(divergence: Union[str, Callable]):
|
||||||
|
if isinstance(divergence, str):
|
||||||
|
if divergence=='HD':
|
||||||
|
return F.HellingerDistance
|
||||||
|
elif divergence=='topsoe':
|
||||||
|
return F.TopsoeDistance
|
||||||
|
else:
|
||||||
|
raise ValueError(f'unknown divergence {divergence}')
|
||||||
|
elif callable(divergence):
|
||||||
|
return divergence
|
||||||
|
else:
|
||||||
|
raise ValueError(f'argument "divergence" not understood; use a str or a callable function')
|
||||||
|
|
||||||
|
class DistributionMatching(AggregativeProbabilisticQuantifier):
|
||||||
|
"""
|
||||||
|
Generic Distribution Matching quantifier for binary or multiclass quantification.
|
||||||
|
This implementation takes the number of bins, the divergence, and the possibility to work on CDF as hyperparameters.
|
||||||
|
|
||||||
|
:param classifier: a sklearn's Estimator that generates a probabilistic classifier
|
||||||
|
:param val_split: indicates the proportion of data to be used as a stratified held-out validation set to model the
|
||||||
|
validation distribution.
|
||||||
|
This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
|
||||||
|
validation data, or as an integer, indicating that the validation distribution should be estimated via
|
||||||
|
`k`-fold cross validation (this integer stands for the number of folds `k`), or as a
|
||||||
|
:class:`quapy.data.base.LabelledCollection` (the split itself).
|
||||||
|
:param nbins: number of bins used to discretize the distributions (default 8)
|
||||||
|
:param divergence: a string representing a divergence measure (currently, "HD" and "topsoe" are implemented)
|
||||||
|
or a callable function taking two ndarrays of the same dimension as input (default "HD", meaning Hellinger
|
||||||
|
Distance)
|
||||||
|
:param cdf: whether or not to use CDF instead of PDF (default False)
|
||||||
|
:param n_jobs: number of parallel workers (default None)
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, classifier, val_split=0.4, nbins=8, divergence: Union[str, Callable]='HD', cdf=False, n_jobs=None):
|
||||||
|
|
||||||
|
self.classifier = classifier
|
||||||
|
self.val_split = val_split
|
||||||
|
self.nbins = nbins
|
||||||
|
self.divergence = divergence
|
||||||
|
self.cdf = cdf
|
||||||
|
self.n_jobs = n_jobs
|
||||||
|
|
||||||
|
def __get_distributions(self, posteriors):
|
||||||
|
histograms = []
|
||||||
|
post_dims = posteriors.shape[1]
|
||||||
|
if post_dims == 2:
|
||||||
|
# in binary quantification we can use only one class, since the other one is its complement
|
||||||
|
post_dims = 1
|
||||||
|
for dim in range(post_dims):
|
||||||
|
hist = np.histogram(posteriors[:, dim], bins=self.nbins, range=(0, 1))[0]
|
||||||
|
histograms.append(hist)
|
||||||
|
|
||||||
|
counts = np.vstack(histograms)
|
||||||
|
distributions = counts/counts.sum(axis=1)[:,np.newaxis]
|
||||||
|
if self.cdf:
|
||||||
|
distributions = np.cumsum(distributions, axis=1)
|
||||||
|
return distributions
|
||||||
|
|
||||||
|
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
|
||||||
|
"""
|
||||||
|
Trains the classifier (if requested) and generates the validation distributions out of the training data.
|
||||||
|
The validation distributions have shape `(n, ch, nbins)`, with `n` the number of classes, `ch` the number of
|
||||||
|
channels, and `nbins` the number of bins. In particular, let `V` be the validation distributions; `di=V[i]`
|
||||||
|
are the distributions obtained from training data labelled with class `i`; `dij = di[j]` is the discrete
|
||||||
|
distribution of posterior probabilities `P(Y=j|X=x)` for training data labelled with class `i`, and `dij[k]`
|
||||||
|
is the fraction of instances with a value in the `k`-th bin.
|
||||||
|
|
||||||
|
:param data: the training set
|
||||||
|
:param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
|
||||||
|
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
|
||||||
|
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
|
||||||
|
indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
|
||||||
|
to estimate the parameters
|
||||||
|
"""
|
||||||
|
if val_split is None:
|
||||||
|
val_split = self.val_split
|
||||||
|
|
||||||
|
self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
|
||||||
|
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
|
||||||
|
)
|
||||||
|
|
||||||
|
self.validation_distribution = np.asarray(
|
||||||
|
[self.__get_distributions(posteriors[y==cat]) for cat in range(data.n_classes)]
|
||||||
|
)
|
||||||
|
|
||||||
|
return self
|
||||||
|
|
||||||
|
def aggregate(self, posteriors: np.ndarray):
|
||||||
|
"""
|
||||||
|
Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution
|
||||||
|
(the mixture) that best matches the test distribution, in terms of the divergence measure of choice.
|
||||||
|
In the multiclass case, with `n` the number of classes, the test and mixture distributions contain
|
||||||
|
`n` channels (proper distributions of binned posterior probabilities), on which the divergence is computed
|
||||||
|
independently. The matching is computed as an average of the divergence across all channels.
|
||||||
|
|
||||||
|
:param instances: instances in the sample
|
||||||
|
:return: a vector of class prevalence estimates
|
||||||
|
"""
|
||||||
|
test_distribution = self.__get_distributions(posteriors)
|
||||||
|
divergence = _get_divergence(self.divergence)
|
||||||
|
n_classes, n_channels, nbins = self.validation_distribution.shape
|
||||||
|
def match(prev):
|
||||||
|
prev = np.expand_dims(prev, axis=0)
|
||||||
|
mixture_distribution = (prev @ self.validation_distribution.reshape(n_classes,-1)).reshape(n_channels, -1)
|
||||||
|
divs = [divergence(test_distribution[ch], mixture_distribution[ch]) for ch in range(n_channels)]
|
||||||
|
return np.mean(divs)
|
||||||
|
|
||||||
|
# the initial point is set as the uniform distribution
|
||||||
|
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
|
||||||
|
|
||||||
|
# solutions are bounded to those contained in the unit-simplex
|
||||||
|
bounds = tuple((0, 1) for x in range(n_classes)) # values in [0,1]
|
||||||
|
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
|
||||||
|
r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
|
||||||
|
return r.x
|
||||||
|
|
||||||
|
|
||||||
class ELM(AggregativeQuantifier, BinaryQuantifier):
|
class ELM(AggregativeQuantifier, BinaryQuantifier):
|
||||||
"""
|
"""
|
||||||
|
|
Loading…
Reference in New Issue