2023-11-12 13:04:19 +01:00
|
|
|
|
from abc import ABC, abstractmethod
|
2021-01-15 18:32:32 +01:00
|
|
|
|
from copy import deepcopy
|
2022-07-11 12:21:49 +02:00
|
|
|
|
from typing import Callable, Union
|
2021-01-15 18:32:32 +01:00
|
|
|
|
import numpy as np
|
2023-11-15 10:55:13 +01:00
|
|
|
|
from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling
|
2023-01-31 15:08:58 +01:00
|
|
|
|
from scipy import optimize
|
2023-02-14 19:15:59 +01:00
|
|
|
|
from sklearn.base import BaseEstimator
|
2021-01-15 18:32:32 +01:00
|
|
|
|
from sklearn.calibration import CalibratedClassifierCV
|
|
|
|
|
from sklearn.metrics import confusion_matrix
|
2023-02-14 19:15:59 +01:00
|
|
|
|
from sklearn.model_selection import cross_val_predict
|
2023-11-13 12:42:57 +01:00
|
|
|
|
|
2021-01-18 10:53:22 +01:00
|
|
|
|
import quapy as qp
|
2021-01-15 18:32:32 +01:00
|
|
|
|
import quapy.functional as F
|
2023-11-13 14:45:34 +01:00
|
|
|
|
from quapy.functional import get_divergence
|
2023-02-14 19:15:59 +01:00
|
|
|
|
from quapy.classification.calibration import NBVSCalibration, BCTSCalibration, TSCalibration, VSCalibration
|
2021-01-15 18:32:32 +01:00
|
|
|
|
from quapy.classification.svmperf import SVMperf
|
|
|
|
|
from quapy.data import LabelledCollection
|
2023-02-09 19:39:16 +01:00
|
|
|
|
from quapy.method.base import BaseQuantifier, BinaryQuantifier, OneVsAllGeneric
|
2024-03-15 16:24:45 +01:00
|
|
|
|
from quapy.method import _bayesian
|
|
|
|
|
|
2021-01-15 18:32:32 +01:00
|
|
|
|
|
2020-12-03 18:12:28 +01:00
|
|
|
|
|
|
|
|
|
# Abstract classes
|
|
|
|
|
# ------------------------------------
|
|
|
|
|
|
2023-11-13 12:42:57 +01:00
|
|
|
|
class AggregativeQuantifier(BaseQuantifier, ABC):
    """
    Abstract class for quantification methods that base their estimations on the aggregation of classification
    results. Aggregative quantifiers implement a pipeline that consists of generating classification predictions
    and aggregating them. For this reason, the training phase is implemented by :meth:`classifier_fit_predict`
    followed by :meth:`aggregation_fit`, while the testing phase is implemented by :meth:`classify` followed by
    :meth:`aggregate`. Subclasses of this abstract class must provide implementations for the abstract methods
    :meth:`aggregation_fit` and :meth:`aggregate`.
    Aggregative quantifiers also maintain a :attr:`classifier` attribute.

    The method :meth:`fit` comes with a default implementation based on :meth:`classifier_fit_predict`
    and :meth:`aggregation_fit`.

    The method :meth:`quantify` comes with a default implementation based on :meth:`classify`
    and :meth:`aggregate`.
    """

    # class-level default, so that `val_split` reads as None in subclasses that never set it
    val_split_ = None

    @property
    def val_split(self):
        """
        Gives access to the validation-split specification.

        :return: the value last assigned through the setter, or None if it was never set
        """
        return self.val_split_

    @val_split.setter
    def val_split(self, val_split):
        """
        Setter for the validation-split specification. A warning is printed if the value is a
        :class:`quapy.data.base.LabelledCollection`.

        :param val_split: the specification of the data on which classifier predictions are generated
            (see the parameter `predict_on` of :meth:`classifier_fit_predict`)
        """
        if isinstance(val_split, LabelledCollection):
            # note the trailing space in the first literal: adjacent literals are concatenated verbatim
            print('warning: setting val_split with a LabelledCollection will be inefficient in '
                  'model selection. Rather pass the LabelledCollection at fit time')
        self.val_split_ = val_split

    def _check_init_parameters(self):
        """
        Implements any check to be performed in the parameters of the init method before undertaking
        the training of the quantifier. This is made as to allow for a quick execution stop when the
        parameters are not valid.

        :return: Nothing. May raise an exception.
        """
        pass

    def _check_non_empty_classes(self, data: LabelledCollection):
        """
        Asserts all classes have positive instances.

        :param data: LabelledCollection
        :return: Nothing. May raise an exception.
        :raises ValueError: if at least one class has prevalence 0 in `data`
        """
        sample_prevs = data.prevalence()
        empty_classes = np.argwhere(sample_prevs == 0).flatten()
        if len(empty_classes) > 0:
            empty_class_names = data.classes_[empty_classes]
            raise ValueError(f'classes {empty_class_names} have no training examples')

    def fit(self, data: LabelledCollection, fit_classifier=True, val_split=None):
        """
        Trains the aggregative quantifier. This comes down to training a classifier and an aggregation function.

        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
        :param fit_classifier: whether to train the learner (default is True). Set to False if the
            learner has been trained outside the quantifier.
        :param val_split: specification of the data on which the classifier predictions are to be generated;
            it is forwarded as `predict_on` to :meth:`classifier_fit_predict`. If None (default), the value
            of the attribute :attr:`val_split` is used instead.
        :return: self
        """
        self._check_init_parameters()
        classif_predictions = self.classifier_fit_predict(data, fit_classifier, predict_on=val_split)
        self.aggregation_fit(classif_predictions, data)
        return self

    def classifier_fit_predict(self, data: LabelledCollection, fit_classifier=True, predict_on=None):
        """
        Trains the classifier if requested (`fit_classifier=True`) and generate the necessary predictions to
        train the aggregation function.

        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
        :param fit_classifier: whether to train the learner (default is True). Set to False if the
            learner has been trained outside the quantifier.
        :param predict_on: specifies the set on which predictions need to be issued. This parameter can
            be specified as None (default) to indicate no prediction is needed; a float in (0, 1) to
            indicate the proportion of instances to be used for predictions (the remainder is used for
            training); an integer >1 to indicate that the predictions must be generated via k-fold
            cross-validation, using this integer as k; or the data sample itself on which to generate
            the predictions. If None, the attribute :attr:`val_split` is consulted before concluding that
            no prediction is needed.
        :return: a :class:`quapy.data.base.LabelledCollection` with the classifier predictions as instances
            and the true labels as labels, or None if no prediction was requested
        :raises ValueError: if `predict_on` is of an unsupported type or value, or if it is a float or an
            integer while `fit_classifier=False`
        """
        assert isinstance(fit_classifier, bool), 'unexpected type for "fit_classifier", must be boolean'

        self._check_classifier(adapt_if_necessary=(self._classifier_method() == 'predict_proba'))

        if fit_classifier:
            self._check_non_empty_classes(data)

        # an explicit argument takes precedence over the attribute set at construction time
        if predict_on is None:
            predict_on = self.val_split

        if predict_on is None:
            if fit_classifier:
                self.classifier.fit(*data.Xy)
            predictions = None
        elif isinstance(predict_on, float):
            if fit_classifier:
                if not (0. < predict_on < 1.):
                    raise ValueError(f'proportion {predict_on=} out of range, must be in (0,1)')
                train, val = data.split_stratified(train_prop=(1 - predict_on))
                self.classifier.fit(*train.Xy)
                predictions = LabelledCollection(self.classify(val.X), val.y, classes=data.classes_)
            else:
                raise ValueError(f'wrong type for predict_on: since fit_classifier=False, '
                                 f'the set on which predictions have to be issued must be '
                                 f'explicitly indicated')

        elif isinstance(predict_on, LabelledCollection):
            if fit_classifier:
                self.classifier.fit(*data.Xy)
            predictions = LabelledCollection(self.classify(predict_on.X), predict_on.y, classes=predict_on.classes_)

        elif isinstance(predict_on, int):
            if fit_classifier:
                if predict_on <= 1:
                    raise ValueError(f'invalid value {predict_on} in fit. '
                                     f'Specify a integer >1 for kFCV estimation.')
                else:
                    n_jobs = self.n_jobs if hasattr(self, 'n_jobs') else qp._get_njobs(None)
                    predictions = cross_val_predict(
                        self.classifier, *data.Xy, cv=predict_on, n_jobs=n_jobs, method=self._classifier_method())
                    predictions = LabelledCollection(predictions, data.y, classes=data.classes_)
                    # the cross-validated predictions are in place; now train the classifier on all the data
                    self.classifier.fit(*data.Xy)
            else:
                raise ValueError(f'wrong type for predict_on: since fit_classifier=False, '
                                 f'the set on which predictions have to be issued must be '
                                 f'explicitly indicated')

        else:
            # list only the types handled by the branches above
            raise ValueError(
                f'error: param "predict_on" ({type(predict_on)}) not understood; '
                f'use either a float indicating the split proportion, an integer >1 indicating '
                f'the number of folds for k-fold cross-validation, or a LabelledCollection '
                f'indicating the validation partition')

        return predictions

    @abstractmethod
    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
        """
        Trains the aggregation function.

        :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing,
            as instances, the predictions issued by the classifier and, as labels, the true labels
            (:meth:`classifier_fit_predict` returns None when no prediction was requested)
        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
        """
        ...

    @property
    def classifier(self):
        """
        Gives access to the classifier

        :return: the classifier (typically an sklearn's Estimator)
        """
        return self.classifier_

    @classifier.setter
    def classifier(self, classifier):
        """
        Setter for the classifier

        :param classifier: the classifier
        """
        self.classifier_ = classifier

    def classify(self, instances):
        """
        Provides the label predictions for the given instances. The predictions should respect the format expected by
        :meth:`aggregate`, e.g., posterior probabilities for probabilistic quantifiers, or crisp predictions for
        non-probabilistic quantifiers. The predictions are obtained by invoking, on the classifier, the method
        named by :meth:`_classifier_method` ("decision_function" by default).

        :param instances: array-like of shape `(n_instances, n_features,)`
        :return: the predictions, as returned by the corresponding method of the classifier
        """
        return getattr(self.classifier, self._classifier_method())(instances)

    def _classifier_method(self):
        """
        Name of the method that must be used for issuing label predictions. The default one is "decision_function".

        :return: string
        """
        return 'decision_function'

    def _check_classifier(self, adapt_if_necessary=False):
        """
        Guarantees that the underlying classifier implements the method required for issuing predictions, i.e.,
        the method indicated by the :meth:`_classifier_method`

        :param adapt_if_necessary: if True, the method will try to comply with the required specifications
            (this default implementation does not use the flag and only performs the check)
        """
        assert hasattr(self.classifier, self._classifier_method()), \
            f"the method does not implement the required {self._classifier_method()} method"

    def quantify(self, instances):
        """
        Generate class prevalence estimates for the sample's instances by aggregating the label predictions generated
        by the classifier.

        :param instances: array-like
        :return: `np.ndarray` of shape `(n_classes)` with class prevalence estimates.
        """
        classif_predictions = self.classify(instances)
        return self.aggregate(classif_predictions)

    @abstractmethod
    def aggregate(self, classif_predictions: np.ndarray):
        """
        Implements the aggregation of label predictions.

        :param classif_predictions: `np.ndarray` of label predictions
        :return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates.
        """
        ...

    @property
    def classes_(self):
        """
        Class labels, in the same order in which class prevalence values are to be computed.
        This default implementation actually returns the class labels of the learner.

        :return: array-like
        """
        return self.classifier.classes_
|
2021-01-18 16:52:19 +01:00
|
|
|
|
|
2020-12-03 18:12:28 +01:00
|
|
|
|
|
2023-11-15 10:55:13 +01:00
|
|
|
|
class AggregativeCrispQuantifier(AggregativeQuantifier, ABC):
    """
    Base class for aggregative quantifiers whose aggregation step consumes crisp (hard) decisions,
    i.e., the label predictions returned by a hard classifier. It specializes
    :class:`AggregativeQuantifier` only in the choice of the classifier method used for issuing
    predictions.
    """

    def _classifier_method(self):
        """
        Name of the classifier method used for issuing label predictions. Crisp quantifiers rely on
        'predict', which returns an array of shape `(n_instances,)` of label predictions.

        :return: the string "predict", i.e., the standard method name for scikit-learn hard predictions
        """
        method_name = 'predict'
        return method_name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class AggregativeSoftQuantifier(AggregativeQuantifier, ABC):
    """
    Base class for aggregative quantifiers whose aggregation step consumes the posterior
    probabilities returned by a probabilistic classifier. It specializes
    :class:`AggregativeQuantifier` in the choice of the classifier method used for issuing
    predictions, and in the way the classifier is checked before training.
    """

    def _classifier_method(self):
        """
        Name of the classifier method used for issuing predictions. Probabilistic quantifiers rely on
        'predict_proba', which returns an array of shape `(n_instances, n_dimensions,)` with posterior
        probabilities.

        :return: the string "predict_proba", i.e., the standard method name for scikit-learn soft predictions
        """
        return 'predict_proba'

    def _check_classifier(self, adapt_if_necessary=False):
        """
        Makes sure the underlying classifier exposes the method named by :meth:`_classifier_method`.
        When it does not, and `adapt_if_necessary` is True, a warning is printed and the classifier is
        replaced by a scikit-learn `CalibratedClassifierCV` (with cv=5) wrapping it. When it does not,
        and `adapt_if_necessary` is False, an exception is raised.

        :param adapt_if_necessary: whether a classifier lacking the required method may be wrapped
            in a `CalibratedClassifierCV`
        :raises AssertionError: if the classifier lacks the required method and `adapt_if_necessary`
            is False
        """
        # nothing to do when the classifier already exposes the required method
        if hasattr(self.classifier, self._classifier_method()):
            return

        if not adapt_if_necessary:
            raise AssertionError(f'error: The learner {self.classifier.__class__.__name__} does not '
                                 f'seem to be probabilistic. The learner cannot be calibrated since '
                                 f'fit_classifier is set to False')

        print(f'warning: The learner {self.classifier.__class__.__name__} does not seem to be '
              f'probabilistic. The learner will be calibrated (using CalibratedClassifierCV).')
        calibrated = CalibratedClassifierCV(self.classifier, cv=5)
        self.classifier = calibrated
|
2020-12-03 18:12:28 +01:00
|
|
|
|
|
|
|
|
|
|
2024-01-10 15:39:27 +01:00
|
|
|
|
class BinaryAggregativeQuantifier(AggregativeQuantifier, BinaryQuantifier):
    """
    Aggregative quantifier restricted to binary problems. The positive and the negative labels are
    read from the `classes_` attribute of the classifier.
    """

    @property
    def pos_label(self):
        """
        Label of the positive class.

        :return: the second entry (index 1) of the classifier's `classes_`
        """
        learner_classes = self.classifier.classes_
        return learner_classes[1]

    @property
    def neg_label(self):
        """
        Label of the negative class.

        :return: the first entry (index 0) of the classifier's `classes_`
        """
        learner_classes = self.classifier.classes_
        return learner_classes[0]

    def fit(self, data: LabelledCollection, fit_classifier=True, val_split=None):
        """
        Runs the binary check (`_check_binary`) on the training data and then delegates the training
        to the parent implementation.

        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
        :param fit_classifier: forwarded unchanged to the parent `fit`
        :param val_split: forwarded unchanged to the parent `fit`
        :return: whatever the parent `fit` returns
        """
        quantifier_name = self.__class__.__name__
        self._check_binary(data, quantifier_name)
        return super().fit(data, fit_classifier, val_split)
|
2023-11-15 10:55:13 +01:00
|
|
|
|
|
|
|
|
|
|
2020-12-03 18:12:28 +01:00
|
|
|
|
# Methods
|
|
|
|
|
# ------------------------------------
|
2023-11-15 10:55:13 +01:00
|
|
|
|
class CC(AggregativeCrispQuantifier):
    """
    Classify & Count, the most basic quantification method: every instance is classified, and the
    class prevalence estimates are computed by counting how many instances have been attributed to
    each of the classes.

    :param classifier: a sklearn's Estimator that generates a classifier
    """

    def __init__(self, classifier: BaseEstimator):
        self.classifier = classifier

    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
        """
        No-op: this method has no aggregation function to train.

        :param classif_predictions: not used
        :param data: not used
        """
        return None

    def aggregate(self, classif_predictions: np.ndarray):
        """
        Turns label predictions into class prevalence estimates by counting the prevalence of each of
        the predicted labels.

        :param classif_predictions: array-like with label predictions
        :return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates.
        """
        class_labels = self.classes_
        return F.prevalence_from_labels(classif_predictions, class_labels)
|
2020-12-03 18:12:28 +01:00
|
|
|
|
|
|
|
|
|
|
2023-11-16 14:29:34 +01:00
|
|
|
|
class ACC(AggregativeCrispQuantifier):
    """
    `Adjusted Classify & Count <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_,
    the "adjusted" variant of :class:`CC`, that corrects the predictions of CC
    according to the `misclassification rates`.

    :param classifier: a sklearn's Estimator that generates a classifier
    :param val_split: specifies the data used for generating classifier predictions. This specification
        can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
        be extracted from the training set; or as an integer (default 5), indicating that the predictions
        are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
        for `k`); or as a collection defining the specific set of data to use for validation.
        Alternatively, this set can be specified at fit time by indicating the exact set of data
        on which the predictions are to be generated.
    :param n_jobs: number of parallel workers
    :param solver: indicates the method to be used for obtaining the final estimates. The choice
        'exact' comes down to solving the system of linear equations :math:`Ax=B` where `A` is a
        matrix containing the class-conditional probabilities of the predictions (e.g., the tpr and fpr in
        binary) and `B` is the vector of prevalence values estimated via CC, as :math:`x=A^{-1}B`. This solution
        might not exist for degenerated classifiers, in which case the method defaults to classify and count
        (i.e., does not attempt any adjustment).
        Another option is to search for the prevalence vector that minimizes the L2 norm of :math:`|Ax-B|`. The latter
        is achieved by indicating solver='minimize'. This one generally works better, and is the default parameter.
        More details about this can be consulted in `Bunse, M. "On Multi-Class Extensions of Adjusted Classify and
        Count", on proceedings of the 2nd International Workshop on Learning to Quantify: Methods and Applications
        (LQ 2022), ECML/PKDD 2022, Grenoble (France) <https://lq-2022.github.io/proceedings/CompleteVolume.pdf>`_.
    """

    def __init__(self, classifier: BaseEstimator, val_split=5, n_jobs=None, solver='minimize'):
        self.classifier = classifier
        self.val_split = val_split
        self.n_jobs = qp._get_njobs(n_jobs)
        self.solver = solver

    def _check_init_parameters(self):
        """
        Checks that the solver requested at construction time is one of the supported ones.

        :return: Nothing.
        :raises ValueError: if `solver` is neither 'exact' nor 'minimize'
        """
        if self.solver not in ['exact', 'minimize']:
            raise ValueError("unknown solver; valid ones are 'exact', 'minimize'")

    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
        """
        Estimates the misclassification rates.

        :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing,
            as instances, the label predictions issued by the classifier and, as labels, the true labels
        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
            (not used by this method)
        """
        pred_labels, true_labels = classif_predictions.Xy
        # the unadjusted estimates are later obtained by delegating to a CC that shares the classifier
        self.cc = CC(self.classifier)
        self.Pte_cond_estim_ = self.getPteCondEstim(self.classifier.classes_, true_labels, pred_labels)

    @classmethod
    def getPteCondEstim(cls, classes, y, y_):
        """
        Estimates the matrix of class-conditional rates of the predictions.

        :param classes: the class labels, in the order used for the rows and columns of the matrix
        :param y: the true labels
        :param y_: the predicted labels
        :return: a `np.ndarray` of shape `(n_classes, n_classes,)` with entry `(i,j)` being the estimate
            of :math:`P(\\hat{y}_i|y_j)`
        """
        # estimate the matrix with entry (i,j) being the estimate of P(hat_yi|yj), that is, the probability that a
        # document that belongs to yj ends up being classified as belonging to yi
        conf = confusion_matrix(y, y_, labels=classes).T
        conf = conf.astype(float)
        # after the transposition, each column holds the predictions for one true class
        class_counts = conf.sum(axis=0)
        for i, _ in enumerate(classes):
            if class_counts[i] == 0:
                # class without examples: its column is all zeros; set the diagonal entry to 1
                conf[i, i] = 1
            else:
                conf[:, i] /= class_counts[i]
        return conf

    def aggregate(self, classif_predictions):
        """
        Computes the CC estimates for the given predictions and adjusts them using the
        misclassification rates estimated in :meth:`aggregation_fit`.

        :param classif_predictions: array-like with label predictions
        :return: whatever :meth:`solve_adjustment` returns for the CC estimates
        """
        prevs_estim = self.cc.aggregate(classif_predictions)
        return ACC.solve_adjustment(self.Pte_cond_estim_, prevs_estim, solver=self.solver)

    @classmethod
    def solve_adjustment(cls, PteCondEstim, prevs_estim, solver='exact'):
        """
        Solves the linear system :math:`Ax = B` with :math:`A` = `PteCondEstim` and :math:`B` = `prevs_estim`

        :param PteCondEstim: a `np.ndarray` of shape `(n_classes,n_classes,)` with entry `(i,j)` being the estimate
            of :math:`P(y_i|y_j)`, that is, the probability that an instance that belongs to :math:`y_j` ends up being
            classified as belonging to :math:`y_i`
        :param prevs_estim: a `np.ndarray` of shape `(n_classes,)` with the class prevalence estimates
        :param solver: indicates the method to use for solving the system of linear equations. Valid options are
            'exact' (tries to solve the system --may fail if the misclassification matrix has rank < n_classes) or
            'minimize' (minimizes a norm --always exists).
        :return: an adjusted `np.ndarray` of shape `(n_classes,)` with the corrected class prevalence estimates
        """

        estimate = F.solve_adjustment(
            p_c_y=PteCondEstim,
            p_c=prevs_estim,
            solver=solver,
            method='inversion',
        )
        return F.clip_prevalence(estimate, method="clip")
|
2020-12-03 18:12:28 +01:00
|
|
|
|
|
|
|
|
|
|
2023-11-15 10:55:13 +01:00
|
|
|
|
class PCC(AggregativeSoftQuantifier):
    """
    `Probabilistic Classify & Count <https://ieeexplore.ieee.org/abstract/document/5694031>`_,
    the probabilistic counterpart of CC, which aggregates the posterior probabilities returned by a
    probabilistic classifier.

    :param classifier: a sklearn's Estimator that generates a classifier
    """

    def __init__(self, classifier: BaseEstimator):
        self.classifier = classifier

    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
        """
        No-op: this method has no aggregation function to train.

        :param classif_predictions: not used
        :param data: not used
        """
        return None

    def aggregate(self, classif_posteriors):
        """
        Turns posterior probabilities into class prevalence estimates by delegating to
        `F.prevalence_from_probabilities` with `binarize=False`.

        :param classif_posteriors: the posterior probabilities issued by the classifier
        :return: the class prevalence estimates
        """
        prevalence_estimates = F.prevalence_from_probabilities(classif_posteriors, binarize=False)
        return prevalence_estimates
|
2020-12-03 18:12:28 +01:00
|
|
|
|
|
|
|
|
|
|
2023-11-16 14:29:34 +01:00
|
|
|
|
class PACC(AggregativeSoftQuantifier):
|
2022-03-15 14:16:37 +01:00
|
|
|
|
"""
|
|
|
|
|
`Probabilistic Adjusted Classify & Count <https://ieeexplore.ieee.org/abstract/document/5694031>`_,
|
|
|
|
|
the probabilistic variant of ACC that relies on the posterior probabilities returned by a probabilistic classifier.
|
|
|
|
|
|
2023-01-27 18:13:23 +01:00
|
|
|
|
:param classifier: a sklearn's Estimator that generates a classifier
|
2023-11-15 10:55:13 +01:00
|
|
|
|
:param val_split: specifies the data used for generating classifier predictions. This specification
|
|
|
|
|
can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
|
2024-01-25 14:33:41 +01:00
|
|
|
|
be extracted from the training set; or as an integer (default 5), indicating that the predictions
|
2023-11-15 10:55:13 +01:00
|
|
|
|
are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
|
|
|
|
|
for `k`). Alternatively, this set can be specified at fit time by indicating the exact set of data
|
|
|
|
|
on which the predictions are to be generated.
|
2022-07-11 14:00:25 +02:00
|
|
|
|
:param n_jobs: number of parallel workers
|
2024-01-25 18:03:35 +01:00
|
|
|
|
:param solver: indicates the method to be used for obtaining the final estimates. The choice
|
2024-02-07 18:31:34 +01:00
|
|
|
|
'exact' comes down to solving the system of linear equations :math:`Ax=B` where `A` is a
|
2024-01-25 18:03:35 +01:00
|
|
|
|
matrix containing the class-conditional probabilities of the predictions (e.g., the tpr and fpr in
|
2024-02-07 18:31:34 +01:00
|
|
|
|
binary) and `B` is the vector of prevalence values estimated via CC, as :math:`x=A^{-1}B`. This solution
|
2024-01-25 18:03:35 +01:00
|
|
|
|
might not exist for degenerated classifiers, in which case the method defaults to classify and count
|
|
|
|
|
(i.e., does not attempt any adjustment).
|
2024-02-07 18:31:34 +01:00
|
|
|
|
Another option is to search for the prevalence vector that minimizes the L2 norm of :math:`|Ax-B|`. The latter
|
|
|
|
|
is achieved by indicating solver='minimize'. This one generally works better, and is the default parameter.
|
2024-01-25 18:03:35 +01:00
|
|
|
|
More details about this can be consulted in `Bunse, M. "On Multi-Class Extensions of Adjusted Classify and
|
|
|
|
|
Count", on proceedings of the 2nd International Workshop on Learning to Quantify: Methods and Applications
|
|
|
|
|
(LQ 2022), ECML/PKDD 2022, Grenoble (France) <https://lq-2022.github.io/proceedings/CompleteVolume.pdf>`_.
|
|
|
|
|
|
2022-03-15 14:16:37 +01:00
|
|
|
|
"""
|
2020-12-03 18:12:28 +01:00
|
|
|
|
|
2024-01-25 16:43:00 +01:00
|
|
|
|
def __init__(self, classifier: BaseEstimator, val_split=5, n_jobs=None, solver='minimize'):
    # plain attribute storage; only `n_jobs` is resolved through `qp._get_njobs` before being stored
    resolved_n_jobs = qp._get_njobs(n_jobs)
    self.classifier = classifier
    self.val_split = val_split
    self.n_jobs = resolved_n_jobs
    self.solver = solver
|
|
|
|
|
|
|
|
|
|
def _check_init_parameters(self):
|
|
|
|
|
assert self.solver in ['exact', 'minimize'], "unknown solver; valid ones are 'exact', 'minimize'"
|
2020-12-03 18:12:28 +01:00
|
|
|
|
|
2023-11-15 10:55:13 +01:00
|
|
|
|
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
    """
    Estimates the misclassification rates from the (soft) predictions issued for the validation data.

    :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing,
        as instances, the posterior probabilities issued by the classifier and, as labels, the true labels
    :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
        (not used by this method)
    """
    validation_posteriors, validation_labels = classif_predictions.Xy
    # the unadjusted estimate that `aggregate` later corrects is delegated to a PCC quantifier
    self.pcc = PCC(self.classifier)
    self.Pte_cond_estim_ = self.getPteCondEstim(
        self.classifier.classes_, validation_labels, validation_posteriors
    )
|
2022-03-14 16:42:41 +01:00
|
|
|
|
|
2023-11-15 10:55:13 +01:00
|
|
|
|
def aggregate(self, classif_posteriors):
    """
    Computes the unadjusted prevalence estimate with the internal PCC quantifier and corrects it
    using the matrix estimated in `aggregation_fit`.

    :param classif_posteriors: the posterior probabilities issued by the classifier for the
        instances of the sample to quantify
    :return: the value returned by `ACC.solve_adjustment` for the configured solver
    """
    unadjusted_prevalence = self.pcc.aggregate(classif_posteriors)
    return ACC.solve_adjustment(
        self.Pte_cond_estim_,
        unadjusted_prevalence,
        solver=self.solver,
    )
|
2023-11-15 10:55:13 +01:00
|
|
|
|
|
2022-03-14 16:42:41 +01:00
|
|
|
|
@classmethod
def getPteCondEstim(cls, classes, y, y_):
    """
    Estimates the matrix whose entry (i,j) is the estimate of P(hat_yi|yj), that is, the probability
    that a document that belongs to yj ends up being classified as belonging to yi.

    :param classes: the class labels, in the order that defines the rows/columns of the matrix
    :param y: array with the true labels
    :param y_: array of shape `(n_instances, n_classes)` with the posterior probabilities
    :return: `np.ndarray` of shape `(n_classes, n_classes)`; the column of a class with no instances
        in `y` is left as the corresponding column of the identity matrix
    """
    n_classes = len(classes)
    # row j holds the mean posterior vector of the instances of class j; starting from the identity
    # makes classes that are absent from `y` map onto themselves
    mean_posteriors_by_class = np.eye(n_classes)
    for row, label in enumerate(classes):
        belongs_to_label = (y == label)
        if not belongs_to_label.any():
            continue
        mean_posteriors_by_class[row] = y_[belongs_to_label].mean(axis=0)

    # transposing moves the true class from the rows to the columns
    return mean_posteriors_by_class.T
|
2020-12-03 18:12:28 +01:00
|
|
|
|
|
|
|
|
|
|
2023-11-15 10:55:13 +01:00
|
|
|
|
class EMQ(AggregativeSoftQuantifier):
    """
    `Expectation Maximization for Quantification <https://ieeexplore.ieee.org/abstract/document/6789744>`_ (EMQ),
    aka `Saerens-Latinne-Decaestecker` (SLD) algorithm.
    EMQ consists of using the well-known `Expectation Maximization algorithm` to iteratively update the posterior
    probabilities generated by a probabilistic classifier and the class prevalence estimates obtained via
    maximum-likelihood estimation, in a mutually recursive way, until convergence.

    This implementation also gives access to the heuristics proposed by `Alexandari et al. paper
    <http://proceedings.mlr.press/v119/alexandari20a.html>`_. These heuristics consist of using, as the training
    prevalence, an estimate of it obtained via k-fold cross validation (instead of the true training prevalence),
    and to recalibrate the posterior probabilities of the classifier.

    :param classifier: a sklearn's Estimator that generates a classifier
    :param val_split: specifies the data used for generating classifier predictions. This specification
        can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
        be extracted from the training set; or as an integer, indicating that the predictions
        are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
        for `k`); or as a collection defining the specific set of data to use for validation.
        Alternatively, this set can be specified at fit time by indicating the exact set of data
        on which the predictions are to be generated. This hyperparameter is only meant to be used when the
        heuristics are to be applied, i.e., if a recalibration is required. The default value is None (meaning
        the recalibration is not required). In case this hyperparameter is set to a value other than None, but
        none of the heuristics is requested (`recalib=None` and `exact_train_prev=True`), a `RuntimeWarning`
        is raised by `_check_init_parameters`. Conversely, if a recalibration is requested and this
        hyperparameter is None, `_check_init_parameters` prints a warning and sets it to 5.
    :param exact_train_prev: set to True (default) for using the true training prevalence as the initial observation;
        set to False for computing the training prevalence as an estimate of it, i.e., as the expected
        value of the posterior probabilities of the training instances.
    :param recalib: a string indicating the method of recalibration.
        Available choices include "nbvs" (No-Bias Vector Scaling), "bcts" (Bias-Corrected Temperature Scaling),
        "ts" (Temperature Scaling), and "vs" (Vector Scaling). Default is None (no recalibration).
    :param n_jobs: number of parallel workers. Only used for recalibrating the classifier if `val_split` is set to
        an integer `k` --the number of folds.
    """

    # maximum number of EM iterations, and default tolerance used as the stopping criterion
    MAX_ITER = 1000
    EPSILON = 1e-4

    def __init__(self, classifier: BaseEstimator, val_split=None, exact_train_prev=True, recalib=None, n_jobs=None):
        self.classifier = classifier
        self.val_split = val_split
        self.exact_train_prev = exact_train_prev
        self.recalib = recalib
        self.n_jobs = n_jobs

    @classmethod
    def EMQ_BCTS(cls, classifier: BaseEstimator, n_jobs=None):
        """
        Constructs an instance of EMQ using the best configuration found in the `Alexandari et al. paper
        <http://proceedings.mlr.press/v119/alexandari20a.html>`_, i.e., one that relies on Bias-Corrected Temperature
        Scaling (BCTS) as a recalibration function, and that uses an estimate of the training prevalence instead of
        the true training prevalence.

        :param classifier: a sklearn's Estimator that generates a classifier
        :param n_jobs: number of parallel workers.
        :return: An instance of EMQ with BCTS
        """
        return EMQ(classifier, val_split=5, exact_train_prev=False, recalib='bcts', n_jobs=n_jobs)

    def _check_init_parameters(self):
        """
        Checks the consistency between `val_split` and the heuristics (`exact_train_prev`, `recalib`).

        :raises RuntimeWarning: if `val_split` is specified but none of the heuristics that need it is requested
        """
        if self.val_split is not None:
            if self.exact_train_prev and self.recalib is None:
                raise RuntimeWarning(f'The parameter {self.val_split=} was specified for EMQ, while the parameters '
                                     f'{self.exact_train_prev=} and {self.recalib=}. This has no effect and causes an unnecessary '
                                     f'overload.')
        else:
            if self.recalib is not None:
                # recalibrating requires held-out predictions, so a default number of folds is imposed
                print(f'[warning] The parameter {self.recalib=} requires the val_split be different from None. '
                      f'This parameter will be set to 5. To avoid this warning, set this value to a float value '
                      f'indicating the proportion of training data to be used as validation, or to an integer '
                      f'indicating the number of folds for kFCV.')
                self.val_split = 5

    def classify(self, instances):
        """
        Provides the posterior probabilities for the given instances. If the classifier was required
        to be recalibrated, then these posteriors are recalibrated accordingly.

        :param instances: array-like of shape `(n_instances, n_dimensions,)`
        :return: np.ndarray of shape `(n_instances, n_classes,)` with posterior probabilities
        """
        posteriors = self.classifier.predict_proba(instances)
        # the calibration function only exists after `aggregation_fit` has run with a recalibration method
        if hasattr(self, 'calibration_function') and self.calibration_function is not None:
            posteriors = self.calibration_function(posteriors)
        return posteriors

    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
        """
        Trains the aggregation function of EMQ. This comes down to recalibrating the posterior probabilities
        if requested, and to setting the training prevalence used as the starting point of EM.

        :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing,
            as instances, the posterior probabilities issued by the classifier and, as labels, the true labels
        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
        :raises ValueError: if `recalib` is not one of the supported recalibration methods
        """
        if self.recalib is not None:
            P, y = classif_predictions.Xy
            if self.recalib == 'nbvs':
                calibrator = NoBiasVectorScaling()
            elif self.recalib == 'bcts':
                calibrator = TempScaling(bias_positions='all')
            elif self.recalib == 'ts':
                calibrator = TempScaling()
            elif self.recalib == 'vs':
                calibrator = VectorScaling()
            else:
                raise ValueError('invalid param argument for recalibration method; available ones are '
                                 '"nbvs", "bcts", "ts", and "vs".')

            # non-numeric labels are mapped to their position in `data.classes_` so that they can be
            # used as indexes for the one-hot encoding below
            if not np.issubdtype(y.dtype, np.number):
                y = np.searchsorted(data.classes_, y)
            self.calibration_function = calibrator(P, np.eye(data.n_classes)[y], posterior_supplied=True)

        if self.exact_train_prev:
            self.train_prevalence = data.prevalence()
        else:
            train_posteriors = classif_predictions.X
            if self.recalib is not None:
                train_posteriors = self.calibration_function(train_posteriors)
            self.train_prevalence = F.prevalence_from_probabilities(train_posteriors)

    def aggregate(self, classif_posteriors, epsilon=EPSILON):
        """
        Runs EM on the posterior probabilities and returns the estimated class prevalence values.

        :param classif_posteriors: `np.ndarray` of shape `(n_instances, n_classes,)` with the
            posterior probabilities
        :param epsilon: error tolerance
        :return: the estimated prevalence values
        """
        priors, posteriors = self.EM(self.train_prevalence, classif_posteriors, epsilon)
        return priors

    def predict_proba(self, instances, epsilon=EPSILON):
        """
        Returns the posterior probabilities updated by the EM algorithm.

        :param instances: np.ndarray of shape `(n_instances, n_dimensions)`
        :param epsilon: error tolerance
        :return: np.ndarray of shape `(n_instances, n_classes)`
        """
        classif_posteriors = self.classify(instances)
        priors, posteriors = self.EM(self.train_prevalence, classif_posteriors, epsilon)
        return posteriors

    @classmethod
    def EM(cls, tr_prev, posterior_probabilities, epsilon=EPSILON):
        """
        Computes the `Expectation Maximization` routine.

        :param tr_prev: array-like, the training prevalence
        :param posterior_probabilities: `np.ndarray` of shape `(n_instances, n_classes,)` with the
            posterior probabilities
        :param epsilon: float, the threshold difference between two consecutive iterations
            to reach before stopping the loop
        :return: a tuple with the estimated prevalence values (shape `(n_classes,)`) and
            the corrected posterior probabilities (shape `(n_instances, n_classes,)`)
        """
        Px = posterior_probabilities
        # a float copy is taken so that the in-place smoothing below neither alters the caller's
        # array nor fails when the prevalence values are given as integers
        Ptr = np.array(tr_prev, dtype=float)

        if np.prod(Ptr) == 0:  # some entry is 0; we should smooth the values to avoid 0 division
            Ptr += epsilon
            Ptr /= Ptr.sum()

        qs = np.copy(Ptr)  # qs (the running estimate) is initialized as the training prevalence

        s, converged = 0, False
        qs_prev_ = None
        while not converged and s < EMQ.MAX_ITER:
            # E-step: ps is Ps(y|xi)
            ps_unnormalized = (qs / Ptr) * Px
            ps = ps_unnormalized / ps_unnormalized.sum(axis=1, keepdims=True)

            # M-step:
            qs = ps.mean(axis=0)

            # convergence is only accepted after more than 10 iterations have been carried out
            if qs_prev_ is not None and qp.error.mae(qs, qs_prev_) < epsilon and s > 10:
                converged = True

            qs_prev_ = qs
            s += 1

        if not converged:
            print('[warning] the method has reached the maximum number of iterations; it might have not converged')

        return qs, ps
|
2020-12-03 18:12:28 +01:00
|
|
|
|
|
|
|
|
|
|
2024-03-15 16:24:45 +01:00
|
|
|
|
class BayesianCC(AggregativeCrispQuantifier):
    """
    `Bayesian quantification <https://arxiv.org/abs/2302.09159>`_ method,
    which is a variant of :class:`ACC` that calculates the posterior probability distribution
    over the prevalence vectors, rather than providing a point estimate obtained
    by matrix inversion.

    Can be used to diagnose degeneracy in the predictions visible when the confusion
    matrix has high condition number or to quantify uncertainty around the point estimate.

    This method relies on extra dependencies, which have to be installed via:
    `$ pip install quapy[bayes]`

    :param classifier: a sklearn's Estimator that generates a classifier
    :param val_split: a float in (0, 1) indicating the proportion of the training data to be used,
        as a stratified held-out validation set, for generating classifier predictions.
    :param num_warmup: number of warmup iterations for the MCMC sampler (default 500)
    :param num_samples: number of samples to draw from the posterior (default 1000)
    :param mcmc_seed: random seed for the MCMC sampler (default 0)
    """

    def __init__(self,
                 classifier: BaseEstimator,
                 val_split: float = 0.75,
                 num_warmup: int = 500,
                 num_samples: int = 1_000,
                 mcmc_seed: int = 0):

        # the sampler settings are validated first, then the validation split
        if num_warmup <= 0:
            raise ValueError(f'parameter {num_warmup=} must be a positive integer')
        if num_samples <= 0:
            raise ValueError(f'parameter {num_samples=} must be a positive integer')

        split_is_float = isinstance(val_split, float)
        if not split_is_float or val_split <= 0 or val_split >= 1:
            raise ValueError(f'val_split must be a float in (0, 1), got {val_split}')

        # the optional dependencies are checked last
        if _bayesian.DEPENDENCIES_INSTALLED is False:
            raise ImportError("Auxiliary dependencies are required. Run `$ pip install quapy[bayes]` to install them.")

        self.classifier = classifier
        self.val_split = val_split
        self.num_warmup = num_warmup
        self.num_samples = num_samples
        self.mcmc_seed = mcmc_seed

        # Confusion counts of shape (n_classes, n_predicted_classes,): entry (y, c) is the number of
        # instances labeled as class y and predicted as class c. It stays None until `aggregation_fit`
        # computes it.
        self._n_and_c_labeled = None

        # Dictionary with the posterior samples; it stays None until `sample_from_posterior`
        # (which `aggregate` invokes) is called.
        self._samples = None

    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
        """
        Estimates the misclassification rates, by counting how the validation instances of each true class
        are distributed across the predicted classes.

        :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing,
            as instances, the label predictions issued by the classifier and, as labels, the true labels
        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
            (not used by this method)
        """
        predicted, actual = classif_predictions.Xy
        self._n_and_c_labeled = confusion_matrix(
            y_true=actual,
            y_pred=predicted,
            labels=self.classifier.classes_,
        )

    def sample_from_posterior(self, classif_predictions):
        """
        Draws samples from the posterior distribution given the label predictions issued for a sample.
        The samples are also stored, so that they can be later accessed via `get_prevalence_samples`
        and `get_conditional_probability_samples`.

        :param classif_predictions: the label predictions issued by the classifier for the sample
        :return: the samples, as returned by `_bayesian.sample_posterior`
        :raises ValueError: if `aggregation_fit` has not been called before
        """
        if self._n_and_c_labeled is None:
            raise ValueError("aggregation_fit must be called before sample_from_posterior")

        # number of instances of the sample predicted as belonging to each class
        predicted_counts = F.counts_from_labels(classif_predictions, self.classifier.classes_)

        sampler_settings = dict(
            n_c_unlabeled=predicted_counts,
            n_y_and_c_labeled=self._n_and_c_labeled,
            num_warmup=self.num_warmup,
            num_samples=self.num_samples,
            seed=self.mcmc_seed,
        )
        self._samples = _bayesian.sample_posterior(**sampler_settings)
        return self._samples

    def get_prevalence_samples(self):
        """
        Returns the samples stored under the key `_bayesian.P_TEST_Y` in the last call to
        `sample_from_posterior`.

        :raises ValueError: if `sample_from_posterior` has not been called before
        """
        if self._samples is None:
            raise ValueError("sample_from_posterior must be called before get_prevalence_samples")
        return self._samples[_bayesian.P_TEST_Y]

    def get_conditional_probability_samples(self):
        """
        Returns the samples stored under the key `_bayesian.P_C_COND_Y` in the last call to
        `sample_from_posterior`.

        :raises ValueError: if `sample_from_posterior` has not been called before
        """
        if self._samples is None:
            raise ValueError("sample_from_posterior must be called before get_conditional_probability_samples")
        return self._samples[_bayesian.P_C_COND_Y]

    def aggregate(self, classif_predictions):
        """
        Returns, as the point estimate, the mean (across the first axis) of the prevalence samples drawn
        from the posterior.

        :param classif_predictions: the label predictions issued by the classifier for the sample
        :return: `np.ndarray` of floats with the averaged prevalence samples
        """
        posterior_samples = self.sample_from_posterior(classif_predictions)
        prevalence_draws = posterior_samples[_bayesian.P_TEST_Y]
        return np.asarray(prevalence_draws.mean(axis=0), dtype=float)
|
|
|
|
|
|
|
|
|
|
|
2024-01-10 15:39:27 +01:00
|
|
|
|
class HDy(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
    """
    `Hellinger Distance y <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDy).
    HDy is a probabilistic method for training binary quantifiers, that models quantification as the problem of
    minimizing the divergence (in terms of the Hellinger Distance) between two distributions of posterior
    probabilities returned by the classifier. One of the distributions is generated from the unlabelled examples and
    the other is generated from a validation set. This latter distribution is defined as a mixture of the
    class-conditional distributions of the posterior probabilities returned for the positive and negative validation
    examples, respectively. The parameters of the mixture thus represent the estimates of the class prevalence values.

    :param classifier: a sklearn's Estimator that generates a binary classifier
    :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out
        validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself), or an
        integer indicating the number of folds (default 5).
    """

    def __init__(self, classifier: BaseEstimator, val_split=5):
        self.classifier = classifier
        self.val_split = val_split

    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
        """
        Trains the aggregation function of HDy, i.e., pre-computes the histograms of the posterior
        probabilities of the positive and of the negative validation examples, for every number of bins.

        :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing,
            as instances, the posterior probabilities issued by the classifier and, as labels, the true labels
        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
            (not used by this method)
        """
        posteriors, labels = classif_predictions.Xy
        positive_posteriors = posteriors[:, self.pos_label]  # keeps only P(y=+1|x)
        self.Pxy1 = positive_posteriors[labels == self.pos_label]
        self.Pxy0 = positive_posteriors[labels == self.neg_label]

        # numbers of bins to try: [10, 20, 30, ..., 100, 110]
        self.bins = np.linspace(10, 110, 11, dtype=int)

        def normalized_histogram(values, n_bins):
            # the density histogram is rescaled so that its bins sum up to 1
            density, _ = np.histogram(values, bins=n_bins, range=(0, 1), density=True)
            return density / density.sum()

        # one histogram per number of bins, for the positive and for the negative examples
        self.Pxy1_density = {n_bins: normalized_histogram(self.Pxy1, n_bins) for n_bins in self.bins}
        self.Pxy0_density = {n_bins: normalized_histogram(self.Pxy0, n_bins) for n_bins in self.bins}

    def aggregate(self, classif_posteriors):
        """
        Estimates the prevalence of the positive class as the median of the estimates obtained for each
        number of bins.

        "In this work, the number of bins b used in HDx and HDy was chosen from 10 to 110 in steps of 10,
        and the final estimated a priori probability was taken as the median of these 11 estimates."
        (González-Castro, et al., 2013).

        :param classif_posteriors: `np.ndarray` with the posterior probabilities issued by the classifier;
            only the column indexed by `pos_label` is used
        :return: the value returned by `F.as_binary_prevalence` for the estimated positive prevalence
        """
        Px = classif_posteriors[:, self.pos_label]  # keeps only P(y=+1|x)

        prev_estimations = []
        for n_bins in self.bins:
            negative_hist = self.Pxy0_density[n_bins]
            positive_hist = self.Pxy1_density[n_bins]
            Px_test, _ = np.histogram(Px, bins=n_bins, range=(0, 1), density=True)

            def mixture_distance(prev):
                # Hellinger distance between the validation mixture for `prev` and the test histogram
                Px_train = prev * positive_hist + (1 - prev) * negative_hist
                return F.HellingerDistance(Px_train, Px_test)

            # the authors proposed to search for the prevalence yielding the best matching as a linear search
            # at small steps (modern implementations resort to an optimization procedure,
            # see class DistributionMatching); the first candidate attaining the smallest distance is kept
            candidates = F.prevalence_linspace(n_prevalences=101, repeats=1, smooth_limits_epsilon=0.0)
            prev_estimations.append(min(candidates, key=mixture_distance))

        class1_prev = np.median(prev_estimations)
        return F.as_binary_prevalence(class1_prev)
|
2020-12-10 19:04:33 +01:00
|
|
|
|
|
|
|
|
|
|
2024-01-10 15:39:27 +01:00
|
|
|
|
class DyS(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
    """
    `DyS framework <https://ojs.aaai.org/index.php/AAAI/article/view/4376>`_ (DyS).
    DyS is a generalization of HDy method, using a Ternary Search in order to find the prevalence that
    minimizes the distance between distributions.
    Details for the ternary search were taken from <https://dl.acm.org/doi/pdf/10.1145/3219819.3220059>

    :param classifier: a sklearn's Estimator that generates a binary classifier
    :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out
        validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself), or an
        integer indicating the number of folds (default 5).
    :param n_bins: an int with the number of bins to use to compute the histograms.
    :param divergence: a str indicating the name of divergence (currently supported ones are "HD" or "topsoe"), or a
        callable function that computes the divergence between two distributions (two equally sized arrays).
    :param tol: a float with the tolerance for the ternary search algorithm.
    :param n_jobs: number of parallel workers (stored as-is; not used by the methods defined in this class).
    """

    def __init__(self, classifier: BaseEstimator, val_split=5, n_bins=8, divergence: Union[str, Callable]= 'HD', tol=1e-05, n_jobs=None):
        self.classifier = classifier
        self.val_split = val_split
        self.n_bins = n_bins
        self.divergence = divergence
        self.tol = tol
        self.n_jobs = n_jobs

    def _ternary_search(self, f, left, right, tol):
        """
        Ternary search for the minimum of a unimodal function f() within [left, right]

        :param f: the function to minimize
        :param left: lower bound of the search interval
        :param right: upper bound of the search interval
        :param tol: the search stops once the interval is narrower than this value
        :return: the midpoint of the final interval
        """
        lower, upper = left, right
        while abs(upper - lower) >= tol:
            third = (upper - lower) / 3
            probe_low = lower + third
            probe_high = upper - third

            # the third of the interval lying beyond the probe with the larger value is discarded
            if f(probe_low) > f(probe_high):
                lower = probe_low
            else:
                upper = probe_high

        # lower and upper are the current bounds; the minimum is between them
        return (lower + upper) / 2

    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
        """
        Trains the aggregation function of DyS, i.e., computes the histograms of the posterior
        probabilities of the positive and of the negative validation examples.

        :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing,
            as instances, the posterior probabilities issued by the classifier and, as labels, the true labels
        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
            (not used by this method)
        :return: self
        """
        posteriors, labels = classif_predictions.Xy
        positive_posteriors = posteriors[:, self.pos_label]  # keeps only P(y=+1|x)
        self.Pxy1 = positive_posteriors[labels == self.pos_label]
        self.Pxy0 = positive_posteriors[labels == self.neg_label]

        histogram_settings = dict(bins=self.n_bins, range=(0, 1), density=True)
        self.Pxy1_density, _ = np.histogram(self.Pxy1, **histogram_settings)
        self.Pxy0_density, _ = np.histogram(self.Pxy0, **histogram_settings)
        return self

    def aggregate(self, classif_posteriors):
        """
        Estimates the prevalence of the positive class as the mixture parameter for which the mixture of
        the validation histograms is closest, according to the divergence, to the histogram of the sample.

        :param classif_posteriors: `np.ndarray` with the posterior probabilities issued by the classifier;
            only the column indexed by `pos_label` is used
        :return: the value returned by `F.as_binary_prevalence` for the estimated positive prevalence
        """
        Px = classif_posteriors[:, self.pos_label]  # keeps only P(y=+1|x)
        Px_test, _ = np.histogram(Px, bins=self.n_bins, range=(0, 1), density=True)
        divergence = get_divergence(self.divergence)

        def distribution_distance(prev):
            # divergence between the validation mixture for `prev` and the histogram of the sample
            mixture = prev * self.Pxy1_density + (1 - prev) * self.Pxy0_density
            return divergence(mixture, Px_test)

        class1_prev = self._ternary_search(f=distribution_distance, left=0, right=1, tol=self.tol)
        return F.as_binary_prevalence(class1_prev)
|
2020-12-15 15:20:35 +01:00
|
|
|
|
|
2022-03-15 14:16:37 +01:00
|
|
|
|
|
2024-01-10 15:39:27 +01:00
|
|
|
|
class SMM(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
    """
    `SMM method <https://ieeexplore.ieee.org/document/9260028>`_ (SMM).
    SMM is a simplification of matching distribution methods where the representation of the examples
    is created using the mean instead of a histogram (conceptually equivalent to PACC).

    :param classifier: a sklearn's Estimator that generates a binary classifier.
    :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out
        validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself), or an
        integer indicating the number of folds (default 5).
    """

    def __init__(self, classifier: BaseEstimator, val_split=5):
        self.classifier = classifier
        self.val_split = val_split

    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
        """
        Trains the aggregation function of SMM, i.e., computes the mean posterior probability of the
        positive class for the positive and for the negative validation examples.

        :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing,
            as instances, the posterior probabilities issued by the classifier and, as labels, the true labels
        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
            (not used by this method)
        :return: self
        """
        Px, y = classif_predictions.Xy
        Px = Px[:, self.pos_label]  # takes only the P(y=+1|x)
        self.Pxy1 = Px[y == self.pos_label]
        self.Pxy0 = Px[y == self.neg_label]
        self.Pxy1_mean = np.mean(self.Pxy1)  # equiv. TPR
        self.Pxy0_mean = np.mean(self.Pxy0)  # equiv. FPR
        return self

    def aggregate(self, classif_posteriors):
        """
        Estimates the prevalence of the positive class by adjusting the mean posterior probability of the
        sample with the class-conditional means computed in `aggregation_fit`.

        If the adjustment is undefined (the two class-conditional means coincide, or any of them is not
        finite, e.g., because the validation data contained no example of one class), the unadjusted mean
        posterior probability is used instead of propagating a nan/inf value.

        :param classif_posteriors: `np.ndarray` with the posterior probabilities issued by the classifier;
            only the column indexed by `pos_label` is used
        :return: the value returned by `F.as_binary_prevalence` (with clipping) for the estimated
            positive prevalence
        """
        Px = classif_posteriors[:, self.pos_label]  # takes only the P(y=+1|x)
        Px_mean = np.mean(Px)

        denominator = self.Pxy1_mean - self.Pxy0_mean
        if denominator == 0 or not np.isfinite(denominator):
            # degenerate case: no adjustment is possible, so the unadjusted estimate is returned
            class1_prev = Px_mean
        else:
            class1_prev = (Px_mean - self.Pxy0_mean) / denominator
        return F.as_binary_prevalence(class1_prev, clip_if_necessary=True)
|
2020-12-15 15:20:35 +01:00
|
|
|
|
|
|
|
|
|
|
2023-11-16 14:29:34 +01:00
|
|
|
|
class DMy(AggregativeSoftQuantifier):
    """
    Generic Distribution Matching quantifier for binary or multiclass quantification based on the space of posterior
    probabilities. This implementation takes the number of bins, the divergence, and the possibility to work on CDF
    as hyperparameters.

    :param classifier: a `sklearn`'s Estimator that generates a probabilistic classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set to model the
        validation distribution.
        This parameter can be indicated as a real value (between 0 and 1), representing a proportion of
        validation data, or as an integer, indicating that the validation distribution should be estimated via
        `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    :param nbins: number of bins used to discretize the distributions (default 8)
    :param divergence: a string representing a divergence measure (currently, "HD" and "topsoe" are implemented)
        or a callable function taking two ndarrays of the same dimension as input (default "HD", meaning Hellinger
        Distance)
    :param cdf: whether to use CDF instead of PDF (default False)
    :param search: name of the search strategy, forwarded as the `method` argument of
        :meth:`quapy.functional.argmin_prevalence` (default 'optim_minimize')
    :param n_jobs: number of parallel workers (default None)
    """

    def __init__(self, classifier, val_split=5, nbins=8, divergence: Union[str, Callable]='HD',
                 cdf=False, search='optim_minimize', n_jobs=None):
        self.classifier = classifier
        self.val_split = val_split
        self.nbins = nbins
        self.divergence = divergence
        self.cdf = cdf
        self.search = search
        self.n_jobs = n_jobs

    def _get_distributions(self, posteriors):
        """
        Discretizes the posterior probabilities into one normalized histogram per channel.

        :param posteriors: array of posterior probabilities, indexed as `[instance, class]`
        :return: array of shape `(channels, nbins)`; each row is a histogram over `[0, 1]` normalized to sum 1
            (or its cumulative sum if `cdf` is True). A row for which no instance was observed is left as
            all zeros instead of becoming NaN.
        """
        post_dims = posteriors.shape[1]
        if post_dims == 2:
            # in binary quantification we can use only one class, since the other one is its complement
            post_dims = 1

        counts = np.vstack([
            np.histogram(posteriors[:, dim], bins=self.nbins, range=(0, 1))[0]
            for dim in range(post_dims)
        ])

        # guard against 0/0 when there are no instances (e.g., a class absent from the validation data)
        totals = counts.sum(axis=1, keepdims=True)
        distributions = np.divide(counts, totals, out=np.zeros(counts.shape, dtype=float), where=totals > 0)

        if self.cdf:
            distributions = np.cumsum(distributions, axis=1)
        return distributions

    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
        """
        Trains the aggregation function of a distribution matching method. This comes down to generating the
        validation distributions out of the training data.
        The validation distributions have shape `(n, ch, nbins)`, with `n` the number of classes, `ch` the number of
        channels, and `nbins` the number of bins. In particular, let `V` be the validation distributions; then `di=V[i]`
        are the distributions obtained from training data labelled with class `i`; while `dij = di[j]` is the discrete
        distribution of posterior probabilities `P(Y=j|X=x)` for training data labelled with class `i`, and `dij[k]`
        is the fraction of instances with a value in the `k`-th bin.

        :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing,
            as instances, the posterior probabilities issued by the classifier and, as labels, the true labels
        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
        :return: self
        """
        posteriors, true_labels = classif_predictions.Xy

        # select the validation items of each class by the class value itself (in the order given by
        # `classes_`) rather than by its position, so labels need not be exactly 0..n-1
        classes = self.classifier.classes_

        self.validation_distribution = qp.util.parallel(
            func=self._get_distributions,
            args=[posteriors[true_labels == cat] for cat in classes],
            n_jobs=self.n_jobs,
            backend='threading'
        )
        return self

    def aggregate(self, posteriors: np.ndarray):
        """
        Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution
        (the mixture) that best matches the test distribution, in terms of the divergence measure of choice.
        In the multiclass case, with `n` the number of classes, the test and mixture distributions contain
        `n` channels (proper distributions of binned posterior probabilities), on which the divergence is computed
        independently. The matching is computed as an average of the divergence across all channels.

        :param posteriors: posterior probabilities of the instances in the sample
        :return: a vector of class prevalence estimates
        """
        test_distribution = self._get_distributions(posteriors)
        divergence = get_divergence(self.divergence)
        n_classes, n_channels, _ = self.validation_distribution.shape

        # one row per class, with all channels concatenated; invariant across evaluations of the loss
        flat_validation = self.validation_distribution.reshape(n_classes, -1)

        def loss(prev):
            prev = np.expand_dims(prev, axis=0)
            mixture_distribution = (prev @ flat_validation).reshape(n_channels, -1)
            divs = [divergence(test_distribution[ch], mixture_distribution[ch]) for ch in range(n_channels)]
            return np.mean(divs)

        return F.argmin_prevalence(loss, n_classes, method=self.search)
|
2023-01-31 15:08:58 +01:00
|
|
|
|
|
|
|
|
|
|
2022-07-11 14:04:28 +02:00
|
|
|
|
|
2023-02-13 12:01:52 +01:00
|
|
|
|
def newELM(svmperf_base=None, loss='01', C=1):
    """
    Explicit Loss Minimization (ELM) quantifiers.
    Quantifiers based on ELM represent a family of methods based on structured output learning;
    these quantifiers rely on classifiers that have been optimized using a quantification-oriented loss
    measure. This implementation relies on
    `Joachims’ SVM perf <https://www.cs.cornell.edu/people/tj/svm_light/svm_perf.html>`_ structured output
    learning algorithm, which has to be installed and patched for the purpose (see this
    `script <https://github.com/HLT-ISTI/QuaPy/blob/master/prepare_svmperf.sh>`_).
    This function is equivalent to:

    >>> CC(SVMperf(svmperf_base, loss, C))

    :param svmperf_base: path to the folder containing the binary files of `SVM perf`; if set to None (default)
        this path will be obtained from qp.environ['SVMPERF_HOME']
    :param loss: the loss to optimize (see :attr:`quapy.classification.svmperf.SVMperf.valid_losses`)
    :param C: trade-off between training error and margin (default 1)
    :return: returns an instance of CC set to work with SVMperf (with loss and C set properly) as the
        underlying classifier
    """
    # fall back to the environment setting when no explicit path is given
    svmperf_home = qp.environ['SVMPERF_HOME'] if svmperf_base is None else svmperf_base
    assert svmperf_home is not None, \
        'param svmperf_base was not specified, and the variable SVMPERF_HOME has not been set in the environment'
    classifier = SVMperf(svmperf_home, loss=loss, C=C)
    return CC(classifier)
|
2020-12-15 15:20:35 +01:00
|
|
|
|
|
|
|
|
|
|
2023-02-13 12:01:52 +01:00
|
|
|
|
def newSVMQ(svmperf_base=None, C=1):
    """
    SVM(Q) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the `Q` loss combining a
    classification-oriented loss and a quantification-oriented loss, as proposed by
    `Barranquero et al. 2015 <https://www.sciencedirect.com/science/article/pii/S003132031400291X>`_.
    Equivalent to:

    >>> CC(SVMperf(svmperf_base, loss='q', C=C))

    Quantifiers based on ELM represent a family of methods based on structured output learning;
    these quantifiers rely on classifiers that have been optimized using a quantification-oriented loss
    measure. This implementation relies on
    `Joachims’ SVM perf <https://www.cs.cornell.edu/people/tj/svm_light/svm_perf.html>`_ structured output
    learning algorithm, which has to be installed and patched for the purpose (see this
    `script <https://github.com/HLT-ISTI/QuaPy/blob/master/prepare_svmperf.sh>`_).
    This function is a thin wrapper around :meth:`newELM` with the loss fixed to 'q'.

    :param svmperf_base: path to the folder containing the binary files of `SVM perf`; if set to None (default)
        this path will be obtained from qp.environ['SVMPERF_HOME']
    :param C: trade-off between training error and margin (default 1)
    :return: returns an instance of CC set to work with SVMperf (with loss and C set properly) as the
        underlying classifier
    """
    quantifier = newELM(svmperf_base, loss='q', C=C)
    return quantifier
|
2022-03-15 14:16:37 +01:00
|
|
|
|
|
2023-02-13 12:01:52 +01:00
|
|
|
|
def newSVMKLD(svmperf_base=None, C=1):
    """
    SVM(KLD) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the Kullback-Leibler Divergence
    as proposed by `Esuli et al. 2015 <https://dl.acm.org/doi/abs/10.1145/2700406>`_.
    Equivalent to:

    >>> CC(SVMperf(svmperf_base, loss='kld', C=C))

    Quantifiers based on ELM represent a family of methods based on structured output learning;
    these quantifiers rely on classifiers that have been optimized using a quantification-oriented loss
    measure. This implementation relies on
    `Joachims’ SVM perf <https://www.cs.cornell.edu/people/tj/svm_light/svm_perf.html>`_ structured output
    learning algorithm, which has to be installed and patched for the purpose (see this
    `script <https://github.com/HLT-ISTI/QuaPy/blob/master/prepare_svmperf.sh>`_).
    This function is a wrapper around CC(SVMperf(svmperf_base, loss, C))

    :param svmperf_base: path to the folder containing the binary files of `SVM perf`; if set to None (default)
        this path will be obtained from qp.environ['SVMPERF_HOME']
    :param C: trade-off between training error and margin (default 1)
    :return: returns an instance of CC set to work with SVMperf (with loss and C set properly) as the
        underlying classifier
    """
    return newELM(svmperf_base, loss='kld', C=C)


# NOTE: this function used to be declared under the name `newSVMKLD` as well, which shadowed the
# definition above and made `newSVMKLD` silently optimize 'nkld' instead of 'kld'.
def newSVMNKLD(svmperf_base=None, C=1):
    """
    SVM(NKLD) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the Kullback-Leibler Divergence
    normalized via the logistic function, as proposed by
    `Esuli et al. 2015 <https://dl.acm.org/doi/abs/10.1145/2700406>`_.
    Equivalent to:

    >>> CC(SVMperf(svmperf_base, loss='nkld', C=C))

    Quantifiers based on ELM represent a family of methods based on structured output learning;
    these quantifiers rely on classifiers that have been optimized using a quantification-oriented loss
    measure. This implementation relies on
    `Joachims’ SVM perf <https://www.cs.cornell.edu/people/tj/svm_light/svm_perf.html>`_ structured output
    learning algorithm, which has to be installed and patched for the purpose (see this
    `script <https://github.com/HLT-ISTI/QuaPy/blob/master/prepare_svmperf.sh>`_).
    This function is a wrapper around CC(SVMperf(svmperf_base, loss, C))

    :param svmperf_base: path to the folder containing the binary files of `SVM perf`; if set to None (default)
        this path will be obtained from qp.environ['SVMPERF_HOME']
    :param C: trade-off between training error and margin (default 1)
    :return: returns an instance of CC set to work with SVMperf (with loss and C set properly) as the
        underlying classifier
    """
    return newELM(svmperf_base, loss='nkld', C=C)
|
2020-12-15 15:20:35 +01:00
|
|
|
|
|
2023-02-13 12:01:52 +01:00
|
|
|
|
def newSVMAE(svmperf_base=None, C=1):
    """
    SVM(AE) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the Absolute Error as first used by
    `Moreo and Sebastiani, 2021 <https://arxiv.org/abs/2011.02552>`_.
    Equivalent to:

    >>> CC(SVMperf(svmperf_base, loss='mae', C=C))

    Quantifiers based on ELM represent a family of methods based on structured output learning;
    these quantifiers rely on classifiers that have been optimized using a quantification-oriented loss
    measure. This implementation relies on
    `Joachims’ SVM perf <https://www.cs.cornell.edu/people/tj/svm_light/svm_perf.html>`_ structured output
    learning algorithm, which has to be installed and patched for the purpose (see this
    `script <https://github.com/HLT-ISTI/QuaPy/blob/master/prepare_svmperf.sh>`_).
    This function is a thin wrapper around :meth:`newELM` with the loss fixed to 'mae'.

    :param svmperf_base: path to the folder containing the binary files of `SVM perf`; if set to None (default)
        this path will be obtained from qp.environ['SVMPERF_HOME']
    :param C: trade-off between training error and margin (default 1)
    :return: returns an instance of CC set to work with SVMperf (with loss and C set properly) as the
        underlying classifier
    """
    quantifier = newELM(svmperf_base, loss='mae', C=C)
    return quantifier
|
2022-03-15 14:16:37 +01:00
|
|
|
|
|
2023-02-13 19:27:48 +01:00
|
|
|
|
def newSVMRAE(svmperf_base=None, C=1):
    """
    SVM(RAE) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the Relative Absolute Error as
    first used by `Moreo and Sebastiani, 2021 <https://arxiv.org/abs/2011.02552>`_.
    Equivalent to:

    >>> CC(SVMperf(svmperf_base, loss='mrae', C=C))

    Quantifiers based on ELM represent a family of methods based on structured output learning;
    these quantifiers rely on classifiers that have been optimized using a quantification-oriented loss
    measure. This implementation relies on
    `Joachims’ SVM perf <https://www.cs.cornell.edu/people/tj/svm_light/svm_perf.html>`_ structured output
    learning algorithm, which has to be installed and patched for the purpose (see this
    `script <https://github.com/HLT-ISTI/QuaPy/blob/master/prepare_svmperf.sh>`_).
    This function is a thin wrapper around :meth:`newELM` with the loss fixed to 'mrae'.

    :param svmperf_base: path to the folder containing the binary files of `SVM perf`; if set to None (default)
        this path will be obtained from qp.environ['SVMPERF_HOME']
    :param C: trade-off between training error and margin (default 1)
    :return: returns an instance of CC set to work with SVMperf (with loss and C set properly) as the
        underlying classifier
    """
    quantifier = newELM(svmperf_base, loss='mrae', C=C)
    return quantifier
|
2020-12-15 15:20:35 +01:00
|
|
|
|
|
|
|
|
|
|
2023-02-09 19:39:16 +01:00
|
|
|
|
class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier):
    """
    Allows any binary quantifier to perform quantification on single-label datasets.
    The method maintains one binary quantifier for each class, and then l1-normalizes the outputs so that the
    class prevalences sum up to 1.
    This variant was used, along with the :class:`EMQ` quantifier, in
    `Gao and Sebastiani, 2016 <https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf>`_.

    :param binary_quantifier: a quantifier (binary) that will be employed to work on multiclass model in a
        one-vs-all manner
    :param n_jobs: number of parallel workers
    :param parallel_backend: the parallel backend for joblib (default "multiprocessing"); choosing a different
        backend is helpful for some quantifiers (e.g., ELM-based ones) that cannot be run with multiprocessing,
        since the temp dir they create during fit will be removed and no longer available at predict time.
    """

    def __init__(self, binary_quantifier, n_jobs=None, parallel_backend='multiprocessing'):
        # the messages must refer to the argument: `self.binary_quantifier` is not yet assigned at this
        # point, and reading it would raise AttributeError instead of the intended AssertionError
        assert isinstance(binary_quantifier, BaseQuantifier), \
            f'{binary_quantifier} does not seem to be a Quantifier'
        assert isinstance(binary_quantifier, AggregativeQuantifier), \
            f'{binary_quantifier} does not seem to be of type Aggregative'
        self.binary_quantifier = binary_quantifier
        self.n_jobs = qp._get_njobs(n_jobs)
        self.parallel_backend = parallel_backend

    def classify(self, instances):
        """
        If the base quantifier is not probabilistic, returns a matrix of shape `(n,m)` with `n` the number of
        instances and `m` the number of classes. The entry `(i,j)` is a binary value indicating whether instance
        `i` belongs to class `j`. The binary classifications are independent of each other, meaning that an instance
        can end up being attributed to 0, 1, or more classes.
        If the base quantifier is probabilistic, returns a matrix of shape `(n,m,2)` with `n` the number of instances
        and `m` the number of classes. The entry `(i,j,1)` (resp. `(i,j,0)`) is a value in [0,1] indicating the
        posterior probability that instance `i` belongs (resp. does not belong) to class `j`. The posterior
        probabilities are independent of each other, meaning that, in general, they do not sum up to one.

        :param instances: array-like
        :return: `np.ndarray`
        """
        classif_predictions = self._parallel(self._delayed_binary_classification, instances)
        if isinstance(self.binary_quantifier, AggregativeSoftQuantifier):
            # swap the first two axes so that instances are indexed first
            return np.swapaxes(classif_predictions, 0, 1)
        else:
            return classif_predictions.T

    def aggregate(self, classif_predictions):
        """
        Aggregates the per-class binary predictions into prevalence estimates, one per class, and normalizes
        the resulting vector via `F.normalize_prevalence`.

        :param classif_predictions: the predictions as returned by :meth:`classify`
        :return: the normalized vector of class prevalence estimates
        """
        prevalences = self._parallel(self._delayed_binary_aggregate, classif_predictions)
        return F.normalize_prevalence(prevalences)

    def _delayed_binary_classification(self, c, X):
        # classification issued by the binary quantifier registered for class `c`
        return self.dict_binary_quantifiers[c].classify(X)

    def _delayed_binary_aggregate(self, c, classif_predictions):
        # the estimation for the positive class prevalence
        return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1]
|
2020-12-03 18:12:28 +01:00
|
|
|
|
|
2023-11-09 18:13:54 +01:00
|
|
|
|
|
2023-11-15 10:55:13 +01:00
|
|
|
|
class AggregativeMedianEstimator(BinaryQuantifier):
    """
    This method is a meta-quantifier that returns, as the estimated class prevalence values, the median of the
    estimation returned by differently (hyper)parameterized base quantifiers.
    The median of unit-vectors is only guaranteed to be a unit-vector for n=2 dimensions,
    i.e., in cases of binary quantification.

    :param base_quantifier: the base, binary quantifier
    :param param_grid: the grid of parameters towards which the median will be computed
    :param random_state: a seed to be set before fitting any base quantifier (default None)
    :param n_jobs: number of parallel workers
    """

    def __init__(self, base_quantifier: AggregativeQuantifier, param_grid: dict, random_state=None, n_jobs=None):
        self.base_quantifier = base_quantifier
        self.param_grid = param_grid
        self.random_state = random_state
        self.n_jobs = qp._get_njobs(n_jobs)

    def get_params(self, deep=True):
        """
        Returns the hyperparameters of the base quantifier.

        :param deep: forwarded to the `get_params` method of the base quantifier
        :return: whatever the base quantifier returns
        """
        return self.base_quantifier.get_params(deep)

    def set_params(self, **params):
        """
        Sets the hyperparameters of the base quantifier.

        :param params: forwarded to the `set_params` method of the base quantifier
        :return: self
        """
        self.base_quantifier.set_params(**params)
        return self

    def _delayed_fit(self, args):
        # fits an independent copy of the base quantifier with one configuration of hyperparameters
        with qp.util.temp_seed(self.random_state):
            params, training = args
            model = deepcopy(self.base_quantifier)
            model.set_params(**params)
            model.fit(training)
            return model

    def _delayed_fit_classifier(self, args):
        # fits the classifier of an independent copy of the base quantifier with one configuration of
        # classifier hyperparameters; returns the copy along with the predictions it generated
        with qp.util.temp_seed(self.random_state):
            cls_params, training, kwargs = args
            model = deepcopy(self.base_quantifier)
            model.set_params(**cls_params)
            predictions = model.classifier_fit_predict(training, **kwargs)
            return (model, predictions)

    def _delayed_fit_aggregation(self, args):
        # fits the aggregation function of a copy of an already processed model, with one configuration
        # of quantifier hyperparameters, reusing the predictions computed for that model
        with qp.util.temp_seed(self.random_state):
            ((model, predictions), q_params), training = args
            model = deepcopy(model)
            model.set_params(**q_params)
            model.aggregation_fit(predictions, training)
            return model

    def fit(self, training: LabelledCollection, **kwargs):
        """
        Fits one model for each configuration of hyperparameters in `param_grid`. For aggregative base
        quantifiers, the classifier is processed once per configuration of classifier hyperparameters, and
        its predictions are reused across all configurations of quantifier hyperparameters.

        :param training: a :class:`quapy.data.base.LabelledCollection` with the (binary) training data
        :param kwargs: additional arguments forwarded to `classifier_fit_predict` (aggregative case only)
        :return: self
        """
        import itertools

        self._check_binary(training, self.__class__.__name__)

        if isinstance(self.base_quantifier, AggregativeQuantifier):
            cls_configs, q_configs = qp.model_selection.group_params(self.param_grid)

            if len(cls_configs) > 1:
                models_preds = qp.util.parallel(
                    self._delayed_fit_classifier,
                    ((params, training, kwargs) for params in cls_configs),
                    seed=qp.environ.get('_R_SEED', None),
                    n_jobs=self.n_jobs,
                    asarray=False,
                    backend='threading'
                )
            else:
                # single classifier configuration: no parallel call is needed; note that this branch
                # operates on the base quantifier itself (not on a copy)
                model = self.base_quantifier
                model.set_params(**cls_configs[0])
                predictions = model.classifier_fit_predict(training, **kwargs)
                models_preds = [(model, predictions)]

            self.models = qp.util.parallel(
                self._delayed_fit_aggregation,
                ((setup, training) for setup in itertools.product(models_preds, q_configs)),
                seed=qp.environ.get('_R_SEED', None),
                n_jobs=self.n_jobs,
                backend='threading'
            )
        else:
            configs = qp.model_selection.expand_grid(self.param_grid)
            self.models = qp.util.parallel(
                self._delayed_fit,
                ((params, training) for params in configs),
                seed=qp.environ.get('_R_SEED', None),
                n_jobs=self.n_jobs,
                backend='threading'
            )
        return self

    def _delayed_predict(self, args):
        # prevalence estimates issued by one of the fitted models
        model, instances = args
        return model.quantify(instances)

    def quantify(self, instances):
        """
        Returns the element-wise median of the prevalence estimates issued by all the fitted models.

        :param instances: the sample to quantify
        :return: the median (along the models) of the prevalence estimates
        """
        prev_preds = qp.util.parallel(
            self._delayed_predict,
            ((model, instances) for model in self.models),
            seed=qp.environ.get('_R_SEED', None),
            n_jobs=self.n_jobs,
            backend='threading'
        )
        return np.median(prev_preds, axis=0)
|
|
|
|
|
|
2024-01-25 14:33:41 +01:00
|
|
|
|
|
|
|
|
|
#---------------------------------------------------------------
# imports
#---------------------------------------------------------------

# quantifiers implemented in the sibling module `_threshold_optim`, re-exported under their short names
from . import _threshold_optim

T50, MAX, X = _threshold_optim.T50, _threshold_optim.MAX, _threshold_optim.X
MS, MS2 = _threshold_optim.MS, _threshold_optim.MS2

# quantifiers implemented in the sibling module `_kdey`, re-exported under their short names
from . import _kdey

KDEyML, KDEyHD, KDEyCS = _kdey.KDEyML, _kdey.KDEyHD, _kdey.KDEyCS
|
|
|
|
|
|
2023-11-09 18:13:54 +01:00
|
|
|
|
#---------------------------------------------------------------
# aliases
#---------------------------------------------------------------

# long, descriptive names bound to the same classes as the acronyms
ClassifyAndCount, AdjustedClassifyAndCount = CC, ACC
ProbabilisticClassifyAndCount, ProbabilisticAdjustedClassifyAndCount = PCC, PACC
ExpectationMaximizationQuantifier = SLD = EMQ
DistributionMatchingY = DMy
HellingerDistanceY = HDy
MedianSweep, MedianSweep2 = MS, MS2
|