2020-12-10 19:04:33 +01:00
|
|
|
|
from abc import abstractmethod
|
2021-01-15 18:32:32 +01:00
|
|
|
|
from copy import deepcopy
|
2022-07-11 12:21:49 +02:00
|
|
|
|
from typing import Callable, Union
|
2021-01-15 18:32:32 +01:00
|
|
|
|
import numpy as np
|
|
|
|
|
from joblib import Parallel, delayed
|
|
|
|
|
from sklearn.base import BaseEstimator
|
|
|
|
|
from sklearn.calibration import CalibratedClassifierCV
|
|
|
|
|
from sklearn.metrics import confusion_matrix
|
2022-05-25 19:14:33 +02:00
|
|
|
|
from sklearn.model_selection import StratifiedKFold, cross_val_predict
|
2021-01-11 12:55:06 +01:00
|
|
|
|
from tqdm import tqdm
|
2021-01-18 10:53:22 +01:00
|
|
|
|
import quapy as qp
|
2021-01-15 18:32:32 +01:00
|
|
|
|
import quapy.functional as F
|
|
|
|
|
from quapy.classification.svmperf import SVMperf
|
|
|
|
|
from quapy.data import LabelledCollection
|
|
|
|
|
from quapy.method.base import BaseQuantifier, BinaryQuantifier
|
|
|
|
|
|
2020-12-03 18:12:28 +01:00
|
|
|
|
|
|
|
|
|
# Abstract classes
|
|
|
|
|
# ------------------------------------
|
|
|
|
|
|
|
|
|
|
class AggregativeQuantifier(BaseQuantifier):
    """
    Abstract base for quantification methods whose estimates come from aggregating the outputs of a classifier.
    An aggregative quantifier exposes a :meth:`classify` method and keeps the underlying classifier in the
    :attr:`learner` attribute. Concrete subclasses have to provide :meth:`fit` and :meth:`aggregate`; the latter
    turns label predictions into class prevalence estimates. :meth:`quantify` is already implemented here as
    the composition of :meth:`classify` and :meth:`aggregate`.
    """

    @abstractmethod
    def fit(self, data: LabelledCollection, fit_learner=True):
        """
        Trains the aggregative quantifier

        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
        :param fit_learner: whether or not to train the learner (default is True). Set to False if the
            learner has been trained outside the quantifier.
        :return: self
        """
        ...

    @property
    def learner(self):
        """
        The classifier wrapped by this quantifier (stored internally as `learner_`)

        :return: the classifier (typically an sklearn's Estimator)
        """
        return self.learner_

    @learner.setter
    def learner(self, classifier):
        """
        Replaces the classifier wrapped by this quantifier

        :param classifier: the classifier
        """
        self.learner_ = classifier

    def classify(self, instances):
        """
        Returns the label predictions of the classifier for the given instances. The output must follow the format
        that :meth:`aggregate` expects, i.e., posterior probabilities for probabilistic quantifiers, or crisp
        predictions for non-probabilistic quantifiers. This default implementation calls the learner's `predict`.

        :param instances: array-like
        :return: np.ndarray of shape `(n_instances,)` with label predictions
        """
        classifier = self.learner
        return classifier.predict(instances)

    def quantify(self, instances):
        """
        Estimates the class prevalence values of a sample by first classifying its instances and then
        aggregating the resulting predictions.

        :param instances: array-like
        :return: `np.ndarray` of shape `(self.n_classes_,)` with class prevalence estimates.
        """
        return self.aggregate(self.classify(instances))

    @abstractmethod
    def aggregate(self, classif_predictions: np.ndarray):
        """
        Turns label predictions into class prevalence estimates.

        :param classif_predictions: `np.ndarray` of label predictions
        :return: `np.ndarray` of shape `(self.n_classes_,)` with class prevalence estimates.
        """
        ...

    def get_params(self, deep=True):
        """
        Returns the parameters of the quantifier, which are those reported by the learner.

        :param deep: for compatibility with sklearn (accepted but not forwarded to the learner)
        :return: a dictionary of param-value pairs
        """
        classifier = self.learner
        return classifier.get_params()

    def set_params(self, **parameters):
        """
        Sets the parameters of the quantifier by delegating to the learner.

        :param parameters: dictionary of param-value pairs
        """
        classifier = self.learner
        classifier.set_params(**parameters)

    @property
    def classes_(self):
        """
        Class labels, in the same order in which class prevalence values are to be computed.
        By default these are simply the `classes_` exposed by the learner.

        :return: array-like
        """
        classifier = self.learner
        return classifier.classes_
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
    """
    Abstract class for quantification methods that base their estimations on the aggregation of posterior
    probabilities as returned by a probabilistic classifier. Aggregative Probabilistic Quantifiers thus extend
    Aggregative Quantifiers by making :meth:`classify` return the output of the learner's `predict_proba`,
    i.e., values in [0,1] -- the posterior probabilities.
    """

    def classify(self, instances):
        """
        Provides the posterior probabilities for the given instances.

        :param instances: array-like
        :return: the output of the learner's `predict_proba` for `instances`
        """
        return self.learner.predict_proba(instances)

    def set_params(self, **parameters):
        """
        Set the parameters of the quantifier. If the learner is a `CalibratedClassifierCV`, the parameters are
        redirected to the wrapped estimator by prefixing their names.

        :param parameters: dictionary of param-value pairs
        """
        if isinstance(self.learner, CalibratedClassifierCV):
            # scikit-learn renamed the wrapped-estimator parameter from `base_estimator` to `estimator`
            # (1.2) and later dropped the old name (1.4); pick whichever this installation actually uses
            wrapper_params = self.learner.get_params(deep=False)
            if wrapper_params.get('estimator') is not None or 'base_estimator' not in wrapper_params:
                prefix = 'estimator__'
            else:
                prefix = 'base_estimator__'
            parameters = {prefix + k: v for k, v in parameters.items()}
        self.learner.set_params(**parameters)
|
|
|
|
|
|
2020-12-22 17:43:23 +01:00
|
|
|
|
|
2020-12-03 18:12:28 +01:00
|
|
|
|
# Helper
|
|
|
|
|
# ------------------------------------
|
2022-05-25 19:14:33 +02:00
|
|
|
|
def _ensure_probabilistic(learner):
|
|
|
|
|
if not hasattr(learner, 'predict_proba'):
|
|
|
|
|
print(f'The learner {learner.__class__.__name__} does not seem to be probabilistic. '
|
|
|
|
|
f'The learner will be calibrated.')
|
|
|
|
|
learner = CalibratedClassifierCV(learner, cv=5)
|
|
|
|
|
return learner
|
|
|
|
|
|
|
|
|
|
|
2021-12-15 15:27:43 +01:00
|
|
|
|
def _training_helper(learner,
                     data: LabelledCollection,
                     fit_learner: bool = True,
                     ensure_probabilistic=False,
                     val_split: Union[LabelledCollection, float] = None):
    """
    Training routine shared by all Aggregative Quantifiers.

    :param learner: the learner to be fit
    :param data: the data on which to fit the learner. If requested, the data will be split before fitting the learner.
    :param fit_learner: whether or not to fit the learner (if False, the learner is returned untouched)
    :param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the
        learner is not probabilistic, then a CalibratedCV instance of it is trained)
    :param val_split: if specified as a float, indicates the proportion of training instances that will define the
        validation split (e.g., 0.3 for using 30% of the training set as validation data); if specified as a
        LabelledCollection, represents the validation split itself
    :return: a tuple with the learner and the held-out data to be used as a validation set for any subsequent
        parameter fitting. The held-out data is a LabelledCollection when `val_split` is a LabelledCollection,
        or when the learner is fit and `val_split` is a float; it is None otherwise.
    :raises ValueError: if the learner is to be fit and `val_split` is a float outside (0,1), or is neither None,
        a float, nor a LabelledCollection
    :raises AssertionError: if `fit_learner` is False but a probabilistic learner is required and the given one
        does not implement predict_proba
    """
    if not fit_learner:
        # nothing is trained: only check the learner and hand back an explicitly given validation set
        if ensure_probabilistic and not hasattr(learner, 'predict_proba'):
            raise AssertionError('error: the learner cannot be calibrated since fit_learner is set to False')
        held_out = val_split if isinstance(val_split, LabelledCollection) else None
        return learner, held_out

    if ensure_probabilistic:
        learner = _ensure_probabilistic(learner)

    # decide which part of the data is used for training and which part is held out
    if val_split is None:
        train, held_out = data, None
    elif isinstance(val_split, float):
        if not (0 < val_split < 1):
            raise ValueError(f'train/val split {val_split} out of range, must be in (0,1)')
        train, held_out = data.split_stratified(train_prop=1 - val_split)
    elif isinstance(val_split, LabelledCollection):
        train, held_out = data, val_split
    else:
        raise ValueError(
            f'param "val_split" ({type(val_split)}) not understood; use either a float indicating the split '
            'proportion, or a LabelledCollection indicating the validation split')

    # quantifiers are fit on the collection itself, anything else on its (X, y) pair
    if isinstance(learner, BaseQuantifier):
        learner.fit(train)
    else:
        learner.fit(*train.Xy)

    return learner, held_out
|
|
|
|
|
|
|
|
|
|
|
2022-05-25 19:14:33 +02:00
|
|
|
|
def cross_generate_predictions(
        data,
        learner,
        val_split,
        probabilistic,
        fit_learner,
        n_jobs
):
    """
    Trains the learner and generates the predictions from which the adjustment parameters are estimated.

    If `val_split` is an int, it is taken as the number of folds: predictions for all of `data` are obtained with
    `cross_val_predict` and the learner is then fit on all of `data`. Otherwise, the learner is handled by
    `_training_helper` and the predictions are those issued for the validation data it returns.

    :param data: the training data; its `Xy`, `y`, `classes_` and `counts()` are used in the cross-validation case
    :param learner: the classifier
    :param val_split: an int (number of folds), or any value accepted by `_training_helper` as `val_split`
    :param probabilistic: if True, posterior probabilities (`predict_proba`) are generated instead of
        crisp predictions (`predict`), and the learner is made probabilistic if needed
    :param fit_learner: whether or not to fit the learner; must be True when `val_split` is an int
    :param n_jobs: number of parallel workers (resolved through `qp.get_njobs`)
    :return: a tuple `(learner, y, y_pred, classes, class_count)` with the learner, the true labels, the
        predictions, the classes, and the class counts of the data the predictions refer to
    """
    n_jobs = qp.get_njobs(n_jobs)

    if not isinstance(val_split, int):
        # held-out validation: predictions are issued only for the validation data
        learner, val_data = _training_helper(
            learner, data, fit_learner, ensure_probabilistic=probabilistic, val_split=val_split
        )
        if probabilistic:
            y_pred = learner.predict_proba(val_data.instances)
        else:
            y_pred = learner.predict(val_data.instances)
        y = val_data.labels
        classes = val_data.classes_
        class_count = val_data.counts()
        return learner, y, y_pred, classes, class_count

    # k-fold cross-validation: every training instance gets an out-of-fold prediction
    assert fit_learner == True, \
        'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False'

    if probabilistic:
        learner = _ensure_probabilistic(learner)
    predict = 'predict_proba' if probabilistic else 'predict'

    y_pred = cross_val_predict(learner, *data.Xy, cv=val_split, n_jobs=n_jobs, method=predict)
    class_count = data.counts()

    # fit the learner on all data
    learner.fit(*data.Xy)
    y = data.y
    classes = data.classes_

    return learner, y, y_pred, classes, class_count
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def cross_generate_predictions_depr(
        data,
        learner,
        val_split,
        probabilistic,
        fit_learner,
        method_name=''
):
    """
    Earlier variant of `cross_generate_predictions` that runs the k-fold loop by hand.

    If `val_split` is an int, it is taken as the number of folds of a `StratifiedKFold`: the learner is trained on
    each training fold, the predictions for each validation fold are collected, and the learner is finally trained
    on all of `data`. Otherwise, the learner is handled by `_training_helper` and the predictions are those issued
    for the validation data it returns.

    :param data: the training data
    :param learner: the classifier
    :param val_split: an int (number of folds), or any value accepted by `_training_helper` as `val_split`
    :param probabilistic: if True, posterior probabilities (`predict_proba`) are generated instead of
        crisp predictions (`predict`)
    :param fit_learner: whether or not to fit the learner; must be True when `val_split` is an int
    :param method_name: a label shown in the progress bar of the k-fold loop
    :return: a tuple `(learner, y, y_, classes, class_count)` with the learner, the true labels, the
        predictions, the classes, and the class counts of the data the predictions refer to
    """

    def predict(fitted_learner, instances):
        # The prediction method is looked up on the learner returned by _training_helper, not on the one
        # received as argument: a non-probabilistic learner gets wrapped in a calibrated classifier during
        # training, so binding `learner.predict_proba` beforehand would fail or point to a stale object.
        if probabilistic:
            return fitted_learner.predict_proba(instances)
        return fitted_learner.predict(instances)

    if isinstance(val_split, int):
        assert fit_learner == True, \
            'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False'
        # kFCV estimation of parameters
        y, y_ = [], []
        kfcv = StratifiedKFold(n_splits=val_split)
        pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
        for k, (training_idx, validation_idx) in enumerate(pbar):
            pbar.set_description(f'{method_name}\tfitting fold {k}')
            training = data.sampling_from_index(training_idx)
            validation = data.sampling_from_index(validation_idx)
            learner, val_data = _training_helper(
                learner, training, fit_learner, ensure_probabilistic=probabilistic, val_split=validation
            )
            y_.append(predict(learner, val_data.instances))
            y.append(val_data.labels)

        y = np.concatenate(y)
        y_ = np.concatenate(y_)
        class_count = data.counts()

        # fit the learner on all data
        learner, _ = _training_helper(
            learner, data, fit_learner, ensure_probabilistic=probabilistic, val_split=None
        )
        classes = data.classes_

    else:
        learner, val_data = _training_helper(
            learner, data, fit_learner, ensure_probabilistic=probabilistic, val_split=val_split
        )
        y_ = predict(learner, val_data.instances)
        y = val_data.labels
        classes = val_data.classes_
        class_count = val_data.counts()

    return learner, y, y_, classes, class_count
|
|
|
|
|
|
2020-12-03 18:12:28 +01:00
|
|
|
|
# Methods
|
|
|
|
|
# ------------------------------------
|
2021-01-07 17:58:48 +01:00
|
|
|
|
class CC(AggregativeQuantifier):
    """
    Classify & Count, the simplest quantification method: every instance is classified and the class prevalence
    estimates are the proportions of instances assigned to each of the classes.

    :param learner: a sklearn's Estimator that generates a classifier
    """

    def __init__(self, learner: BaseEstimator):
        self.learner = learner

    def fit(self, data: LabelledCollection, fit_learner=True):
        """
        Trains the Classify & Count method. When `fit_learner` is False the classifier is taken to be
        already trained, and nothing else needs to be done.

        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
        :param fit_learner: if False, the classifier is assumed to be fit
        :return: self
        """
        trained_learner, _ = _training_helper(self.learner, data, fit_learner)
        self.learner = trained_learner
        return self

    def aggregate(self, classif_predictions: np.ndarray):
        """
        Estimates the class prevalence values as the relative frequency of each predicted label.

        :param classif_predictions: array-like with label predictions
        :return: `np.ndarray` of shape `(self.n_classes_,)` with class prevalence estimates.
        """
        prevalence = F.prevalence_from_labels(classif_predictions, self.classes_)
        return prevalence
|
2020-12-03 18:12:28 +01:00
|
|
|
|
|
|
|
|
|
|
2021-01-07 17:58:48 +01:00
|
|
|
|
class ACC(AggregativeQuantifier):
    """
    `Adjusted Classify & Count <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_,
    the "adjusted" variant of :class:`CC`, that corrects the predictions of CC
    according to the `misclassification rates`.

    :param learner: a sklearn's Estimator that generates a classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
        misclassification rates are to be estimated.
        This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
        validation data, or as an integer, indicating that the misclassification rates should be estimated via
        `k`-fold cross validation (this integer stands for the number of folds `k`), or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    :param n_jobs: number of parallel workers
    """

    def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=None):
        self.learner = learner
        self.val_split = val_split
        self.n_jobs = qp.get_njobs(n_jobs)

    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
        """
        Trains a ACC quantifier.

        :param data: the training set
        :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
        :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
            validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
            indicating the validation set itself, or an int indicating the number `k` of folds to be used in `k`-fold
            cross validation to estimate the parameters. If None, the value given at construction time is used.
        :return: self
        """
        if val_split is None:
            val_split = self.val_split

        self.learner, y, y_, classes, class_count = cross_generate_predictions(
            data, self.learner, val_split, probabilistic=False, fit_learner=fit_learner, n_jobs=self.n_jobs
        )

        # the internal CC shares the trained learner and provides the unadjusted estimates
        self.cc = CC(self.learner)
        self.Pte_cond_estim_ = self.getPteCondEstim(data.classes_, y, y_)

        return self

    @classmethod
    def getPteCondEstim(cls, classes, y, y_):
        """
        Estimates the misclassification rates from true and predicted labels.

        :param classes: the class labels, which fix the order of rows and columns
        :param y: array-like with the true labels
        :param y_: array-like with the predicted labels
        :return: a `np.ndarray` of shape `(n_classes, n_classes)` with entry `(i,j)` being the estimate of
            P(yi|yj), that is, the probability that a document that belongs to yj ends up being classified as
            belonging to yi. The column of a class with no true examples has a 1 in its diagonal entry.
        """
        # `np.float` was only an alias of the builtin `float` and no longer exists in recent NumPy releases
        conf = confusion_matrix(y, y_, labels=classes).T.astype(float)
        class_counts = conf.sum(axis=0)
        for i, count in enumerate(class_counts):
            if count == 0:
                # no true examples of this class: the whole column is 0, so only the diagonal entry is set
                conf[i, i] = 1
            else:
                conf[:, i] /= count
        return conf

    def classify(self, data):
        """
        Provides the label predictions for the given instances, as issued by the internal :class:`CC`.

        :param data: array-like with the instances
        :return: the label predictions
        """
        return self.cc.classify(data)

    def aggregate(self, classif_predictions):
        """
        Computes the CC estimates from the label predictions and adjusts them with the misclassification rates
        estimated in :meth:`fit`.

        :param classif_predictions: array-like with label predictions
        :return: the adjusted class prevalence estimates
        """
        prevs_estim = self.cc.aggregate(classif_predictions)
        return ACC.solve_adjustment(self.Pte_cond_estim_, prevs_estim)

    @classmethod
    def solve_adjustment(cls, PteCondEstim, prevs_estim):
        """
        Solves the linear system :math:`Ax = B` with :math:`A` = `PteCondEstim` and :math:`B` = `prevs_estim`

        :param PteCondEstim: a `np.ndarray` of shape `(n_classes,n_classes,)` with entry `(i,j)` being the estimate
            of :math:`P(y_i|y_j)`, that is, the probability that an instance that belongs to :math:`y_j` ends up being
            classified as belonging to :math:`y_i`
        :param prevs_estim: a `np.ndarray` of shape `(n_classes,)` with the class prevalence estimates
        :return: an adjusted `np.ndarray` of shape `(n_classes,)` with the corrected class prevalence estimates;
            if the system cannot be solved (`np.linalg.LinAlgError`), `prevs_estim` is returned unchanged
        """
        A = PteCondEstim
        B = prevs_estim
        try:
            adjusted_prevs = np.linalg.solve(A, B)
            # the solution may fall outside [0,1]; clip it and renormalize so that it sums up to 1
            adjusted_prevs = np.clip(adjusted_prevs, 0, 1)
            adjusted_prevs /= adjusted_prevs.sum()
        except np.linalg.LinAlgError:
            adjusted_prevs = prevs_estim  # no way to adjust them!
        return adjusted_prevs
|
|
|
|
|
|
|
|
|
|
|
2021-01-07 17:58:48 +01:00
|
|
|
|
class PCC(AggregativeProbabilisticQuantifier):
    """
    `Probabilistic Classify & Count <https://ieeexplore.ieee.org/abstract/document/5694031>`_,
    the probabilistic counterpart of CC, which works on the posterior probabilities returned by a probabilistic
    classifier.

    :param learner: a sklearn's Estimator that generates a classifier
    """

    def __init__(self, learner: BaseEstimator):
        self.learner = learner

    def fit(self, data: LabelledCollection, fit_learner=True):
        """
        Trains the PCC quantifier, requiring the learner to be probabilistic.

        :param data: the training set
        :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
        :return: self
        """
        trained_learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
        self.learner = trained_learner
        return self

    def aggregate(self, classif_posteriors):
        """
        Computes the class prevalence estimates from the posterior probabilities (without binarizing them).

        :param classif_posteriors: the posterior probabilities returned by the classifier
        :return: the class prevalence estimates
        """
        prevalence = F.prevalence_from_probabilities(classif_posteriors, binarize=False)
        return prevalence
|
2020-12-03 18:12:28 +01:00
|
|
|
|
|
|
|
|
|
|
2021-01-07 17:58:48 +01:00
|
|
|
|
class PACC(AggregativeProbabilisticQuantifier):
    """
    `Probabilistic Adjusted Classify & Count <https://ieeexplore.ieee.org/abstract/document/5694031>`_,
    the probabilistic counterpart of ACC, which works on the posterior probabilities returned by a probabilistic
    classifier.

    :param learner: a sklearn's Estimator that generates a classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
        misclassification rates are to be estimated.
        This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
        validation data, or as an integer, indicating that the misclassification rates should be estimated via
        `k`-fold cross validation (this integer stands for the number of folds `k`), or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    :param n_jobs: number of parallel workers
    """

    def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=None):
        self.learner = learner
        self.val_split = val_split
        self.n_jobs = qp.get_njobs(n_jobs)

    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
        """
        Trains a PACC quantifier.

        :param data: the training set
        :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
        :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
            validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
            indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
            to estimate the parameters. If None, the value given at construction time is used.
        :return: self
        """
        split = self.val_split if val_split is None else val_split

        self.learner, y, y_, classes, class_count = cross_generate_predictions(
            data, self.learner, split, probabilistic=True, fit_learner=fit_learner, n_jobs=self.n_jobs
        )

        # the internal PCC shares the trained learner and provides the unadjusted estimates
        self.pcc = PCC(self.learner)
        self.Pte_cond_estim_ = self.getPteCondEstim(classes, y, y_)

        return self

    @classmethod
    def getPteCondEstim(cls, classes, y, y_):
        """
        Estimates the (soft) misclassification rates from true labels and posterior probabilities.

        :param classes: the class labels, which fix the order of rows and columns
        :param y: `np.ndarray` with the true labels
        :param y_: `np.ndarray` with the posterior probabilities, one row per instance
        :return: a `np.ndarray` of shape `(n_classes, n_classes)` with entry (i,j) being the estimate of
            P(yi|yj), that is, the probability that a document that belongs to yj ends up being classified as
            belonging to yi. A class with no examples in `y` keeps the corresponding identity entries.
        """
        # rows are first filled per true class (identity by default) and the matrix is transposed at the end
        per_true_class = np.eye(len(classes))
        for row, label in enumerate(classes):
            belongs = y == label
            if belongs.any():
                per_true_class[row] = y_[belongs].mean(axis=0)

        return per_true_class.T

    def aggregate(self, classif_posteriors):
        """
        Computes the PCC estimates from the posterior probabilities and adjusts them with the misclassification
        rates estimated in :meth:`fit`.

        :param classif_posteriors: the posterior probabilities returned by the classifier
        :return: the adjusted class prevalence estimates
        """
        unadjusted = self.pcc.aggregate(classif_posteriors)
        return ACC.solve_adjustment(self.Pte_cond_estim_, unadjusted)

    def classify(self, data):
        """
        Provides the posterior probabilities for the given instances, as issued by the internal :class:`PCC`.

        :param data: array-like with the instances
        :return: the posterior probabilities
        """
        return self.pcc.classify(data)
|
|
|
|
|
|
|
|
|
|
|
2021-01-07 17:58:48 +01:00
|
|
|
|
class EMQ(AggregativeProbabilisticQuantifier):
    """
    `Expectation Maximization for Quantification <https://ieeexplore.ieee.org/abstract/document/6789744>`_ (EMQ),
    aka `Saerens-Latinne-Decaestecker` (SLD) algorithm.
    EMQ applies the well-known `Expectation Maximization algorithm` to update, in a mutually recursive way and
    until convergence, the posterior probabilities generated by a probabilistic classifier and the class
    prevalence estimates obtained via maximum-likelihood estimation.

    :param learner: a sklearn's Estimator that generates a classifier
    """

    MAX_ITER = 1000
    EPSILON = 1e-4

    def __init__(self, learner: BaseEstimator):
        self.learner = learner

    def fit(self, data: LabelledCollection, fit_learner=True):
        """
        Trains the EMQ quantifier, requiring the learner to be probabilistic, and records the class prevalence
        of the training labels.

        :param data: the training set
        :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
        :return: self
        """
        trained_learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
        self.learner = trained_learner
        self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
        return self

    def aggregate(self, classif_posteriors, epsilon=EPSILON):
        """
        Runs :meth:`EM` on the posterior probabilities, starting from the training prevalence.

        :param classif_posteriors: the posterior probabilities returned by the classifier
        :param epsilon: float, the convergence threshold passed to :meth:`EM`
        :return: the estimated prevalence values
        """
        estimated_priors, _ = self.EM(self.train_prevalence, classif_posteriors, epsilon)
        return estimated_priors

    def predict_proba(self, instances, epsilon=EPSILON):
        """
        Returns the posterior probabilities of the learner after being corrected by :meth:`EM`.

        :param instances: array-like with the instances
        :param epsilon: float, the convergence threshold passed to :meth:`EM`
        :return: the corrected posterior probabilities
        """
        learner_posteriors = self.learner.predict_proba(instances)
        _, corrected_posteriors = self.EM(self.train_prevalence, learner_posteriors, epsilon)
        return corrected_posteriors

    @classmethod
    def EM(cls, tr_prev, posterior_probabilities, epsilon=EPSILON):
        """
        Runs the `Expectation Maximization` routine.

        :param tr_prev: array-like, the training prevalence
        :param posterior_probabilities: `np.ndarray` of shape `(n_instances, n_classes,)` with the
            posterior probabilities
        :param epsilon: float, the threshold on the mean absolute difference between the estimates of two
            consecutive iterations below which the loop stops
        :return: a tuple with the estimated prevalence values (shape `(n_classes,)`) and
            the corrected posterior probabilities (shape `(n_instances, n_classes,)`)
        """
        Px = posterior_probabilities
        Ptr = np.copy(tr_prev)
        qs = np.copy(Ptr)  # the running estimate starts at the training prevalence

        previous_qs = None
        for iteration in range(EMQ.MAX_ITER):
            # E-step: ps is Ps(y|xi)
            reweighted = (qs / Ptr) * Px
            ps = reweighted / reweighted.sum(axis=1, keepdims=True)

            # M-step:
            qs = ps.mean(axis=0)

            # convergence is only accepted once more than 10 iterations have been completed
            stable = previous_qs is not None and qp.error.mae(qs, previous_qs) < epsilon and iteration > 10
            previous_qs = qs
            if stable:
                break
        else:
            print('[warning] the method has reached the maximum number of iterations; it might have not converged')

        return qs, ps
|
2020-12-03 18:12:28 +01:00
|
|
|
|
|
|
|
|
|
|
2021-01-07 17:58:48 +01:00
|
|
|
|
class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
    """
    `Hellinger Distance y <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDy).
    HDy is a probabilistic method for binary quantification. It compares two histograms of the posterior
    probabilities returned by the classifier: one computed on the unlabelled examples, and one obtained as a
    mixture of the class-conditional histograms computed on a validation set for the positive and for the
    negative examples. The mixture weight that minimizes the Hellinger Distance between both histograms is
    taken as the estimate of the positive class prevalence.

    :param learner: a sklearn's Estimator that generates a binary classifier
    :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out
        validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself).
    """

    def __init__(self, learner: BaseEstimator, val_split=0.4):
        self.learner = learner
        self.val_split = val_split

    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection] = None):
        """
        Trains a HDy quantifier.

        :param data: the training set
        :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
        :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
            validation (e.g., 0.3 for using 30% of the training set as validation data), or a
            :class:`quapy.data.base.LabelledCollection` indicating the validation set itself.
            If None, the value given at construction time is used.
        :return: self
        """
        held_out = self.val_split if val_split is None else val_split

        self._check_binary(data, self.__class__.__name__)
        self.learner, validation = _training_helper(
            self.learner, data, fit_learner, ensure_probabilistic=True, val_split=held_out)

        # only the posterior of the second class, P(y=+1|x), is needed
        positive_posteriors = self.classify(validation.instances)[:, 1]
        self.Pxy1 = positive_posteriors[validation.labels == self.learner.classes_[1]]
        self.Pxy0 = positive_posteriors[validation.labels == self.learner.classes_[0]]

        # the class-conditional histograms do not depend on the test data, so they are
        # computed once here for every number of bins in [10, 20, 30, ..., 100, 110]
        self.bins = np.linspace(10, 110, 11, dtype=int)

        def histograms_by_bins(scores):
            return {
                n_bins: np.histogram(scores, bins=n_bins, range=(0, 1), density=True)[0]
                for n_bins in self.bins
            }

        self.Pxy1_density = histograms_by_bins(self.Pxy1)
        self.Pxy0_density = histograms_by_bins(self.Pxy0)
        return self

    def aggregate(self, classif_posteriors):
        """
        Estimates the class prevalence values from the posterior probabilities of the test instances.

        :param classif_posteriors: array-like whose column 1 holds the posterior probability of the positive class
        :return: a `np.ndarray` with the estimated prevalence of the negative and of the positive class
        """
        # "In this work, the number of bins b used in HDx and HDy was chosen from 10 to 110 in steps of 10,
        # and the final estimated a priori probability was taken as the median of these 11 estimates."
        # (González-Castro, et al., 2013).
        positive_posteriors = classif_posteriors[:, 1]

        estimates_per_bins = []
        for n_bins in self.bins:
            negative_density = self.Pxy0_density[n_bins]
            positive_density = self.Pxy1_density[n_bins]
            test_density = np.histogram(positive_posteriors, bins=n_bins, range=(0, 1), density=True)[0]

            # exhaustive search of the mixture weight yielding the closest histogram
            best_prev, best_dist = None, None
            for candidate in F.prevalence_linspace(n_prevalences=100, repeats=1, smooth_limits_epsilon=0.0):
                mixture = candidate * positive_density + (1 - candidate) * negative_density
                dist = F.HellingerDistance(mixture, test_density)
                if best_prev is not None and not (dist < best_dist):
                    continue
                best_prev, best_dist = candidate, dist
            estimates_per_bins.append(best_prev)

        class1_prev = np.median(estimates_per_bins)
        return np.asarray([1 - class1_prev, class1_prev])
|
2020-12-10 19:04:33 +01:00
|
|
|
|
|
|
|
|
|
|
2022-07-11 12:21:49 +02:00
|
|
|
|
class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier):
    """
    `DyS framework <https://ojs.aaai.org/index.php/AAAI/article/view/4376>`_ (DyS).
    DyS is a generalization of the HDy method that uses a ternary search in order to find the prevalence that
    minimizes the distance between distributions.
    Details for the ternary search have been taken from <https://dl.acm.org/doi/pdf/10.1145/3219819.3220059>

    :param learner: a sklearn's Estimator that generates a binary classifier
    :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out
        validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself).
    :param n_bins: an int with the number of bins to use to compute the histograms.
    :param distance: a str naming a distance already included in the library (`HD` or `topsoe`), or a function
        that computes the distance between two distributions.
    :param tol: a float with the tolerance for the ternary search algorithm.
    """

    def __init__(self, learner: BaseEstimator, val_split=0.4, n_bins=8, distance: Union[str, Callable]='HD', tol=1e-05):
        self.learner = learner
        self.val_split = val_split
        self.tol = tol
        self.distance = distance
        self.n_bins = n_bins

    def _ternary_search(self, f, left, right, tol):
        """
        Finds the minimum of a unimodal function `f` within `[left, right]`.

        :param f: callable taking a single float and returning a comparable score
        :param left: lower bound of the search interval
        :param right: upper bound of the search interval
        :param tol: the search stops once the interval is narrower than this value
        :return: the midpoint of the final interval
        :raises ValueError: if `tol` is not strictly positive (the search would never terminate)
        """
        if tol <= 0:
            raise ValueError(f'tol must be strictly positive, found {tol}')

        while abs(right - left) >= tol:
            left_third = left + (right - left) / 3
            right_third = right - (right - left) / 3

            # the side with the larger value cannot contain the minimum, so it is discarded
            if f(left_third) > f(right_third):
                left = left_third
            else:
                right = right_third

        # left and right are the current bounds; the minimum is between them
        return (left + right) / 2

    def _compute_distance(self, Px_train, Px_test, distance: Union[str, Callable]='HD'):
        """
        Computes the distance between two distributions.

        :param Px_train: the first distribution (here, the mixture of the validation histograms)
        :param Px_test: the second distribution (here, the histogram of the test posteriors)
        :param distance: `'HD'`, `'topsoe'`, or a callable taking both distributions
        :return: the value returned by the selected distance
        :raises ValueError: if `distance` is neither a known name nor a callable
        """
        if distance == 'HD':
            return F.HellingerDistance(Px_train, Px_test)
        elif distance == 'topsoe':
            return F.TopsoeDistance(Px_train, Px_test)
        elif callable(distance):
            return distance(Px_train, Px_test)
        raise ValueError(f'unknown distance {distance!r}; valid ones are "HD", "topsoe", or a callable')

    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection] = None):
        """
        Trains a DyS quantifier.

        :param data: the training set
        :param fit_learner: forwarded to the training helper together with the learner
        :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
            validation, or a :class:`quapy.data.base.LabelledCollection` indicating the validation set itself.
            If None, the value given at construction time is used.
        :return: self
        """
        if val_split is None:
            val_split = self.val_split

        self._check_binary(data, self.__class__.__name__)
        self.learner, validation = _training_helper(
            self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
        Px = self.classify(validation.instances)[:, 1]  # takes only the P(y=+1|x)
        self.Pxy1 = Px[validation.labels == self.learner.classes_[1]]
        self.Pxy0 = Px[validation.labels == self.learner.classes_[0]]
        # class-conditional histograms of the validation posteriors, computed once
        self.Pxy1_density = np.histogram(self.Pxy1, bins=self.n_bins, range=(0, 1), density=True)[0]
        self.Pxy0_density = np.histogram(self.Pxy0, bins=self.n_bins, range=(0, 1), density=True)[0]
        return self

    def aggregate(self, classif_posteriors):
        """
        Estimates the class prevalence values from the posterior probabilities of the test instances.

        :param classif_posteriors: array-like whose column 1 holds the posterior probability of the positive class
        :return: a `np.ndarray` with the estimated prevalence of the negative and of the positive class
        """
        Px = classif_posteriors[:, 1]  # takes only the P(y=+1|x)

        Px_test = np.histogram(Px, bins=self.n_bins, range=(0, 1), density=True)[0]

        def distribution_distance(prev):
            # distance between the test histogram and the validation mixture weighted by `prev`
            Px_train = prev * self.Pxy1_density + (1 - prev) * self.Pxy0_density
            return self._compute_distance(Px_train, Px_test, self.distance)

        class1_prev = self._ternary_search(f=distribution_distance, left=0, right=1, tol=self.tol)
        return np.asarray([1 - class1_prev, class1_prev])
|
|
|
|
|
|
|
|
|
|
|
2022-07-11 14:04:28 +02:00
|
|
|
|
class SMM(AggregativeProbabilisticQuantifier, BinaryQuantifier):
    """
    `SMM method <https://ieeexplore.ieee.org/document/9260028>`_ (SMM).
    SMM simplifies the distribution matching methods by representing each set of examples with the
    mean of the posterior probabilities instead of with a histogram.

    :param learner: a sklearn's Estimator that generates a binary classifier.
    :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out
        validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself).
    """

    def __init__(self, learner: BaseEstimator, val_split=0.4):
        self.learner = learner
        self.val_split = val_split

    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection] = None):
        """
        Trains a SMM quantifier.

        :param data: the training set
        :param fit_learner: forwarded to the training helper together with the learner
        :param val_split: either a float in (0,1) or the validation collection itself; if None, the value
            given at construction time is used
        :return: self
        """
        held_out = self.val_split if val_split is None else val_split

        self._check_binary(data, self.__class__.__name__)
        self.learner, validation = _training_helper(
            self.learner, data, fit_learner, ensure_probabilistic=True, val_split=held_out)

        # only the posterior of the second class, P(y=+1|x), is needed
        positive_posteriors = self.classify(validation.instances)[:, 1]
        is_positive = validation.labels == self.learner.classes_[1]
        is_negative = validation.labels == self.learner.classes_[0]
        self.Pxy1 = positive_posteriors[is_positive]
        self.Pxy0 = positive_posteriors[is_negative]

        # each class is summarized by the mean posterior of its validation examples
        self.Pxy1_mean = np.mean(self.Pxy1)
        self.Pxy0_mean = np.mean(self.Pxy0)
        return self

    def aggregate(self, classif_posteriors):
        """
        Estimates the class prevalence values from the posterior probabilities of the test instances.

        :param classif_posteriors: array-like whose column 1 holds the posterior probability of the positive class
        :return: a `np.ndarray` with the estimated prevalence of the negative and of the positive class
        """
        test_mean = np.mean(classif_posteriors[:, 1])

        # position of the test mean between the negative and the positive validation means
        shift = test_mean - self.Pxy0_mean
        spread = self.Pxy1_mean - self.Pxy0_mean
        positive_prev = np.clip(shift / spread, 0, 1)

        return np.asarray([1 - positive_prev, positive_prev])
|
|
|
|
|
|
|
|
|
|
|
2021-01-07 17:58:48 +01:00
|
|
|
|
class ELM(AggregativeQuantifier, BinaryQuantifier):
    """
    Class of Explicit Loss Minimization (ELM) quantifiers.
    ELM quantifiers are a family of methods based on structured output learning: they rely on classifiers
    that have been optimized using a quantification-oriented loss measure. This implementation relies on
    `Joachims’ SVM perf <https://www.cs.cornell.edu/people/tj/svm_light/svm_perf.html>`_ structured output
    learning algorithm, which has to be installed and patched for the purpose (see this
    `script <https://github.com/HLT-ISTI/QuaPy/blob/master/prepare_svmperf.sh>`_).

    :param svmperf_base: path to the folder containing the binary files of `SVM perf`; if None, the value of
        `qp.environ['SVMPERF_HOME']` is used
    :param loss: the loss to optimize (see :attr:`quapy.classification.svmperf.SVMperf.valid_losses`)
    :param kwargs: rest of SVM perf's parameters
    """

    def __init__(self, svmperf_base=None, loss='01', **kwargs):
        if svmperf_base is None:
            svmperf_base = qp.environ['SVMPERF_HOME']
        self.svmperf_base = svmperf_base
        self.loss = loss
        self.kwargs = kwargs
        self.learner = SVMperf(svmperf_base, loss=loss, **kwargs)

    def fit(self, data: LabelledCollection, fit_learner=True):
        """
        Trains the underlying `SVM perf` learner on binary data.

        :param data: the training set
        :param fit_learner: must be True
        :return: self
        """
        self._check_binary(data, type(self).__name__)
        assert fit_learner, 'the method requires that fit_learner=True'
        instances, labels = data.instances, data.labels
        self.learner.fit(instances, labels)
        return self

    def aggregate(self, classif_predictions: np.ndarray):
        """
        Computes the class prevalence values from the predicted labels.

        :param classif_predictions: the label predictions
        :return: the prevalence values computed by `F.prevalence_from_labels`
        """
        predicted_labels = classif_predictions
        return F.prevalence_from_labels(predicted_labels, self.classes_)

    def classify(self, X, y=None):
        """
        Returns the predictions of the learner for `X`; the argument `y` is not used.
        """
        predictions = self.learner.predict(X)
        return predictions
|
|
|
|
|
|
|
|
|
|
|
2021-01-07 17:58:48 +01:00
|
|
|
|
class SVMQ(ELM):
    """
    SVM(Q), an :class:`ELM` quantifier that optimizes the `Q` loss, which combines a classification-oriented
    loss and a quantification-oriented loss, as proposed by
    `Barranquero et al. 2015 <https://www.sciencedirect.com/science/article/pii/S003132031400291X>`_.
    Equivalent to:

    >>> ELM(svmperf_base, loss='q', **kwargs)

    :param svmperf_base: path to the folder containing the binary files of `SVM perf`
    :param kwargs: rest of SVM perf's parameters
    """

    def __init__(self, svmperf_base=None, **kwargs):
        # the loss is fixed for this variant; everything else is forwarded untouched
        super().__init__(svmperf_base, loss='q', **kwargs)
|
|
|
|
|
|
|
|
|
|
|
2021-01-07 17:58:48 +01:00
|
|
|
|
class SVMKLD(ELM):
    """
    SVM(KLD), an :class:`ELM` quantifier that optimizes the Kullback-Leibler Divergence, as proposed by
    `Esuli et al. 2015 <https://dl.acm.org/doi/abs/10.1145/2700406>`_.
    Equivalent to:

    >>> ELM(svmperf_base, loss='kld', **kwargs)

    :param svmperf_base: path to the folder containing the binary files of `SVM perf`
    :param kwargs: rest of SVM perf's parameters
    """

    def __init__(self, svmperf_base=None, **kwargs):
        # the loss is fixed for this variant; everything else is forwarded untouched
        super().__init__(svmperf_base, loss='kld', **kwargs)
|
|
|
|
|
|
|
|
|
|
|
2021-01-07 17:58:48 +01:00
|
|
|
|
class SVMNKLD(ELM):
    """
    SVM(NKLD), an :class:`ELM` quantifier that optimizes a version of the Kullback-Leibler Divergence
    normalized via the logistic function, as proposed by
    `Esuli et al. 2015 <https://dl.acm.org/doi/abs/10.1145/2700406>`_.
    Equivalent to:

    >>> ELM(svmperf_base, loss='nkld', **kwargs)

    :param svmperf_base: path to the folder containing the binary files of `SVM perf`
    :param kwargs: rest of SVM perf's parameters
    """

    def __init__(self, svmperf_base=None, **kwargs):
        # the loss is fixed for this variant; everything else is forwarded untouched
        super().__init__(svmperf_base, loss='nkld', **kwargs)
|
|
|
|
|
|
|
|
|
|
|
2021-01-07 17:58:48 +01:00
|
|
|
|
class SVMAE(ELM):
    """
    SVM(AE), an :class:`ELM` quantifier that optimizes the Absolute Error, as first used by
    `Moreo and Sebastiani, 2021 <https://arxiv.org/abs/2011.02552>`_.
    Equivalent to:

    >>> ELM(svmperf_base, loss='mae', **kwargs)

    :param svmperf_base: path to the folder containing the binary files of `SVM perf`
    :param kwargs: rest of SVM perf's parameters
    """

    def __init__(self, svmperf_base=None, **kwargs):
        # the loss is fixed for this variant; everything else is forwarded untouched
        super().__init__(svmperf_base, loss='mae', **kwargs)
|
|
|
|
|
|
|
|
|
|
|
2021-01-07 17:58:48 +01:00
|
|
|
|
class SVMRAE(ELM):
    """
    SVM(RAE), an :class:`ELM` quantifier that optimizes the Relative Absolute Error, as first used by
    `Moreo and Sebastiani, 2021 <https://arxiv.org/abs/2011.02552>`_.
    Equivalent to:

    >>> ELM(svmperf_base, loss='mrae', **kwargs)

    :param svmperf_base: path to the folder containing the binary files of `SVM perf`
    :param kwargs: rest of SVM perf's parameters
    """

    def __init__(self, svmperf_base=None, **kwargs):
        # the loss is fixed for this variant; everything else is forwarded untouched
        super().__init__(svmperf_base, loss='mrae', **kwargs)
|
|
|
|
|
|
|
|
|
|
|
2021-06-16 11:45:40 +02:00
|
|
|
|
class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier):
    """
    Abstract class of Threshold Optimization variants for :class:`ACC` as proposed by
    `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
    `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_.
    The goal is to bring improved stability to the denominator of the adjustment.
    The different variants are based on different heuristics for choosing a decision threshold
    that would allow for more true positives and many more false positives, on the grounds this
    would deliver larger denominators.

    :param learner: a sklearn's Estimator that generates a classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
        misclassification rates are to be estimated.
        This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
        validation data, or as an integer, indicating that the misclassification rates should be estimated via
        `k`-fold cross validation (this integer stands for the number of folds `k`), or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    :param n_jobs: number of parallel workers (resolved through `qp.get_njobs`)
    """

    def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=None):
        self.learner = learner
        self.val_split = val_split
        self.n_jobs = qp.get_njobs(n_jobs)

    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
        """
        Trains the quantifier and estimates the `tpr` and `fpr` used by the adjustment.

        :param data: the training set (must be binary)
        :param fit_learner: forwarded to `cross_generate_predictions`
        :param val_split: a float, an int, or a validation collection (see the class documentation);
            if None, the value given at construction time is used
        :return: self
        """
        self._check_binary(data, "Threshold Optimization")

        if val_split is None:
            val_split = self.val_split

        # the last two returned values (classes and class counts) are not needed here
        self.learner, y, y_, _, _ = cross_generate_predictions(
            data, self.learner, val_split, probabilistic=True, fit_learner=fit_learner, n_jobs=self.n_jobs
        )

        self.cc = CC(self.learner)

        self.tpr, self.fpr = self._optimize_threshold(y, y_)

        return self

    @abstractmethod
    def _condition(self, tpr, fpr) -> float:
        """
        Implements the criterion according to which the threshold should be selected.
        This function should return the (float) score to be minimized.

        :param tpr: float, true positive rate
        :param fpr: float, false positive rate
        :return: float, a score for the given `tpr` and `fpr`
        """
        ...

    def _optimize_threshold(self, y, probabilities):
        """
        Seeks for the best `tpr` and `fpr` according to the score obtained at different
        decision thresholds. The scoring function is implemented in function `_condition`.

        :param y: reference labels of the validation instances (or of the training instances when the
            posteriors are obtained via `k`-fold cross validation)
        :param probabilities: array-like with the posterior probabilities; column 1 is thresholded
        :return: best `tpr` and `fpr` according to `_condition`
        """
        negative, positive = self.classes_[0], self.classes_[1]
        positive_posteriors = probabilities[:, 1]

        best_candidate_threshold_score = None
        best_tpr = 0
        best_fpr = 0
        # every distinct posterior value is tried as a decision threshold
        for candidate_threshold in np.unique(positive_posteriors):
            y_ = np.where(positive_posteriors > candidate_threshold, positive, negative)
            TP, FP, FN, TN = self._compute_table(y, y_)
            # the true positive rate is relative to the actual positives, i.e., TP + FN
            tpr = self._compute_tpr(TP, FN)
            fpr = self._compute_fpr(FP, TN)
            condition_score = self._condition(tpr, fpr)
            if best_candidate_threshold_score is None or condition_score < best_candidate_threshold_score:
                best_candidate_threshold_score = condition_score
                best_tpr = tpr
                best_fpr = fpr

        return best_tpr, best_fpr

    def aggregate(self, classif_predictions):
        """
        Computes the prevalence estimate of :class:`CC` and adjusts it with the `tpr` and `fpr` found in `fit`.

        :param classif_predictions: the predictions to aggregate (they are passed as-is to `CC.aggregate`)
        :return: the adjusted prevalence values, or the unadjusted ones if `tpr` equals `fpr`
        """
        prevs_estim = self.cc.aggregate(classif_predictions)
        # the adjustment is undefined for a null denominator
        if self.tpr - self.fpr == 0:
            return prevs_estim
        adjusted_prevs_estim = np.clip((prevs_estim[1] - self.fpr) / (self.tpr - self.fpr), 0, 1)
        adjusted_prevs_estim = np.array((1 - adjusted_prevs_estim, adjusted_prevs_estim))
        return adjusted_prevs_estim

    def _compute_table(self, y, y_):
        """
        Computes the binary contingency table.

        :param y: reference labels
        :param y_: labels assigned at the decision threshold being evaluated
        :return: the counts `TP`, `FP`, `FN`, `TN`, where the positive class is `self.classes_[1]`
        """
        TP = np.logical_and(y == y_, y == self.classes_[1]).sum()
        FP = np.logical_and(y != y_, y == self.classes_[0]).sum()
        FN = np.logical_and(y != y_, y == self.classes_[1]).sum()
        TN = np.logical_and(y == y_, y == self.classes_[0]).sum()
        return TP, FP, FN, TN

    def _compute_tpr(self, TP, FN):
        """
        Computes the true positive rate `TP / (TP + FN)`.

        :param TP: number of true positives
        :param FN: number of false negatives
        :return: the true positive rate, or 1 if there are no positive examples
        """
        if TP + FN == 0:
            return 1
        return TP / (TP + FN)

    def _compute_fpr(self, FP, TN):
        """
        Computes the false positive rate `FP / (FP + TN)`.

        :param FP: number of false positives
        :param TN: number of true negatives
        :return: the false positive rate, or 0 if there are no negative examples
        """
        if FP + TN == 0:
            return 0
        return FP / (FP + TN)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class T50(ThresholdOptimization):
    """
    Threshold Optimization variant for :class:`ACC` as proposed by
    `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
    `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that looks
    for the threshold that makes `tpr` closest to 0.5.
    The goal is to bring improved stability to the denominator of the adjustment.

    :param learner: a sklearn's Estimator that generates a classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
        misclassification rates are to be estimated.
        This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
        validation data, or as an integer, indicating that the misclassification rates should be estimated via
        `k`-fold cross validation (this integer stands for the number of folds `k`), or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    """

    def __init__(self, learner: BaseEstimator, val_split=0.4):
        super().__init__(learner, val_split=val_split)

    def _condition(self, tpr, fpr) -> float:
        # score to be minimized: how far `tpr` is from 0.5 (`fpr` plays no role)
        gap = tpr - 0.5
        return abs(gap)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class MAX(ThresholdOptimization):
    """
    Threshold Optimization variant for :class:`ACC` as proposed by
    `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
    `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that looks
    for the threshold that maximizes `tpr-fpr`.
    The goal is to bring improved stability to the denominator of the adjustment.

    :param learner: a sklearn's Estimator that generates a classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
        misclassification rates are to be estimated.
        This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
        validation data, or as an integer, indicating that the misclassification rates should be estimated via
        `k`-fold cross validation (this integer stands for the number of folds `k`), or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    """

    def __init__(self, learner: BaseEstimator, val_split=0.4):
        super().__init__(learner, val_split=val_split)

    def _condition(self, tpr, fpr) -> float:
        # the score is minimized by the caller, so maximizing (tpr - fpr) is expressed
        # as minimizing its opposite, (fpr - tpr)
        return fpr - tpr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class X(ThresholdOptimization):
    """
    Threshold Optimization variant for :class:`ACC` as proposed by
    `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
    `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that looks
    for the threshold that yields `tpr=1-fpr`.
    The goal is to bring improved stability to the denominator of the adjustment.

    :param learner: a sklearn's Estimator that generates a classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
        misclassification rates are to be estimated.
        This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
        validation data, or as an integer, indicating that the misclassification rates should be estimated via
        `k`-fold cross validation (this integer stands for the number of folds `k`), or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    """

    def __init__(self, learner: BaseEstimator, val_split=0.4):
        super().__init__(learner, val_split=val_split)

    def _condition(self, tpr, fpr) -> float:
        # score to be minimized: how far (tpr + fpr) is from 1, i.e., how far `tpr` is from `1 - fpr`
        rates_sum = tpr + fpr
        return abs(1 - rates_sum)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class MS(ThresholdOptimization):
    """
    Median Sweep. Threshold Optimization variant for :class:`ACC` as proposed by
    `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
    `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_.
    This implementation computes the `tpr` and the `fpr` at every decision threshold and uses the
    median of each of them in the adjustment.
    The goal is to bring improved stability to the denominator of the adjustment.

    :param learner: a sklearn's Estimator that generates a classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
        misclassification rates are to be estimated.
        This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
        validation data, or as an integer, indicating that the misclassification rates should be estimated via
        `k`-fold cross validation (this integer stands for the number of folds `k`), or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    """

    def __init__(self, learner: BaseEstimator, val_split=0.4):
        super().__init__(learner, val_split)

    def _condition(self, tpr, fpr) -> float:
        # no threshold is selected in this variant (see `_optimize_threshold`), so no score is needed
        pass

    def _optimize_threshold(self, y, probabilities):
        """
        Computes the median `tpr` and the median `fpr` across all candidate decision thresholds.

        :param y: reference labels
        :param probabilities: array-like with the posterior probabilities; column 1 is thresholded
        :return: the median of the `tpr` values and the median of the `fpr` values
        """
        negative, positive = self.classes_[0], self.classes_[1]
        positive_posteriors = probabilities[:, 1]

        tprs = []
        fprs = []
        # every distinct posterior value is tried as a decision threshold
        for candidate_threshold in np.unique(positive_posteriors):
            y_ = np.where(positive_posteriors > candidate_threshold, positive, negative)
            TP, FP, FN, TN = self._compute_table(y, y_)
            # the true positive rate is TP / (TP + FN): the second argument must be FN, not FP
            tpr = self._compute_tpr(TP, FN)
            fpr = self._compute_fpr(FP, TN)
            tprs.append(tpr)
            fprs.append(fpr)
        return np.median(tprs), np.median(fprs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class MS2(MS):
    """
    Median Sweep 2. Threshold Optimization variant for :class:`ACC` as proposed by
    `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
    `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_.
    This implementation computes the `tpr` and the `fpr` at every decision threshold, keeps only the
    cases in which `tpr-fpr>0.25`, and uses the median of each of them in the adjustment.
    The goal is to bring improved stability to the denominator of the adjustment.

    :param learner: a sklearn's Estimator that generates a classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
        misclassification rates are to be estimated.
        This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
        validation data, or as an integer, indicating that the misclassification rates should be estimated via
        `k`-fold cross validation (this integer stands for the number of folds `k`), or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    """

    def __init__(self, learner: BaseEstimator, val_split=0.4):
        super().__init__(learner, val_split)

    def _optimize_threshold(self, y, probabilities):
        """
        Computes the median `tpr` and the median `fpr` across the decision thresholds for which
        `tpr - fpr > 0.25`.

        :param y: reference labels
        :param probabilities: array-like with the posterior probabilities; column 1 is thresholded
        :return: the median of the retained `tpr` values and the median of the retained `fpr` values
        """
        negative, positive = self.classes_[0], self.classes_[1]
        positive_posteriors = probabilities[:, 1]

        # both lists are seeded with 0 and 1, so the medians are defined even if no threshold is
        # retained; note that the seeds also take part in the medians otherwise
        tprs = [0, 1]
        fprs = [0, 1]
        # every distinct posterior value is tried as a decision threshold
        for candidate_threshold in np.unique(positive_posteriors):
            y_ = np.where(positive_posteriors > candidate_threshold, positive, negative)
            TP, FP, FN, TN = self._compute_table(y, y_)
            # the true positive rate is TP / (TP + FN): the second argument must be FN, not FP
            tpr = self._compute_tpr(TP, FN)
            fpr = self._compute_fpr(FP, TN)
            if (tpr - fpr) > 0.25:
                tprs.append(tpr)
                fprs.append(fpr)
        return np.median(tprs), np.median(fprs)
|
|
|
|
|
|
|
|
|
|
|
2021-01-07 17:58:48 +01:00
|
|
|
|
# Module-level aliases: each descriptive name below is bound to the very same class object
# as the short name on its right-hand side, so both names can be used interchangeably.
ClassifyAndCount = CC
AdjustedClassifyAndCount = ACC
ProbabilisticClassifyAndCount = PCC
ProbabilisticAdjustedClassifyAndCount = PACC
ExpectationMaximizationQuantifier = EMQ
SLD = EMQ  # second alias for the same EMQ class
HellingerDistanceY = HDy
ExplicitLossMinimisation = ELM
MedianSweep = MS
MedianSweep2 = MS2
|
2020-12-15 15:20:35 +01:00
|
|
|
|
|
|
|
|
|
|
2020-12-10 19:04:33 +01:00
|
|
|
|
class OneVsAll(AggregativeQuantifier):
    """
    Allows any binary quantifier to perform quantification on single-label datasets.
    The method maintains one binary quantifier for each class, and then l1-normalizes the outputs so that the
    class prevalences sum up to 1.
    This variant was used, along with the :class:`EMQ` quantifier, in
    `Gao and Sebastiani, 2016 <https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf>`_.

    :param binary_quantifier: the binary quantifier, an instance of :class:`AggregativeQuantifier`, of which one
        independent copy is trained for each class
    :param n_jobs: number of parallel workers
    """

    def __init__(self, binary_quantifier, n_jobs=None):
        # The checks must inspect the argument: `self.binary_quantifier` does not exist until it is assigned
        # below, so testing the attribute here would raise AttributeError on every instantiation.
        assert isinstance(binary_quantifier, BaseQuantifier), \
            f'{binary_quantifier} does not seem to be a Quantifier'
        assert isinstance(binary_quantifier, AggregativeQuantifier), \
            f'{binary_quantifier} does not seem to be of type Aggregative'
        self.binary_quantifier = binary_quantifier
        self.n_jobs = qp.get_njobs(n_jobs)

    def fit(self, data: LabelledCollection, fit_learner=True):
        """
        Trains one copy of the binary quantifier per class found in `data`, each of them on a relabelling of
        the data in which the label indicates whether the instance belongs to that class or not.

        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the (non-binary) training data
        :param fit_learner: must be True; any other value makes the method fail
        :return: self
        """
        assert not data.binary, \
            f'{self.__class__.__name__} expect non-binary data'
        assert fit_learner == True, \
            'fit_learner must be True'

        # every class gets its own independent copy of the (unfitted) template quantifier
        self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_}
        self.__parallel(self._delayed_binary_fit, data)
        return self

    def classify(self, instances):
        """
        If the base quantifier is not probabilistic, returns a matrix of shape `(n,m,)` with `n` the number of
        instances and `m` the number of classes. The entry `(i,j)` is a binary value indicating whether instance
        `i` belongs to class `j`. The binary classifications are independent of each other, meaning that an
        instance can end up being attributed to 0, 1, or more classes.
        If the base quantifier is probabilistic, returns a matrix of shape `(n,m,2)` with `n` the number of
        instances and `m` the number of classes. The entry `(i,j,1)` (resp. `(i,j,0)`) is a value in [0,1]
        indicating the posterior probability that instance `i` belongs (resp. does not belong) to class `j`.
        The posterior probabilities are independent of each other, meaning that, in general, they do not sum
        up to one.

        :param instances: array-like
        :return: `np.ndarray`
        """
        # the per-class outputs are stacked along the first axis, which is then moved to second position
        classif_predictions = self.__parallel(self._delayed_binary_classification, instances)
        if isinstance(self.binary_quantifier, AggregativeProbabilisticQuantifier):
            return np.swapaxes(classif_predictions, 0, 1)
        else:
            return classif_predictions.T

    def aggregate(self, classif_predictions):
        """
        Aggregates, class by class, the predictions returned by :meth:`classify` with the corresponding binary
        quantifier, and normalizes the resulting positive-class prevalence estimates.

        :param classif_predictions: `np.ndarray` as returned by :meth:`classify`
        :return: the result of applying `F.normalize_prevalence` to the vector of per-class estimates
        """
        prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions)
        return F.normalize_prevalence(prevalences)

    def __parallel(self, func, *args, **kwargs):
        """
        Invokes `func(c, *args, **kwargs)` for each class `c` in :attr:`classes_` using `n_jobs` workers.

        :param func: callable receiving the class as its first argument
        :return: `np.ndarray` with one result per class, following the order of :attr:`classes_`
        """
        # some quantifiers (in particular, ELM-based ones) cannot be run with multiprocess, since the temp dir
        # they create during the fit will be removed and be no longer available for the predict...
        return np.asarray(
            Parallel(n_jobs=self.n_jobs, backend='threading')(
                delayed(func)(c, *args, **kwargs) for c in self.classes_
            )
        )

    @property
    def classes_(self):
        """
        The classes seen during :meth:`fit`, in sorted order (only available once the quantifier is fit).

        :return: list of classes
        """
        return sorted(self.dict_binary_quantifiers.keys())

    def set_params(self, **parameters):
        """
        Sets the hyper-parameters of the template binary quantifier. The per-class copies created by a previous
        call to :meth:`fit` are not modified.

        :param parameters: keyword arguments forwarded to `binary_quantifier.set_params`
        """
        self.binary_quantifier.set_params(**parameters)

    def get_params(self, deep=True):
        """
        Returns the hyper-parameters of the template binary quantifier.

        :param deep: accepted for compatibility; it is not forwarded to the binary quantifier
        :return: whatever `binary_quantifier.get_params()` returns
        """
        return self.binary_quantifier.get_params()

    def _delayed_binary_classification(self, c, X):
        # classification of the instances according to the binary quantifier in charge of class `c`
        return self.dict_binary_quantifiers[c].classify(X)

    def _delayed_binary_aggregate(self, c, classif_predictions):
        # the estimation for the positive class prevalence
        # NOTE(review): the column is selected with the class label itself, which presumes that labels coincide
        # with column positions (0, 1, ..., m-1) -- confirm for non-integer or non-contiguous labels
        return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1]

    def _delayed_binary_fit(self, c, data):
        # binary view of the training data: the label is True for the instances of class `c`, False otherwise
        bindata = LabelledCollection(data.instances, data.labels == c, classes_=[False, True])
        self.dict_binary_quantifiers[c].fit(bindata)
|
2020-12-29 20:33:59 +01:00
|
|
|
|
|