2023-01-18 19:46:19 +01:00
|
|
|
from copy import deepcopy
|
|
|
|
|
|
|
|
from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling
|
|
|
|
from sklearn.base import BaseEstimator, clone
|
|
|
|
from sklearn.model_selection import cross_val_predict, train_test_split
|
|
|
|
import numpy as np
|
|
|
|
|
|
|
|
|
|
|
|
# Wrappers of calibration defined by Alexandari et al. in paper <http://proceedings.mlr.press/v119/alexandari20a.html>
|
|
|
|
# requires "pip install abstention"
|
|
|
|
# see https://github.com/kundajelab/abstention
|
|
|
|
|
|
|
|
|
2023-01-27 18:13:23 +01:00
|
|
|
class RecalibratedProbabilisticClassifier:
    """
    Abstract (marker) class for (re)calibration methods from `abstention.calibration`, as defined in
    `Alexandari, A., Kundaje, A., & Shrikumar, A. (2020, November). Maximum likelihood with bias-corrected calibration
    is hard-to-beat at label shift adaptation. In International Conference on Machine Learning (pp. 222-232). PMLR.
    <http://proceedings.mlr.press/v119/alexandari20a.html>`_:
    """
    # intentionally empty: concrete behavior lives in RecalibratedProbabilisticClassifierBase
    pass
|
|
|
|
|
|
|
|
|
2023-01-27 18:13:23 +01:00
|
|
|
class RecalibratedProbabilisticClassifierBase(BaseEstimator, RecalibratedProbabilisticClassifier):
    """
    Applies a (re)calibration method from `abstention.calibration`, as defined in
    `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_.

    :param classifier: a scikit-learn probabilistic classifier
    :param calibrator: the calibration object (an instance of abstention.calibration.CalibratorFactory)
    :param val_split: indicate an integer k for performing kFCV to obtain the posterior probabilities, or a float p
        in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the
        training instances (the rest is used for training). In any case, the classifier is retrained in the whole
        training set afterwards. Default value is 5.
    :param n_jobs: indicate the number of parallel workers (only when val_split is an integer); default=None
    :param verbose: whether or not to display information in the standard output
    """

    def __init__(self, classifier, calibrator, val_split=5, n_jobs=None, verbose=False):
        self.classifier = classifier
        self.calibrator = calibrator
        self.val_split = val_split
        self.n_jobs = n_jobs
        self.verbose = verbose

    def fit(self, X, y):
        """
        Fits the calibration for the probabilistic classifier.

        :param X: array-like of shape `(n_samples, n_features)` with the data instances
        :param y: array-like of shape `(n_samples,)` with the class labels
        :return: self
        :raises ValueError: if `val_split` is an int < 2, a float outside (0,1), or any other type
        """
        k = self.val_split
        if isinstance(k, int):
            if k < 2:
                # fixed: previous message claimed folds "must be > 2" although k == 2 is accepted
                raise ValueError('wrong value for val_split: the number of folds must be >= 2')
            return self.fit_cv(X, y)
        elif isinstance(k, float):
            if not (0 < k < 1):
                raise ValueError('wrong value for val_split: the proportion of validation documents must be in (0,1)')
            return self.fit_tr_val(X, y)
        else:
            # fixed: an unsupported val_split type previously fell through and silently returned None
            raise ValueError(f'wrong type for val_split: int or float expected, found {type(k)}')

    def fit_cv(self, X, y):
        """
        Fits the calibration in a cross-validation manner, i.e., it generates posterior probabilities for all
        training instances via cross-validation, and then retrains the classifier on all training instances.
        The posterior probabilities thus generated are used for calibrating the outputs of the classifier.

        :param X: array-like of shape `(n_samples, n_features)` with the data instances
        :param y: array-like of shape `(n_samples,)` with the class labels
        :return: self
        """
        posteriors = cross_val_predict(
            self.classifier, X, y, cv=self.val_split, n_jobs=self.n_jobs, verbose=self.verbose, method='predict_proba'
        )
        # retrain on the full training set; the out-of-fold posteriors above are what the calibrator sees
        self.classifier.fit(X, y)
        nclasses = len(np.unique(y))
        # np.eye(nclasses)[y] one-hot encodes the labels, as expected by the abstention calibrators
        self.calibration_function = self.calibrator(posteriors, np.eye(nclasses)[y], posterior_supplied=True)
        return self

    def fit_tr_val(self, X, y):
        """
        Fits the calibration in a train/val-split manner, i.e., it partitions the training instances into a
        training and a validation set, and then uses the training samples to learn classifier which is then used
        to generate posterior probabilities for the held-out validation data. These posteriors are used to calibrate
        the classifier. The classifier is not retrained on the whole dataset.

        :param X: array-like of shape `(n_samples, n_features)` with the data instances
        :param y: array-like of shape `(n_samples,)` with the class labels
        :return: self
        """
        Xtr, Xva, ytr, yva = train_test_split(X, y, test_size=self.val_split, stratify=y)
        self.classifier.fit(Xtr, ytr)
        posteriors = self.classifier.predict_proba(Xva)
        nclasses = len(np.unique(yva))
        self.calibration_function = self.calibrator(posteriors, np.eye(nclasses)[yva], posterior_supplied=True)
        return self

    def predict(self, X):
        """
        Predicts class labels for the data instances in `X`

        :param X: array-like of shape `(n_samples, n_features)` with the data instances
        :return: array-like of shape `(n_samples,)` with the class label predictions
        """
        return self.classifier.predict(X)

    def predict_proba(self, X):
        """
        Generates calibrated posterior probabilities for the data instances in `X`

        :param X: array-like of shape `(n_samples, n_features)` with the data instances
        :return: array-like of shape `(n_samples, n_classes)` with posterior probabilities
        """
        posteriors = self.classifier.predict_proba(X)
        return self.calibration_function(posteriors)

    @property
    def classes_(self):
        """
        Returns the classes on which the classifier has been trained on

        :return: array-like of shape `(n_classes)`
        """
        return self.classifier.classes_
|
2023-01-18 19:46:19 +01:00
|
|
|
|
|
|
|
|
2023-01-27 18:13:23 +01:00
|
|
|
class NBVSCalibration(RecalibratedProbabilisticClassifierBase):
    """
    Recalibrated classifier based on No-Bias Vector Scaling (NBVS) from `abstention.calibration`, as
    defined in the `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_:

    :param classifier: a scikit-learn probabilistic classifier
    :param val_split: indicate an integer k for performing kFCV to obtain the posterior prevalences, or a float p
        in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the
        training instances (the rest is used for training). In any case, the classifier is retrained in the whole
        training set afterwards. Default value is 5.
    :param n_jobs: indicate the number of parallel workers (only when val_split is an integer)
    :param verbose: whether or not to display information in the standard output
    """

    def __init__(self, classifier, val_split=5, n_jobs=None, verbose=False):
        # store constructor arguments as-is (scikit-learn convention); the calibrator
        # is fixed to NBVS, everything else is handled by the base class
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.val_split = val_split
        self.calibrator = NoBiasVectorScaling(verbose=verbose)
        self.classifier = classifier
|
|
|
|
|
|
|
|
|
2023-01-27 18:13:23 +01:00
|
|
|
class BCTSCalibration(RecalibratedProbabilisticClassifierBase):
    """
    Recalibrated classifier based on Bias-Corrected Temperature Scaling (BCTS) from `abstention.calibration`,
    as defined in the `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_:

    :param classifier: a scikit-learn probabilistic classifier
    :param val_split: indicate an integer k for performing kFCV to obtain the posterior prevalences, or a float p
        in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the
        training instances (the rest is used for training). In any case, the classifier is retrained in the whole
        training set afterwards. Default value is 5.
    :param n_jobs: indicate the number of parallel workers (only when val_split is an integer)
    :param verbose: whether or not to display information in the standard output
    """

    def __init__(self, classifier, val_split=5, n_jobs=None, verbose=False):
        # store constructor arguments as-is (scikit-learn convention); BCTS is temperature
        # scaling with bias terms enabled at every position
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.val_split = val_split
        self.calibrator = TempScaling(verbose=verbose, bias_positions='all')
        self.classifier = classifier
|
|
|
|
|
|
|
|
|
2023-01-27 18:13:23 +01:00
|
|
|
class TSCalibration(RecalibratedProbabilisticClassifierBase):
    """
    Recalibrated classifier based on Temperature Scaling (TS) from `abstention.calibration`, as
    defined in the `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_:

    :param classifier: a scikit-learn probabilistic classifier
    :param val_split: indicate an integer k for performing kFCV to obtain the posterior prevalences, or a float p
        in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the
        training instances (the rest is used for training). In any case, the classifier is retrained in the whole
        training set afterwards. Default value is 5.
    :param n_jobs: indicate the number of parallel workers (only when val_split is an integer)
    :param verbose: whether or not to display information in the standard output
    """

    def __init__(self, classifier, val_split=5, n_jobs=None, verbose=False):
        # store constructor arguments as-is (scikit-learn convention); the calibrator
        # is fixed to plain temperature scaling (no bias terms)
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.val_split = val_split
        self.calibrator = TempScaling(verbose=verbose)
        self.classifier = classifier
|
|
|
|
|
|
|
|
|
2023-01-27 18:13:23 +01:00
|
|
|
class VSCalibration(RecalibratedProbabilisticClassifierBase):
    """
    Recalibrated classifier based on Vector Scaling (VS) from `abstention.calibration`, as
    defined in the `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_:

    :param classifier: a scikit-learn probabilistic classifier
    :param val_split: indicate an integer k for performing kFCV to obtain the posterior prevalences, or a float p
        in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the
        training instances (the rest is used for training). In any case, the classifier is retrained in the whole
        training set afterwards. Default value is 5.
    :param n_jobs: indicate the number of parallel workers (only when val_split is an integer)
    :param verbose: whether or not to display information in the standard output
    """

    def __init__(self, classifier, val_split=5, n_jobs=None, verbose=False):
        # store constructor arguments as-is (scikit-learn convention); the calibrator
        # is fixed to vector scaling, everything else is handled by the base class
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.val_split = val_split
        self.calibrator = VectorScaling(verbose=verbose)
        self.classifier = classifier
|
|
|
|
|