QuaPy/quapy/method/_threshold_optim.py

from abc import abstractmethod

import numpy as np
from sklearn.base import BaseEstimator
import quapy as qp
import quapy.functional as F
from quapy.data import LabelledCollection
from quapy.method.aggregative import BinaryAggregativeQuantifier


class ThresholdOptimization(BinaryAggregativeQuantifier):
    """
    Abstract class of Threshold Optimization variants for :class:`ACC` as proposed by
    `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
    `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_.
    The goal is to bring improved stability to the denominator of the adjustment.
    The different variants are based on different heuristics for choosing a decision threshold
    that would allow for more true positives and many more false positives, on the grounds this
    would deliver larger denominators.

    Subclasses must implement :meth:`condition` (the score to be minimized) and may override
    :meth:`discard` (the filter applied to each candidate threshold).

    :param classifier: a sklearn's Estimator that generates a classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
        misclassification rates are to be estimated.
        This parameter can be indicated as a real value (between 0 and 1), representing a proportion of
        validation data, or as an integer, indicating that the misclassification rates should be estimated via
        `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    :param n_jobs: number of parallel workers; the value is resolved through `qp._get_njobs`
        before being stored (defaults to None)
    """

    def __init__(self, classifier: BaseEstimator, val_split=5, n_jobs=None):
        self.classifier = classifier
        self.val_split = val_split
        self.n_jobs = qp._get_njobs(n_jobs)

    @abstractmethod
    def condition(self, tpr, fpr) -> float:
        """
        Implements the criterion according to which the threshold should be selected.
        This function should return the (float) score to be minimized.

        :param tpr: float, true positive rate
        :param fpr: float, false positive rate
        :return: float, a score for the given `tpr` and `fpr`
        """
        ...

    def discard(self, tpr, fpr) -> bool:
        """
        Indicates whether a combination of tpr and fpr should be discarded.
        By default, only the combinations for which `tpr == fpr` are discarded, since these
        would lead to a null denominator in the adjustment.

        :param tpr: float, true positive rate
        :param fpr: float, false positive rate
        :return: true if the combination is to be discarded, false otherwise
        """
        return (tpr - fpr) == 0

    def _eval_candidate_thresholds(self, decision_scores, y):
        """
        Evaluates every distinct decision score as a candidate threshold and ranks the
        surviving candidates by the score returned by :meth:`condition` (smaller is better).
        Candidates for which :meth:`discard` returns true are left out.

        :param decision_scores: array-like with the classification scores
        :param y: true labels of the validation set (or of the training set, if the scores were
            obtained via `k`-fold cross validation); they are compared against the labels
            predicted at each threshold
        :return: a 2-D array with one row `[tpr, fpr, threshold]` per retained candidate, sorted by
            increasing value of :meth:`condition`; the first row is thus the best candidate. If no
            candidate is retained, the array contains the single row `[1, 0, 0]`
        """
        candidate_thresholds = np.unique(decision_scores)

        candidates = []
        scores = []
        for candidate_threshold in candidate_thresholds:
            # instances scoring at or above the threshold are assigned classes_[1], the rest classes_[0]
            y_ = self.classes_[1 * (decision_scores >= candidate_threshold)]
            TP, FP, FN, TN = self._compute_table(y, y_)
            tpr = self._compute_tpr(TP, FN)
            fpr = self._compute_fpr(FP, TN)
            if self.discard(tpr, fpr):
                continue
            candidates.append([tpr, fpr, candidate_threshold])
            scores.append(self.condition(tpr, fpr))

        if not candidates:
            # if no candidate gives rise to a valid combination of tpr and fpr, this method defaults to the standard
            # classify & count; this is akin to assign tpr=1, fpr=0, threshold=0
            candidates.append([1, 0, 0])
            scores.append(0)

        candidates = np.asarray(candidates)
        # sort candidates by candidate_score (ascending, i.e., best first)
        return candidates[np.argsort(scores)]

    def aggregate_with_threshold(self, classif_predictions, tprs, fprs, thresholds):
        """
        Performs the adjusted count for the given tpr, fpr, and threshold.
        Due to broadcasting, `tprs`, `fprs`, and `thresholds` could be arrays of length > 1, in
        which case one estimate per threshold is computed.

        :param classif_predictions: array with the classification scores of the instances to
            quantify (it is indexed as `classif_predictions[:, None]`, so a 1-D array is expected)
        :param tprs: true positive rate(s) associated with `thresholds`
        :param fprs: false positive rate(s) associated with `thresholds`
        :param thresholds: decision threshold(s)
        :return: the output of `F.as_binary_prevalence` (with clipping enabled) after squeezing
            singleton dimensions
        """
        # fraction of instances predicted as positive at each threshold
        prevs_estims = np.mean(classif_predictions[:, None] >= thresholds, axis=0)
        # adjusted count; the default `discard` rules out candidates with tpr == fpr
        prevs_estims = (prevs_estims - fprs) / (tprs - fprs)
        prevs_estims = F.as_binary_prevalence(prevs_estims, clip_if_necessary=True)
        return prevs_estims.squeeze()

    def _compute_table(self, y, y_):
        """
        Computes the counts of the binary contingency table.

        :param y: array with the true labels
        :param y_: array with the predicted labels
        :return: the counts `TP, FP, FN, TN`, in this order
        """
        TP = np.logical_and(y == y_, y == self.pos_label).sum()
        FP = np.logical_and(y != y_, y == self.neg_label).sum()
        FN = np.logical_and(y != y_, y == self.pos_label).sum()
        TN = np.logical_and(y == y_, y == self.neg_label).sum()
        return TP, FP, FN, TN

    def _compute_tpr(self, TP, FN):
        """
        Computes the true positive rate `TP / (TP + FN)`.

        :param TP: number of true positives
        :param FN: number of false negatives
        :return: the true positive rate, or 1 if there are no positive examples
        """
        if TP + FN == 0:
            return 1
        return TP / (TP + FN)

    def _compute_fpr(self, FP, TN):
        """
        Computes the false positive rate `FP / (FP + TN)`.

        :param FP: number of false positives
        :param TN: number of true negatives
        :return: the false positive rate, or 0 if there are no negative examples
        """
        if FP + TN == 0:
            return 0
        return FP / (FP + TN)

    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
        """
        Selects the best decision threshold and stores it, along with its `tpr` and `fpr`, in
        the attributes `threshold`, `tpr`, and `fpr`.

        :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` whose `Xy`
            attribute unpacks into the classification scores and the true labels
        :param data: not used by this implementation
        :return: self
        """
        decision_scores, y = classif_predictions.Xy
        # the standard behavior is to keep the best threshold only
        self.tpr, self.fpr, self.threshold = self._eval_candidate_thresholds(decision_scores, y)[0]
        return self

    def aggregate(self, classif_predictions: np.ndarray):
        """
        Computes the adjusted count using the best threshold found in :meth:`aggregation_fit`.

        :param classif_predictions: array with the classification scores of the instances to quantify
        :return: the value returned by :meth:`aggregate_with_threshold` for the stored
            `tpr`, `fpr`, and `threshold`
        """
        # the standard behavior is to compute the adjusted count using the best threshold found
        return self.aggregate_with_threshold(classif_predictions, self.tpr, self.fpr, self.threshold)


class T50(ThresholdOptimization):
    """
    Threshold Optimization variant for :class:`ACC` as proposed by
    `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
    `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that looks
    for the threshold that makes `tpr` closest to 0.5.
    The goal is to bring improved stability to the denominator of the adjustment.

    :param classifier: a sklearn's Estimator that generates a classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
        misclassification rates are to be estimated.
        This parameter can be indicated as a real value (between 0 and 1), representing a proportion of
        validation data, or as an integer, indicating that the misclassification rates should be estimated via
        `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    :param n_jobs: number of parallel workers, forwarded to the constructor of
        :class:`ThresholdOptimization` (defaults to None)
    """

    def __init__(self, classifier: BaseEstimator, val_split=5, n_jobs=None):
        # n_jobs is forwarded so that it can be configured just like in the base class
        super().__init__(classifier, val_split, n_jobs)

    def condition(self, tpr, fpr) -> float:
        """
        Scores a candidate by the distance between its `tpr` and 0.5 (to be minimized).

        :param tpr: float, true positive rate
        :param fpr: float, false positive rate (not used by this criterion)
        :return: float, the absolute difference between `tpr` and 0.5
        """
        return abs(tpr - 0.5)


class MAX(ThresholdOptimization):
    """
    Threshold Optimization variant for :class:`ACC` as proposed by
    `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
    `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that looks
    for the threshold that maximizes `tpr-fpr`.
    The goal is to bring improved stability to the denominator of the adjustment.

    :param classifier: a sklearn's Estimator that generates a classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
        misclassification rates are to be estimated.
        This parameter can be indicated as a real value (between 0 and 1), representing a proportion of
        validation data, or as an integer, indicating that the misclassification rates should be estimated via
        `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    :param n_jobs: number of parallel workers, forwarded to the constructor of
        :class:`ThresholdOptimization` (defaults to None)
    """

    def __init__(self, classifier: BaseEstimator, val_split=5, n_jobs=None):
        # n_jobs is forwarded so that it can be configured just like in the base class
        super().__init__(classifier, val_split, n_jobs)

    def condition(self, tpr, fpr) -> float:
        """
        Scores a candidate by `fpr - tpr` (to be minimized).

        :param tpr: float, true positive rate
        :param fpr: float, false positive rate
        :return: float, the difference `fpr - tpr`
        """
        # MAX strives to maximize (tpr - fpr), which is equivalent to minimize (fpr - tpr)
        return (fpr - tpr)


class X(ThresholdOptimization):
    """
    Threshold Optimization variant for :class:`ACC` as proposed by
    `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
    `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that looks
    for the threshold that yields `tpr=1-fpr`.
    The goal is to bring improved stability to the denominator of the adjustment.

    :param classifier: a sklearn's Estimator that generates a classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
        misclassification rates are to be estimated.
        This parameter can be indicated as a real value (between 0 and 1), representing a proportion of
        validation data, or as an integer, indicating that the misclassification rates should be estimated via
        `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    :param n_jobs: number of parallel workers, forwarded to the constructor of
        :class:`ThresholdOptimization` (defaults to None)
    """

    def __init__(self, classifier: BaseEstimator, val_split=5, n_jobs=None):
        # n_jobs is forwarded so that it can be configured just like in the base class
        super().__init__(classifier, val_split, n_jobs)

    def condition(self, tpr, fpr) -> float:
        """
        Scores a candidate by how far `tpr + fpr` is from 1 (to be minimized); the score is 0
        when `tpr = 1 - fpr`.

        :param tpr: float, true positive rate
        :param fpr: float, false positive rate
        :return: float, the absolute value of `1 - (tpr + fpr)`
        """
        return abs(1 - (tpr + fpr))


class MS(ThresholdOptimization):
    """
    Median Sweep. Threshold Optimization variant for :class:`ACC` as proposed by
    `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
    `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that generates
    class prevalence estimates for all decision thresholds and returns the median of them all.
    The goal is to bring improved stability to the denominator of the adjustment.

    :param classifier: a sklearn's Estimator that generates a classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
        misclassification rates are to be estimated.
        This parameter can be indicated as a real value (between 0 and 1), representing a proportion of
        validation data, or as an integer, indicating that the misclassification rates should be estimated via
        `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    :param n_jobs: number of parallel workers, forwarded to the constructor of
        :class:`ThresholdOptimization` (defaults to None)
    """
    def __init__(self, classifier: BaseEstimator, val_split=5, n_jobs=None):
        # n_jobs is forwarded so that it can be configured just like in the base class
        super().__init__(classifier, val_split, n_jobs)

    def condition(self, tpr, fpr) -> float:
        """
        Returns a constant score: all retained candidates are used, so no ranking is needed.

        :param tpr: float, true positive rate (not used)
        :param fpr: float, false positive rate (not used)
        :return: the constant 1
        """
        return 1

    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
        """
        Stores the `tpr`, `fpr`, and threshold of every retained candidate in the attributes
        `tprs`, `fprs`, and `thresholds`.

        :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` whose `Xy`
            attribute unpacks into the classification scores and the true labels
        :param data: not used by this implementation
        :return: self
        """
        decision_scores, y = classif_predictions.Xy
        # keeps all candidates; the columns are, in this order, tpr, fpr, and threshold
        tprs_fprs_thresholds = self._eval_candidate_thresholds(decision_scores, y)
        self.tprs = tprs_fprs_thresholds[:, 0]
        self.fprs = tprs_fprs_thresholds[:, 1]
        self.thresholds = tprs_fprs_thresholds[:, 2]
        return self

    def aggregate(self, classif_predictions: np.ndarray):
        """
        Computes the adjusted count for every stored threshold and returns the median of the
        estimates.

        :param classif_predictions: array with the classification scores of the instances to quantify
        :return: the median (along the first axis) of the estimates when these come as a 2-D array;
            otherwise, the estimate itself
        """
        prevalences = self.aggregate_with_threshold(classif_predictions, self.tprs, self.fprs, self.thresholds)
        # the result is squeezed, so it is presumably 2-D (one row per threshold) only when more
        # than one threshold is stored -- TODO confirm against F.as_binary_prevalence
        if prevalences.ndim == 2:
            prevalences = np.median(prevalences, axis=0)
        return prevalences


class MS2(MS):
    """
    Median Sweep 2. Threshold Optimization variant for :class:`ACC` as proposed by
    `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
    `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that generates
    class prevalence estimates for all decision thresholds and returns the median of the
    estimates obtained for the cases in which `tpr-fpr>0.25`.
    The goal is to bring improved stability to the denominator of the adjustment.

    :param classifier: a sklearn's Estimator that generates a classifier
    :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
        misclassification rates are to be estimated.
        This parameter can be indicated as a real value (between 0 and 1), representing a proportion of
        validation data, or as an integer, indicating that the misclassification rates should be estimated via
        `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a
        :class:`quapy.data.base.LabelledCollection` (the split itself).
    :param n_jobs: number of parallel workers; the value is resolved through `qp._get_njobs`
        before being stored (defaults to None)
    """
    def __init__(self, classifier: BaseEstimator, val_split=5, n_jobs=None):
        super().__init__(classifier, val_split)
        # n_jobs is resolved here, in the same way ThresholdOptimization does, so that this
        # constructor does not depend on the parent's constructor accepting the argument
        self.n_jobs = qp._get_njobs(n_jobs)

    def discard(self, tpr, fpr) -> bool:
        """
        Discards the candidates whose `tpr - fpr` does not exceed 0.25.

        :param tpr: float, true positive rate
        :param fpr: float, false positive rate
        :return: true if `tpr - fpr <= 0.25`, false otherwise
        """
        return (tpr - fpr) <= 0.25
merged 2024-02-07 18:45:42 +01:00			`from abc import abstractmethod`

			`import numpy as np`
			`from sklearn.base import BaseEstimator`
			`import quapy as qp`
			`import quapy.functional as F`
			`from quapy.data import LabelledCollection`
			`from quapy.method.aggregative import BinaryAggregativeQuantifier`


			`class ThresholdOptimization(BinaryAggregativeQuantifier):`
			`"""`
			Abstract class of Threshold Optimization variants for :class:`ACC` as proposed by
			`Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
			`Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_.
			`The goal is to bring improved stability to the denominator of the adjustment.`
			`The different variants are based on different heuristics for choosing a decision threshold`
			`that would allow for more true positives and many more false positives, on the grounds this`
			`would deliver larger denominators.`

			`:param classifier: a sklearn's Estimator that generates a classifier`
			`:param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the`
			`misclassification rates are to be estimated.`
			`This parameter can be indicated as a real value (between 0 and 1), representing a proportion of`
			`validation data, or as an integer, indicating that the misclassification rates should be estimated via`
			`k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a
			:class:`quapy.data.base.LabelledCollection` (the split itself).
			`"""`

			`def __init__(self, classifier: BaseEstimator, val_split=5, n_jobs=None):`
			`self.classifier = classifier`
			`self.val_split = val_split`
			`self.n_jobs = qp._get_njobs(n_jobs)`

			`@abstractmethod`
			`def condition(self, tpr, fpr) -> float:`
			`"""`
			`Implements the criterion according to which the threshold should be selected.`
			`This function should return the (float) score to be minimized.`

			`:param tpr: float, true positive rate`
			`:param fpr: float, false positive rate`
			:return: float, a score for the given `tpr` and `fpr`
			`"""`
			`...`

			`def discard(self, tpr, fpr) -> bool:`
			`"""`
			`Indicates whether a combination of tpr and fpr should be discarded`

			`:param tpr: float, true positive rate`
			`:param fpr: float, false positive rate`
			`:return: true if the combination is to be discarded, false otherwise`
			`"""`
			`return (tpr - fpr) == 0`


			`def _eval_candidate_thresholds(self, decision_scores, y):`
			`"""`
			Seeks for the best `tpr` and `fpr` according to the score obtained at different
			decision thresholds. The scoring function is implemented in function `_condition`.

			`:param decision_scores: array-like with the classification scores`
			:param y: predicted labels for the validation set (or for the training set via `k`-fold cross validation)
			:return: best `tpr` and `fpr` and `threshold` according to `_condition`
			`"""`
			`candidate_thresholds = np.unique(decision_scores)`

			`candidates = []`
			`scores = []`
			`for candidate_threshold in candidate_thresholds:`
			`y_ = self.classes_[1 * (decision_scores >= candidate_threshold)]`
			`TP, FP, FN, TN = self._compute_table(y, y_)`
			`tpr = self._compute_tpr(TP, FN)`
			`fpr = self._compute_fpr(FP, TN)`
			`if not self.discard(tpr, fpr):`
			`candidate_score = self.condition(tpr, fpr)`
			`candidates.append([tpr, fpr, candidate_threshold])`
			`scores.append(candidate_score)`

			`if len(candidates) == 0:`
			`# if no candidate gives rise to a valid combination of tpr and fpr, this method defaults to the standard`
			`# classify & count; this is akin to assign tpr=1, fpr=0, threshold=0`
			`tpr, fpr, threshold = 1, 0, 0`
			`candidates.append([tpr, fpr, threshold])`
			`scores.append(0)`

			`candidates = np.asarray(candidates)`
			`candidates = candidates[np.argsort(scores)] # sort candidates by candidate_score`

			`return candidates`

			`def aggregate_with_threshold(self, classif_predictions, tprs, fprs, thresholds):`
			`# This function performs the adjusted count for given tpr, fpr, and threshold.`
			`# Note that, due to broadcasting, tprs, fprs, and thresholds could be arrays of length > 1`
			`prevs_estims = np.mean(classif_predictions[:, None] >= thresholds, axis=0)`
			`prevs_estims = (prevs_estims - fprs) / (tprs - fprs)`
			`prevs_estims = F.as_binary_prevalence(prevs_estims, clip_if_necessary=True)`
			`return prevs_estims.squeeze()`

			`def _compute_table(self, y, y_):`
			`TP = np.logical_and(y == y_, y == self.pos_label).sum()`
			`FP = np.logical_and(y != y_, y == self.neg_label).sum()`
			`FN = np.logical_and(y != y_, y == self.pos_label).sum()`
			`TN = np.logical_and(y == y_, y == self.neg_label).sum()`
			`return TP, FP, FN, TN`

			`def _compute_tpr(self, TP, FP):`
			`if TP + FP == 0:`
			`return 1`
			`return TP / (TP + FP)`

			`def _compute_fpr(self, FP, TN):`
			`if FP + TN == 0:`
			`return 0`
			`return FP / (FP + TN)`

			`def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):`
			`decision_scores, y = classif_predictions.Xy`
			`# the standard behavior is to keep the best threshold only`
			`self.tpr, self.fpr, self.threshold = self._eval_candidate_thresholds(decision_scores, y)[0]`
			`return self`

			`def aggregate(self, classif_predictions: np.ndarray):`
			`# the standard behavior is to compute the adjusted count using the best threshold found`
			`return self.aggregate_with_threshold(classif_predictions, self.tpr, self.fpr, self.threshold)`


			`class T50(ThresholdOptimization):`
			`"""`
			Threshold Optimization variant for :class:`ACC` as proposed by
			`Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
			`Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that looks
			for the threshold that makes `tpr` closest to 0.5.
			`The goal is to bring improved stability to the denominator of the adjustment.`

			`:param classifier: a sklearn's Estimator that generates a classifier`
			`:param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the`
			`misclassification rates are to be estimated.`
			`This parameter can be indicated as a real value (between 0 and 1), representing a proportion of`
			`validation data, or as an integer, indicating that the misclassification rates should be estimated via`
			`k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a
			:class:`quapy.data.base.LabelledCollection` (the split itself).
			`"""`

			`def __init__(self, classifier: BaseEstimator, val_split=5):`
			`super().__init__(classifier, val_split)`

			`def condition(self, tpr, fpr) -> float:`
			`return abs(tpr - 0.5)`


			`class MAX(ThresholdOptimization):`
			`"""`
			Threshold Optimization variant for :class:`ACC` as proposed by
			`Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
			`Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that looks
			for the threshold that maximizes `tpr-fpr`.
			`The goal is to bring improved stability to the denominator of the adjustment.`

			`:param classifier: a sklearn's Estimator that generates a classifier`
			`:param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the`
			`misclassification rates are to be estimated.`
			`This parameter can be indicated as a real value (between 0 and 1), representing a proportion of`
			`validation data, or as an integer, indicating that the misclassification rates should be estimated via`
			`k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a
			:class:`quapy.data.base.LabelledCollection` (the split itself).
			`"""`

			`def __init__(self, classifier: BaseEstimator, val_split=5):`
			`super().__init__(classifier, val_split)`

			`def condition(self, tpr, fpr) -> float:`
			`# MAX strives to maximize (tpr - fpr), which is equivalent to minimize (fpr - tpr)`
			`return (fpr - tpr)`


			`class X(ThresholdOptimization):`
			`"""`
			Threshold Optimization variant for :class:`ACC` as proposed by
			`Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
			`Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that looks
			for the threshold that yields `tpr=1-fpr`.
			`The goal is to bring improved stability to the denominator of the adjustment.`

			`:param classifier: a sklearn's Estimator that generates a classifier`
			`:param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the`
			`misclassification rates are to be estimated.`
			`This parameter can be indicated as a real value (between 0 and 1), representing a proportion of`
			`validation data, or as an integer, indicating that the misclassification rates should be estimated via`
			`k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a
			:class:`quapy.data.base.LabelledCollection` (the split itself).
			`"""`

			`def __init__(self, classifier: BaseEstimator, val_split=5):`
			`super().__init__(classifier, val_split)`

			`def condition(self, tpr, fpr) -> float:`
			`return abs(1 - (tpr + fpr))`


			`class MS(ThresholdOptimization):`
			`"""`
			Median Sweep. Threshold Optimization variant for :class:`ACC` as proposed by
			`Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
			`Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that generates
			`class prevalence estimates for all decision thresholds and returns the median of them all.`
			`The goal is to bring improved stability to the denominator of the adjustment.`

			`:param classifier: a sklearn's Estimator that generates a classifier`
			`:param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the`
			`misclassification rates are to be estimated.`
			`This parameter can be indicated as a real value (between 0 and 1), representing a proportion of`
			`validation data, or as an integer, indicating that the misclassification rates should be estimated via`
			`k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a
			:class:`quapy.data.base.LabelledCollection` (the split itself).
			`"""`
			`def __init__(self, classifier: BaseEstimator, val_split=5):`
			`super().__init__(classifier, val_split)`

			`def condition(self, tpr, fpr) -> float:`
			`return 1`

			`def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):`
			`decision_scores, y = classif_predictions.Xy`
			`# keeps all candidates`
			`tprs_fprs_thresholds = self._eval_candidate_thresholds(decision_scores, y)`
			`self.tprs = tprs_fprs_thresholds[:, 0]`
			`self.fprs = tprs_fprs_thresholds[:, 1]`
			`self.thresholds = tprs_fprs_thresholds[:, 2]`
			`return self`

			`def aggregate(self, classif_predictions: np.ndarray):`
			`prevalences = self.aggregate_with_threshold(classif_predictions, self.tprs, self.fprs, self.thresholds)`
			`if prevalences.ndim==2:`
			`prevalences = np.median(prevalences, axis=0)`
			`return prevalences`


			`class MS2(MS):`
			`"""`
			Median Sweep 2. Threshold Optimization variant for :class:`ACC` as proposed by
			`Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
			`Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that generates
			`class prevalence estimates for all decision thresholds and returns the median of for cases in`
			which `tpr-fpr>0.25`
			`The goal is to bring improved stability to the denominator of the adjustment.`

			`:param classifier: a sklearn's Estimator that generates a classifier`
			`:param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the`
			`misclassification rates are to be estimated.`
			`This parameter can be indicated as a real value (between 0 and 1), representing a proportion of`
			`validation data, or as an integer, indicating that the misclassification rates should be estimated via`
			`k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a
			:class:`quapy.data.base.LabelledCollection` (the split itself).
			`"""`
			`def __init__(self, classifier: BaseEstimator, val_split=5):`
			`super().__init__(classifier, val_split)`

			`def discard(self, tpr, fpr) -> bool:`
			`return (tpr-fpr) <= 0.25`