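Fix threshold-optimization variants: compute TPR from (TP, FN), discard candidates where tpr == fpr, use inclusive (>=) thresholds, and vectorize aggregate_with_threshold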

2024-01-18 18:22:22 +01:00 · 2024-01-18 18:22:22 +01:00 · c0d92a2083
parent 9b2470c992
commit c0d92a2083
3 changed files with 27 additions and 17 deletions
--- a/examples/_uci_experiments_checking_optim_threshold_modifications.py
+++ b/examples/_uci_experiments_checking_optim_threshold_modifications.py
@ -15,7 +15,7 @@ import itertools
 import argparse
 from glob import glob
 import pandas as pd
-
+from time import time

 N_JOBS = -1

@ -38,10 +38,11 @@ svmperf_params = {'classifier__C': __C_range}
 def quantification_models():
    yield 'acc', ACC(newLR()), lr_params
    yield 'T50', T50(newLR()), lr_params
-    #yield 'X', X(newLR()), lr_params
-    #yield 'MAX', MAX(newLR()), lr_params
+    yield 'X', X(newLR()), lr_params
+    yield 'MAX', MAX(newLR()), lr_params
    yield 'MS', MS(newLR()), lr_params
-    yield 'MS2', MS2(newLR()), lr_params
+    yield 'MS+', MS(newLR()), lr_params
+    # yield 'MS2', MS2(newLR()), lr_params



@ -115,8 +116,10 @@ if __name__ == '__main__':
    optim_losses = ['mae']
    datasets = qp.datasets.UCI_DATASETS

+    tstart = time()
    models = quantification_models()
    qp.util.parallel(run, itertools.product(optim_losses, datasets, models), n_jobs=N_JOBS)
+    tend = time()

    # open all results and show
    df = pd.DataFrame(columns=('method', 'dataset', 'mae'))
@ -126,6 +129,6 @@ if __name__ == '__main__':
        dataset = '-'.join(dataset)
        df.loc[i] = [method, dataset, mae]

-    print(df.pivot_table(index='dataset', columns='method', values='mae'))
-
+    print(df.pivot_table(index='dataset', columns='method', values='mae', margins=True))

+    print(f'took {(tend-tstart)}s')
--- a/quapy/functional.py
+++ b/quapy/functional.py
@ -66,7 +66,7 @@ def prevalence_from_probabilities(posteriors, binarize: bool = False):
        return prevalences


-def as_binary_prevalence(positive_prevalence: float, clip_if_necessary=False):
+def as_binary_prevalence(positive_prevalence: Union[float, np.ndarray], clip_if_necessary=False):
    """
    Helper that, given a float representing the prevalence for the positive class, returns a np.ndarray of two
    values representing a binary distribution.
@ -80,7 +80,8 @@ def as_binary_prevalence(positive_prevalence: float, clip_if_necessary=False):
        positive_prevalence = np.clip(positive_prevalence, 0, 1)
    else:
        assert 0 <= positive_prevalence <= 1, 'the value provided is not a valid prevalence for the positive class'
-    return np.asarray([1-positive_prevalence, positive_prevalence])
+    return np.asarray([1-positive_prevalence, positive_prevalence]).T
+


 def HellingerDistance(P, Q) -> float:
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@ -1102,7 +1102,7 @@ class ThresholdOptimization(BinaryAggregativeQuantifier):
        :param fpr: float, false positive rate
        :return: true if the combination is to be discarded, false otherwise
        """
-        return (tpr + fpr) == 0
+        return (tpr - fpr) == 0


    def _eval_candidate_thresholds(self, decision_scores, y):
@ -1119,9 +1119,9 @@ class ThresholdOptimization(BinaryAggregativeQuantifier):
        candidates = []
        scores = []
        for candidate_threshold in candidate_thresholds:
-            y_ = self.classes_[1 * (decision_scores > candidate_threshold)]
+            y_ = self.classes_[1 * (decision_scores >= candidate_threshold)]
            TP, FP, FN, TN = self._compute_table(y, y_)
-            tpr = self._compute_tpr(TP, FP)
+            tpr = self._compute_tpr(TP, FN)
            fpr = self._compute_fpr(FP, TN)
            if not self.discard(tpr, fpr):
                candidate_score = self.condition(tpr, fpr)
@ -1139,12 +1139,18 @@ class ThresholdOptimization(BinaryAggregativeQuantifier):

        return candidates

-    def aggregate_with_threshold(self, classif_predictions, tpr, fpr, threshold):
-        prevs_estim = np.mean(classif_predictions > threshold)
-        if tpr - fpr != 0:
-            prevs_estim = (prevs_estim - fpr) / (tpr - fpr)
-        prevs_estim = F.as_binary_prevalence(prevs_estim, clip_if_necessary=True)
-        return prevs_estim
+    # def aggregate_with_threshold(self, classif_predictions, tpr, fpr, threshold):
+    #     prevs_estim = np.mean(classif_predictions >= threshold)
+    #     if tpr - fpr != 0:
+    #         prevs_estim = (prevs_estim - fpr) / (tpr - fpr)
+    #     prevs_estim = F.as_binary_prevalence(prevs_estim, clip_if_necessary=True)
+    #     return prevs_estim
+
+    def aggregate_with_threshold(self, classif_predictions, tprs, fprs, thresholds):
+        prevs_estims = np.mean(classif_predictions[:, None] >= thresholds, axis=0)
+        prevs_estims = (prevs_estims - fprs) / (tprs - fprs)
+        prevs_estims = F.as_binary_prevalence(prevs_estims, clip_if_necessary=True)
+        return prevs_estims.squeeze()

    def _compute_table(self, y, y_):
        TP = np.logical_and(y == y_, y == self.pos_label).sum()