optimization threshold variants fixed

This commit is contained in:
Alejandro Moreo Fernandez 2024-01-18 18:22:22 +01:00
parent 9b2470c992
commit c0d92a2083
3 changed files with 27 additions and 17 deletions

View File

@ -15,7 +15,7 @@ import itertools
import argparse
from glob import glob
import pandas as pd
from time import time
N_JOBS = -1
@ -38,10 +38,11 @@ svmperf_params = {'classifier__C': __C_range}
def quantification_models():
yield 'acc', ACC(newLR()), lr_params
yield 'T50', T50(newLR()), lr_params
#yield 'X', X(newLR()), lr_params
#yield 'MAX', MAX(newLR()), lr_params
yield 'X', X(newLR()), lr_params
yield 'MAX', MAX(newLR()), lr_params
yield 'MS', MS(newLR()), lr_params
yield 'MS2', MS2(newLR()), lr_params
yield 'MS+', MS(newLR()), lr_params
# yield 'MS2', MS2(newLR()), lr_params
@ -115,8 +116,10 @@ if __name__ == '__main__':
optim_losses = ['mae']
datasets = qp.datasets.UCI_DATASETS
tstart = time()
models = quantification_models()
qp.util.parallel(run, itertools.product(optim_losses, datasets, models), n_jobs=N_JOBS)
tend = time()
# open all results and show
df = pd.DataFrame(columns=('method', 'dataset', 'mae'))
@ -126,6 +129,6 @@ if __name__ == '__main__':
dataset = '-'.join(dataset)
df.loc[i] = [method, dataset, mae]
print(df.pivot_table(index='dataset', columns='method', values='mae'))
print(df.pivot_table(index='dataset', columns='method', values='mae', margins=True))
print(f'took {(tend-tstart)}s')

View File

@ -66,7 +66,7 @@ def prevalence_from_probabilities(posteriors, binarize: bool = False):
return prevalences
def as_binary_prevalence(positive_prevalence: float, clip_if_necessary=False):
def as_binary_prevalence(positive_prevalence: Union[float, np.ndarray], clip_if_necessary=False):
"""
Helper that, given a float representing the prevalence for the positive class, returns a np.ndarray of two
values representing a binary distribution.
@ -80,7 +80,8 @@ def as_binary_prevalence(positive_prevalence: float, clip_if_necessary=False):
positive_prevalence = np.clip(positive_prevalence, 0, 1)
else:
assert 0 <= positive_prevalence <= 1, 'the value provided is not a valid prevalence for the positive class'
return np.asarray([1-positive_prevalence, positive_prevalence])
return np.asarray([1-positive_prevalence, positive_prevalence]).T
def HellingerDistance(P, Q) -> float:

View File

@ -1102,7 +1102,7 @@ class ThresholdOptimization(BinaryAggregativeQuantifier):
:param fpr: float, false positive rate
:return: true if the combination is to be discarded, false otherwise
"""
return (tpr + fpr) == 0
return (tpr - fpr) == 0
def _eval_candidate_thresholds(self, decision_scores, y):
@ -1119,9 +1119,9 @@ class ThresholdOptimization(BinaryAggregativeQuantifier):
candidates = []
scores = []
for candidate_threshold in candidate_thresholds:
y_ = self.classes_[1 * (decision_scores > candidate_threshold)]
y_ = self.classes_[1 * (decision_scores >= candidate_threshold)]
TP, FP, FN, TN = self._compute_table(y, y_)
tpr = self._compute_tpr(TP, FP)
tpr = self._compute_tpr(TP, FN)
fpr = self._compute_fpr(FP, TN)
if not self.discard(tpr, fpr):
candidate_score = self.condition(tpr, fpr)
@ -1139,12 +1139,18 @@ class ThresholdOptimization(BinaryAggregativeQuantifier):
return candidates
def aggregate_with_threshold(self, classif_predictions, tpr, fpr, threshold):
    """
    Compute the adjusted-count prevalence estimate for a single (tpr, fpr, threshold) triple.

    :param classif_predictions: decision scores to be compared against `threshold`
    :param tpr: float, true positive rate associated with `threshold`
    :param fpr: float, false positive rate associated with `threshold`
    :param threshold: float, scores strictly greater than this value count as positive
    :return: the value returned by `F.as_binary_prevalence` (called with clipping enabled) for the
        estimated positive prevalence
    """
    # proportion of scores lying strictly above the threshold
    positive_rate = np.mean(classif_predictions > threshold)
    denominator = tpr - fpr
    # the correction is undefined when tpr == fpr; the uncorrected proportion is kept in that case
    if denominator != 0:
        positive_rate = (positive_rate - fpr) / denominator
    return F.as_binary_prevalence(positive_rate, clip_if_necessary=True)
# def aggregate_with_threshold(self, classif_predictions, tpr, fpr, threshold):
# prevs_estim = np.mean(classif_predictions >= threshold)
# if tpr - fpr != 0:
# prevs_estim = (prevs_estim - fpr) / (tpr - fpr)
# prevs_estim = F.as_binary_prevalence(prevs_estim, clip_if_necessary=True)
# return prevs_estim
def aggregate_with_threshold(self, classif_predictions, tprs, fprs, thresholds):
    """
    Apply the adjusted-count correction for one or several (tpr, fpr, threshold) triples at once.

    Thanks to broadcasting, `tprs`, `fprs` and `thresholds` may be scalars or arrays of equal
    length; with arrays, one estimate is produced per threshold.

    :param classif_predictions: decision scores; indexed as `[:, None]`, so it is assumed to be a
        one-dimensional np.ndarray
    :param tprs: float or np.ndarray, true positive rate(s) associated with `thresholds`
    :param fprs: float or np.ndarray, false positive rate(s) associated with `thresholds`
    :param thresholds: float or np.ndarray, scores greater than or equal to a threshold count as
        positive
    :return: the squeezed output of `F.as_binary_prevalence` (called with clipping enabled) for the
        estimated positive prevalence(s)
    """
    # proportion of scores at or above each threshold (one value per threshold)
    raw_estims = np.mean(classif_predictions[:, None] >= thresholds, axis=0)
    numerators = raw_estims - fprs
    denominators = np.asarray(tprs - fprs, dtype=float)
    # The correction is undefined where tpr == fpr. Dividing there would yield inf/nan (and nan
    # survives clipping), so those entries keep the unadjusted proportion instead.
    adjustable = denominators != 0
    safe_denominators = np.where(adjustable, denominators, 1.0)
    prevs_estims = np.where(adjustable, numerators / safe_denominators, raw_estims)
    prevs_estims = F.as_binary_prevalence(prevs_estims, clip_if_necessary=True)
    return prevs_estims.squeeze()
def _compute_table(self, y, y_):
TP = np.logical_and(y == y_, y == self.pos_label).sum()