From eb860e9678c396d5ce5bcba703fb8e36a4ad0403 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Mon, 12 Dec 2022 09:34:09 +0100 Subject: [PATCH] adding the possibility to estimate the training prevalence, instead of using the true training prevalence, as a starting point in emq --- examples/lequa2022_experiments.py | 13 ++++++++++--- quapy/method/aggregative.py | 12 ++++++++++-- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/examples/lequa2022_experiments.py b/examples/lequa2022_experiments.py index 0df7d15..31ec651 100644 --- a/examples/lequa2022_experiments.py +++ b/examples/lequa2022_experiments.py @@ -1,6 +1,8 @@ import numpy as np +from sklearn.calibration import CalibratedClassifierCV from sklearn.linear_model import LogisticRegression import quapy as qp +import quapy.functional as F from data.datasets import LEQUA2022_SAMPLE_SIZE, fetch_lequa2022 from evaluation import evaluation_report from method.aggregative import EMQ @@ -14,7 +16,8 @@ qp.environ['SAMPLE_SIZE'] = LEQUA2022_SAMPLE_SIZE[task] training, val_generator, test_generator = fetch_lequa2022(task=task) # define the quantifier -quantifier = EMQ(learner=LogisticRegression()) +learner = CalibratedClassifierCV(LogisticRegression()) +quantifier = EMQ(learner=learner) # model selection param_grid = {'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]} @@ -24,6 +27,10 @@ quantifier = model_selection.fit(training) # evaluation report = evaluation_report(quantifier, protocol=test_generator, error_metrics=['mae', 'mrae', 'mkld'], verbose=True) -pd.set_option('display.max_columns', None) -pd.set_option('display.width', 1000) +# printing results +pd.set_option('display.expand_frame_repr', False) +report['estim-prev'] = report['estim-prev'].map(F.strprev) print(report) + +print('Averaged values:') +print(report.mean()) diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index 19d365b..202b5dd 100644 --- a/quapy/method/aggregative.py +++ 
b/quapy/method/aggregative.py @@ -501,17 +501,25 @@ class EMQ(AggregativeProbabilisticQuantifier): maximum-likelihood estimation, in a mutually recursive way, until convergence. :param learner: a sklearn's Estimator that generates a classifier + :param exact_train_prev: set to True (default) for using, as the initial observation, the true training prevalence; + or set to False for computing the training prevalence as an estimate, akin to PCC, i.e., as the expected + value of the posterior probabilities of the training documents as suggested in + `Alexandari et al. paper `_: """ MAX_ITER = 1000 EPSILON = 1e-4 - def __init__(self, learner: BaseEstimator): + def __init__(self, learner: BaseEstimator, exact_train_prev=True): self.learner = learner + self.exact_train_prev = exact_train_prev def fit(self, data: LabelledCollection, fit_learner=True): self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True) - self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_) + if self.exact_train_prev: + self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_) + else: + self.train_prevalence = PCC(learner=self.learner).fit(data, fit_learner=False).quantify(data.X) return self def aggregate(self, classif_posteriors, epsilon=EPSILON):