bugfix in protocols, return_type='index' not working

2024-11-19 16:00:03 +01:00 · 2024-11-19 16:00:03 +01:00 · e6ae1e7d77
parent 24c28edfd9
commit e6ae1e7d77
5 changed files with 108 additions and 10 deletions
--- a/TODO.txt
+++ b/TODO.txt
@ -0,0 +1,3 @@
+- Test the return_type="index" in protocols and finish the "distributing_samples.py" example
+- Add EDy (an implementation is available at quantificationlib)
+-
--- a/examples/0.basics.py
+++ b/examples/0.basics.py
@ -33,10 +33,8 @@ import quapy.functional as F  # <- this module has some functional utilities, li
 print(f'training prevalence = {F.strprev(train.prevalence())}')

 # let us train one quantifier, for example, PACC using a sklearn's Logistic Regressor as the underlying classifier
-# classifier = LogisticRegression()
-
-# pacc = qp.method.aggregative.PACC(classifier)
-pacc = qp.method.aggregative.PACC()
+classifier = LogisticRegression()
+pacc = qp.method.aggregative.PACC(classifier)

 print(f'training {pacc}')
 pacc.fit(train)
--- a/examples/distributing_samples.py
+++ b/examples/distributing_samples.py
@ -0,0 +1,38 @@
+"""
+Imagine we want to generate many samples out of a collection, that we want to distribute for others to run their
+own experiments in the very same test samples. One naive solution would come down to applying a given protocol to
+our collection (say the artificial prevalence protocol on the 'academic-success' UCI dataset), store all those samples
+on disk and make them available online. Distributing many such samples is undesirable.
+In this example, we generate the indexes that allow anyone to regenerate the samples out of the original collection.
+"""
+
+import quapy as qp
+from quapy.method.aggregative import PACC
+from quapy.protocol import UPP
+
+data = qp.datasets.fetch_UCIMulticlassDataset('academic-success')
+train, test = data.train_test
+
+# let us train a quantifier to check whether we can actually replicate the results
+quantifier = PACC()
+quantifier.fit(train)
+
+# let us simulate our experimental results
+protocol = UPP(test, sample_size=100, repeats=100, random_state=0)
+our_mae = qp.evaluation.evaluate(quantifier, protocol=protocol, error_metric='mae')
+
+print(f'We have obtained a MAE={our_mae:.3f}')
+
+# let us distribute the indexes; we specify that we want the indexes, not the samples
+protocol = UPP(test, sample_size=100, repeats=100, random_state=0, return_type='index')
+indexes = protocol.samples_parameters()
+
+# Imagine we distribute the indexes; now we show how to replicate our experiments.
+from quapy.protocol import ProtocolFromIndex
+data = qp.datasets.fetch_UCIMulticlassDataset('academic-success')
+train, test = data.train_test
+protocol = ProtocolFromIndex(data=test, indexes=indexes)
+their_mae = qp.evaluation.evaluate(quantifier, protocol=protocol, error_metric='mae')
+
+# report the replicated value (not our own one), so that both figures can actually be compared
+print(f'Another lab obtains a MAE={their_mae:.3f}')
+
--- a/quapy/error.py
+++ b/quapy/error.py
@ -298,6 +298,31 @@ def nmd(prevs, prevs_hat):
    return (1./(n-1))*np.mean(match_distance(prevs, prevs_hat))


+def bias_binary(prevs, prevs_hat):
+    """
+    Computes the (positive) bias in a binary problem. The bias is simply the difference between the
+    predicted positive value and the true positive value, so that a positive such value indicates the
+    prediction has positive bias (i.e., it tends to overestimate the true value), and negative otherwise.
+    :math:`bias(p,\\hat{p})=\\hat{p}_1-p_1`,
+    :param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values
+    :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted
+        prevalence values
+    :return: binary bias
+    """
+    # accept any array-like (e.g., plain lists), as documented, before inspecting the shapes
+    prevs = np.asarray(prevs)
+    prevs_hat = np.asarray(prevs_hat)
+    # both the true and the predicted prevalence vectors must be binary
+    assert prevs.shape[-1] == 2 and prevs_hat.shape[-1] == 2, 'bias_binary can only be applied to binary problems'
+    return prevs_hat[...,1]-prevs[...,1]
+
+
+def mean_bias_binary(prevs, prevs_hat):
+    """
+    Computes the mean of the (positive) bias in a binary problem, i.e., the average of the values
+    returned by :meth:`bias_binary`.
+    :param prevs: array-like of shape `(n_classes,)` with the true prevalence values
+    :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values
+    :return: mean binary bias
+    """
+    signed_bias = bias_binary(prevs, prevs_hat)
+    return np.mean(signed_bias)
+
+
 def md(prevs, prevs_hat, ERROR_TOL=1E-3):
    """
    Computes the Match Distance, under the assumption that the cost in mistaking class i with class i+1 is 1 in
@ -338,8 +363,8 @@ def __check_eps(eps=None):


 CLASSIFICATION_ERROR = {f1e, acce}
-QUANTIFICATION_ERROR = {mae, mnae, mrae, mnrae, mse, mkld, mnkld}
-QUANTIFICATION_ERROR_SINGLE = {ae, nae, rae, nrae, se, kld, nkld}
+QUANTIFICATION_ERROR = {mae, mnae, mrae, mnrae, mse, mkld, mnkld, mean_bias_binary}
+QUANTIFICATION_ERROR_SINGLE = {ae, nae, rae, nrae, se, kld, nkld, bias_binary}
 QUANTIFICATION_ERROR_SMOOTH = {kld, nkld, rae, nrae, mkld, mnkld, mrae}
 CLASSIFICATION_ERROR_NAMES = {func.__name__ for func in CLASSIFICATION_ERROR}
 QUANTIFICATION_ERROR_NAMES = {func.__name__ for func in QUANTIFICATION_ERROR}
--- a/quapy/protocol.py
+++ b/quapy/protocol.py
@ -1,4 +1,6 @@
 from copy import deepcopy
+from typing import Iterable
+
 import quapy as qp
 import numpy as np
 import itertools
@ -62,6 +64,36 @@ class IterateProtocol(AbstractProtocol):
        return len(self.samples)


+class ProtocolFromIndex(AbstractProtocol):
+    """
+    A protocol from a list of indexes. Each element of `indexes` is passed to
+    `data.sampling_from_index` in order to draw one sample.
+
+    :param data: a :class:`quapy.data.base.LabelledCollection`
+    :param indexes: an iterable in which each element is the index that defines one sample
+    """
+    def __init__(self, data: LabelledCollection, indexes: Iterable):
+        self.data = data
+        # materialize the iterable: `total` relies on `len`, and a generator would otherwise be
+        # exhausted after the first pass over the protocol
+        self.indexes = list(indexes)
+
+    def __call__(self):
+        """
+        Yields one sample at a time extracted using the indexes
+
+        :return: yields a tuple `(sample, prev)` at a time, where `sample` is a set of instances
+            and in which `prev` is an `nd.array` with the class prevalence values
+        """
+        for index in self.indexes:
+            yield self.data.sampling_from_index(index).Xp
+
+    def total(self):
+        """
+        Returns the number of samples in this protocol
+
+        :return: int
+        """
+        return len(self.indexes)
+
+
 class AbstractStochasticSeededProtocol(AbstractProtocol):
    """
    An `AbstractStochasticSeededProtocol` is a protocol that generates, via any random procedure (e.g.,
@ -124,9 +156,9 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
            if self.random_state is not None:
                stack.enter_context(qp.util.temp_seed(self.random_state))
            for params in self.samples_parameters():
-                yield self.collator(self.sample(params))
+                yield self.collator(self.sample(params), params)

-    def collator(self, sample, *args):
+    def collator(self, sample, params):
        """
        The collator prepares the sample to accommodate the desired output format before returning the output.
        This collator simply returns the sample as it is. Classes inheriting from this abstract class can
@ -191,9 +223,11 @@ class OnLabelledCollectionProtocol:
        assert return_type in cls.RETURN_TYPES, \
            f'unknown return type passed as argument; valid ones are {cls.RETURN_TYPES}'
        if return_type=='sample_prev':
-            return lambda lc:lc.Xp
+            return lambda lc,params:lc.Xp
        elif return_type=='labelled_collection':
-            return lambda lc:lc
+            return lambda lc,params:lc
+        elif return_type=='index':
+            return lambda lc,params:params


 class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):