From 25a829996e0fe141137c1cdb1597d5628c01ea68 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Tue, 14 Feb 2023 11:14:38 +0100
Subject: [PATCH] evaluation updated

---
 examples/explicit_loss_minimization.py |  6 +-
 examples/one_vs_all.py                 |  8 +--
 quapy/CHANGE_LOG.txt                   |  2 +-
 quapy/error.py                         |  4 +-
 quapy/evaluation.py                    | 91 ++++++++++++++++++++++++--
 quapy/method/neural.py                 |  4 +-
 quapy/protocol.py                      | 13 ++--
 quapy/tests/test_evaluation.py         | 33 +++++++++-
 quapy/tests/test_protocols.py          |  8 +--
 9 files changed, 143 insertions(+), 26 deletions(-)

diff --git a/examples/explicit_loss_minimization.py b/examples/explicit_loss_minimization.py
index cefbb3c..fcc07f3 100644
--- a/examples/explicit_loss_minimization.py
+++ b/examples/explicit_loss_minimization.py
@@ -2,7 +2,7 @@ import quapy as qp
 from quapy.method.aggregative import newELM
 from quapy.method.base import newOneVsAll
 from quapy.model_selection import GridSearchQ
-from quapy.protocol import USimplexPP
+from quapy.protocol import UPP

 """
 In this example, we will show hoy to define a quantifier based on explicit loss minimization (ELM).
@@ -57,7 +57,7 @@ param_grid = {
     'binary_quantifier__classifier__C': [0.01, 1, 100],  # classifier-dependent hyperparameter
 }
 print('starting model selection')
-model_selection = GridSearchQ(quantifier, param_grid, protocol=USimplexPP(val), verbose=True, refit=False)
+model_selection = GridSearchQ(quantifier, param_grid, protocol=UPP(val), verbose=True, refit=False)
 quantifier = model_selection.fit(train_modsel).best_model()

 print('training on the whole training set')
@@ -65,7 +65,7 @@ train, test = qp.datasets.fetch_twitter('hcr', for_model_selection=False, pickle
 quantifier.fit(train)

 # evaluation
-mae = qp.evaluation.evaluate(quantifier, protocol=USimplexPP(test), error_metric='mae')
+mae = qp.evaluation.evaluate(quantifier, protocol=UPP(test), error_metric='mae')
 print(f'MAE = {mae:.4f}')


diff --git a/examples/one_vs_all.py b/examples/one_vs_all.py
index 8aad376..3f5c4ac 100644
--- a/examples/one_vs_all.py
+++ b/examples/one_vs_all.py
@@ -2,7 +2,7 @@ import quapy as qp
 from quapy.method.aggregative import MS2
 from quapy.method.base import newOneVsAll
 from quapy.model_selection import GridSearchQ
-from quapy.protocol import USimplexPP
+from quapy.protocol import UPP
 from sklearn.linear_model import LogisticRegression
 import numpy as np

@@ -29,7 +29,7 @@ print(f'the quantifier is an instance of {quantifier.__class__.__name__}')
 train_modsel, val = qp.datasets.fetch_twitter('hcr', for_model_selection=True, pickle=True).train_test

 """
-model selection: for this example, we are relying on the USimplexPP protocol, i.e., a variant of the
+model selection: for this example, we are relying on the UPP protocol, i.e., a variant of the
 artificial-prevalence protocol that generates random samples (100 in this case) for randomly picked priors
 from the unit simplex. The priors are sampled using the Kraemer algorithm. Note this is in contrast to the
 standard APP protocol, that instead explores a prefixed grid of prevalence values.
@@ -39,7 +39,7 @@ param_grid = {
     'binary_quantifier__classifier__class_weight': ['balanced', None]  # classifier-dependent hyperparameter
 }
 print('starting model selection')
-model_selection = GridSearchQ(quantifier, param_grid, protocol=USimplexPP(val), verbose=True, refit=False)
+model_selection = GridSearchQ(quantifier, param_grid, protocol=UPP(val), verbose=True, refit=False)
 quantifier = model_selection.fit(train_modsel).best_model()

 print('training on the whole training set')
@@ -47,7 +47,7 @@ train, test = qp.datasets.fetch_twitter('hcr', for_model_selection=False, pickle
 quantifier.fit(train)

 # evaluation
-mae = qp.evaluation.evaluate(quantifier, protocol=USimplexPP(test), error_metric='mae')
+mae = qp.evaluation.evaluate(quantifier, protocol=UPP(test), error_metric='mae')
 print(f'MAE = {mae:.4f}')


diff --git a/quapy/CHANGE_LOG.txt b/quapy/CHANGE_LOG.txt
index 48cb586..3dae8ca 100644
--- a/quapy/CHANGE_LOG.txt
+++ b/quapy/CHANGE_LOG.txt
@@ -3,7 +3,7 @@ Change Log 0.1.7
 - Protocols are now abstracted as instances of AbstractProtocol. There is a new class extending
   AbstractProtocol called AbstractStochasticSeededProtocol, which implements a seeding policy to allow
   replicate the series of samplings.
-  There are some examples of protocols, APP, NPP, USimplexPP, DomainMixer (experimental).
+  There are some examples of protocols, APP, NPP, UPP, DomainMixer (experimental).
   The idea is to start the sampling by simply calling the __call__ method.
   This change has a great impact in the framework, since many functions in qp.evaluation, qp.model_selection,
   and sampling functions in LabelledCollection relied of the old functions. E.g., the functionality of
diff --git a/quapy/error.py b/quapy/error.py
index c0cd157..c1a8e7f 100644
--- a/quapy/error.py
+++ b/quapy/error.py
@@ -211,11 +211,13 @@ def __check_eps(eps=None):

 CLASSIFICATION_ERROR = {f1e, acce}
 QUANTIFICATION_ERROR = {mae, mrae, mse, mkld, mnkld}
+QUANTIFICATION_ERROR_SINGLE = {ae, rae, se, kld, nkld}
 QUANTIFICATION_ERROR_SMOOTH = {kld, nkld, rae, mkld, mnkld, mrae}
 CLASSIFICATION_ERROR_NAMES = {func.__name__ for func in CLASSIFICATION_ERROR}
 QUANTIFICATION_ERROR_NAMES = {func.__name__ for func in QUANTIFICATION_ERROR}
+QUANTIFICATION_ERROR_SINGLE_NAMES = {func.__name__ for func in QUANTIFICATION_ERROR_SINGLE}
 QUANTIFICATION_ERROR_SMOOTH_NAMES = {func.__name__ for func in QUANTIFICATION_ERROR_SMOOTH}
-ERROR_NAMES = CLASSIFICATION_ERROR_NAMES | QUANTIFICATION_ERROR_NAMES
+ERROR_NAMES = CLASSIFICATION_ERROR_NAMES | QUANTIFICATION_ERROR_NAMES | QUANTIFICATION_ERROR_SINGLE_NAMES

 f1_error = f1e
 acc_error = acce
diff --git a/quapy/evaluation.py b/quapy/evaluation.py
index 4f5de10..0f94940 100644
--- a/quapy/evaluation.py
+++ b/quapy/evaluation.py
@@ -7,7 +7,34 @@ from quapy.method.base import BaseQuantifier
 import pandas as pd


-def prediction(model: BaseQuantifier, protocol: AbstractProtocol, aggr_speedup='auto', verbose=False):
+def prediction(
+        model: BaseQuantifier,
+        protocol: AbstractProtocol,
+        aggr_speedup: Union[str, bool] = 'auto',
+        verbose=False):
+    """
+    Uses a quantification model to generate predictions for the samples generated via a specific protocol.
+    This function is central to all evaluation processes, and is endowed with an optimization to speed up the
+    prediction of protocols that generate samples from a large collection. The optimization applies to aggregative
+    quantifiers only, and to OnLabelledCollection protocols, and comes down to generating the classification
+    predictions once and for all, and then generating samples over the classification predictions (instead of over
+    the raw instances), so that the classifier prediction is never called again. This behaviour is obtained by
+    setting `aggr_speedup` to 'auto' or True, and is only carried out if the overall process is convenient in terms
+    of computations (e.g., if the number of classification predictions needed for the original collection exceeds the
+    number of classification predictions needed for all samples, then the optimization is not undertaken).
+
+    :param model: a quantifier, instance of :class:`quapy.method.base.BaseQuantifier`
+    :param protocol: :class:`quapy.protocol.AbstractProtocol`; if this object is also instance of
+        :class:`quapy.protocol.OnLabelledCollection`, then the aggregation speed-up can be run. This is the protocol
+        in charge of generating the samples for which the model has to issue class prevalence predictions.
+    :param aggr_speedup: whether or not to apply the speed-up. Set to "force" for applying it even if the number of
+        instances in the original collection on which the protocol acts is larger than the number of instances
+        in the samples to be generated. Set to True or "auto" (default) for letting QuaPy decide whether it is
+        convenient or not. Set to False to deactivate.
+    :param verbose: boolean, whether or not to show information in stdout
+    :return: a tuple `(true_prevs, estim_prevs)` in which each element in the tuple is an array of shape
+        `(n_samples, n_classes)` containing the true, or predicted, prevalence values for each sample
+    """
     assert aggr_speedup in [False, True, 'auto', 'force'], 'invalid value for aggr_speedup'

     sout = lambda x: print(x) if verbose else None
@@ -54,8 +81,29 @@ def __prediction_helper(quantification_fn, protocol: AbstractProtocol, verbose=F
 def evaluation_report(model: BaseQuantifier,
                       protocol: AbstractProtocol,
                       error_metrics: Iterable[Union[str,Callable]] = 'mae',
-                      aggr_speedup='auto',
+                      aggr_speedup: Union[str, bool] = 'auto',
                       verbose=False):
+    """
+    Generates a report (a pandas DataFrame) containing information on the evaluation of the model according
+    to a specific protocol and in terms of one or more evaluation metrics (errors).
+
+
+    :param model: a quantifier, instance of :class:`quapy.method.base.BaseQuantifier`
+    :param protocol: :class:`quapy.protocol.AbstractProtocol`; if this object is also instance of
+        :class:`quapy.protocol.OnLabelledCollection`, then the aggregation speed-up can be run. This is the protocol
+        in charge of generating the samples in which the model is evaluated.
+    :param error_metrics: a string, or list of strings, representing the name(s) of an error function in `qp.error`
+        (e.g., 'mae', the default value), or a callable function, or a list of callable functions, implementing
+        the error function itself.
+    :param aggr_speedup: whether or not to apply the speed-up. Set to "force" for applying it even if the number of
+        instances in the original collection on which the protocol acts is larger than the number of instances
+        in the samples to be generated. Set to True or "auto" (default) for letting QuaPy decide whether it is
+        convenient or not. Set to False to deactivate.
+    :param verbose: boolean, whether or not to show information in stdout
+    :return: a pandas DataFrame containing the columns 'true-prev' (the true prevalence of each sample),
+        'estim-prev' (the prevalence estimated by the model for each sample), and as many columns as error metrics
+        have been indicated, each displaying the score in terms of that metric for every sample.
+    """
     true_prevs, estim_prevs = prediction(model, protocol, aggr_speedup=aggr_speedup, verbose=verbose)
     return _prevalence_report(true_prevs, estim_prevs, error_metrics)

@@ -84,9 +132,28 @@ def _prevalence_report(true_prevs, estim_prevs, error_metrics: Iterable[Union[st
 def evaluate(
         model: BaseQuantifier,
         protocol: AbstractProtocol,
-        error_metric:Union[str, Callable],
-        aggr_speedup='auto',
+        error_metric: Union[str, Callable],
+        aggr_speedup: Union[str, bool] = 'auto',
         verbose=False):
+    """
+    Evaluates a quantification model according to a specific sample generation protocol and in terms of one
+    evaluation metric (error).
+
+    :param model: a quantifier, instance of :class:`quapy.method.base.BaseQuantifier`
+    :param protocol: :class:`quapy.protocol.AbstractProtocol`; if this object is also instance of
+        :class:`quapy.protocol.OnLabelledCollection`, then the aggregation speed-up can be run. This is the protocol
+        in charge of generating the samples in which the model is evaluated.
+    :param error_metric: a string representing the name of an error function in `qp.error`
+        (e.g., 'mae'), or a callable function implementing the error function itself.
+    :param aggr_speedup: whether or not to apply the speed-up. Set to "force" for applying it even if the number of
+        instances in the original collection on which the protocol acts is larger than the number of instances
+        in the samples to be generated. Set to True or "auto" (default) for letting QuaPy decide whether it is
+        convenient or not. Set to False to deactivate.
+    :param verbose: boolean, whether or not to show information in stdout
+    :return: if the error metric is not averaged (e.g., 'ae', 'rae'), returns an array of shape `(n_samples,)` with
+        the error scores for each sample; if the error metric is averaged (e.g., 'mae', 'mrae') then returns
+        a single float
+    """
     if isinstance(error_metric, str):
         error_metric = qp.error.from_name(error_metric)

@@ -96,9 +163,21 @@ def evaluate(
 def evaluate_on_samples(
         model: BaseQuantifier,
-        samples: [qp.data.LabelledCollection],
-        error_metric:Union[str, Callable],
+        samples: Iterable[qp.data.LabelledCollection],
+        error_metric: Union[str, Callable],
         verbose=False):
+    """
+    Evaluates a quantification model on a given set of samples and in terms of one evaluation metric (error).
+
+    :param model: a quantifier, instance of :class:`quapy.method.base.BaseQuantifier`
+    :param samples: a list of samples on which the quantifier is to be evaluated
+    :param error_metric: a string representing the name of an error function in `qp.error`
+        (e.g., 'mae'), or a callable function implementing the error function itself.
+    :param verbose: boolean, whether or not to show information in stdout
+    :return: if the error metric is not averaged (e.g., 'ae', 'rae'), returns an array of shape `(n_samples,)` with
+        the error scores for each sample; if the error metric is averaged (e.g., 'mae', 'mrae') then returns
+        a single float
+    """
     return evaluate(model, IterateProtocol(samples), error_metric, aggr_speedup=False, verbose=verbose)


diff --git a/quapy/method/neural.py b/quapy/method/neural.py
index e348930..e407aeb 100644
--- a/quapy/method/neural.py
+++ b/quapy/method/neural.py
@@ -6,7 +6,7 @@ import torch
 from torch.nn import MSELoss
 from torch.nn.functional import relu

-from protocol import USimplexPP
+from protocol import UPP
 from quapy.method.aggregative import *
 from quapy.util import EarlyStop

@@ -218,7 +218,7 @@ class QuaNetTrainer(BaseQuantifier):
         self.quanet.train(mode=train)
         losses = []
         mae_errors = []
-        sampler = USimplexPP(
+        sampler = UPP(
             data,
             sample_size=self.sample_size,
             repeats=iterations,
diff --git a/quapy/protocol.py b/quapy/protocol.py
index 60df09c..a49bfe6 100644
--- a/quapy/protocol.py
+++ b/quapy/protocol.py
@@ -327,7 +327,7 @@ class NPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
         return self.repeats


-class USimplexPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
+class UPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
     """
     A variant of :class:`APP` that, instead of using a grid of equidistant prevalence values,
     relies on the Kraemer algorithm for sampling unit (k-1)-simplex uniformly at random, with
@@ -348,7 +348,7 @@ class USimplexPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol)

     def __init__(self, data: LabelledCollection, sample_size=None, repeats=100, random_state=0,
                  return_type='sample_prev'):
-        super(USimplexPP, self).__init__(random_state)
+        super(UPP, self).__init__(random_state)
         self.data = data
         self.sample_size = qp._get_sample_size(sample_size)
         self.repeats = repeats
@@ -357,9 +357,9 @@ class USimplexPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol)

     def samples_parameters(self):
         """
-        Return all the necessary parameters to replicate the samples as according to the USimplexPP protocol.
+        Return all the necessary parameters to replicate the samples according to the UPP protocol.

-        :return: a list of indexes that realize the USimplexPP sampling
+        :return: a list of indexes that realize the UPP sampling
         """
         indexes = []
         for prevs in F.uniform_simplex_sampling(n_classes=self.data.n_classes, size=self.repeats):
@@ -474,3 +474,8 @@ class DomainMixer(AbstractStochasticSeededProtocol):
         return self.repeats * len(self.mixture_points)


+# aliases
+
+ArtificialPrevalenceProtocol = APP
+NaturalPrevalenceProtocol = NPP
+UniformPrevalenceProtocol = UPP
\ No newline at end of file
diff --git a/quapy/tests/test_evaluation.py b/quapy/tests/test_evaluation.py
index db1ddc6..4992d86 100644
--- a/quapy/tests/test_evaluation.py
+++ b/quapy/tests/test_evaluation.py
@@ -1,8 +1,14 @@
 import unittest
+
+import numpy as np
+
 import quapy as qp
 from sklearn.linear_model import LogisticRegression
 from time import time
-from quapy.method.aggregative import EMQ
+
+from error import QUANTIFICATION_ERROR_SINGLE, QUANTIFICATION_ERROR, QUANTIFICATION_ERROR_NAMES, \
+    QUANTIFICATION_ERROR_SINGLE_NAMES
+from quapy.method.aggregative import EMQ, PCC
 from quapy.method.base import BaseQuantifier


@@ -48,6 +54,31 @@ class EvalTestCase(unittest.TestCase):
         self.assertEqual(tend_no_optim>(tend_optim/2), True)


+    def test_evaluation_output(self):
+
+        data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True)
+        train, test = data.training, data.test
+
+        qp.environ['SAMPLE_SIZE']=100
+
+        protocol = qp.protocol.APP(test, random_state=0)
+
+        q = PCC(LogisticRegression()).fit(train)
+
+        single_errors = list(QUANTIFICATION_ERROR_SINGLE_NAMES)
+        averaged_errors = ['m'+e for e in single_errors]
+        single_errors = single_errors + [qp.error.from_name(e) for e in single_errors]
+        averaged_errors = averaged_errors + [qp.error.from_name(e) for e in averaged_errors]
+        for error_metric, averaged_error_metric in zip(single_errors, averaged_errors):
+            score = qp.evaluation.evaluate(q, protocol, error_metric=averaged_error_metric)
+            self.assertTrue(isinstance(score, float))
+
+            scores = qp.evaluation.evaluate(q, protocol, error_metric=error_metric)
+            self.assertTrue(isinstance(scores, np.ndarray))
+
+            self.assertEqual(scores.mean(), score)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/quapy/tests/test_protocols.py b/quapy/tests/test_protocols.py
index c7e4b15..6c76d4b 100644
--- a/quapy/tests/test_protocols.py
+++ b/quapy/tests/test_protocols.py
@@ -1,7 +1,7 @@
 import unittest
 import numpy as np
 from quapy.data import LabelledCollection
-from quapy.protocol import APP, NPP, USimplexPP, DomainMixer, AbstractStochasticSeededProtocol
+from quapy.protocol import APP, NPP, UPP, DomainMixer, AbstractStochasticSeededProtocol


 def mock_labelled_collection(prefix=''):
@@ -102,14 +102,14 @@ class TestProtocols(unittest.TestCase):
     def test_kraemer_replicate(self):
         data = mock_labelled_collection()
-        p = USimplexPP(data, sample_size=5, repeats=10, random_state=42)
+        p = UPP(data, sample_size=5, repeats=10, random_state=42)

         samples1 = samples_to_str(p)
         samples2 = samples_to_str(p)

         self.assertEqual(samples1, samples2)

-        p = USimplexPP(data, sample_size=5, repeats=10)  # <- random_state is by default set to 0
+        p = UPP(data, sample_size=5, repeats=10)  # <- random_state is by default set to 0

         samples1 = samples_to_str(p)
         samples2 = samples_to_str(p)

@@ -118,7 +118,7 @@ def test_kraemer_not_replicate(self):
         data = mock_labelled_collection()
-        p = USimplexPP(data, sample_size=5, repeats=10, random_state=None)
+        p = UPP(data, sample_size=5, repeats=10, random_state=None)

         samples1 = samples_to_str(p)
         samples2 = samples_to_str(p)