From 0a6185d908d7828bc62503ce26ec4caeae936409 Mon Sep 17 00:00:00 2001
From: Alex Moreo
Date: Sun, 12 Nov 2023 14:45:03 +0100
Subject: [PATCH] refactoring the aggregative quantifiers

---
 quapy/method/aggregative.py | 257 ++++++++++++------------------------
 quapy/method/base.py        |   2 +-
 2 files changed, 85 insertions(+), 174 deletions(-)

diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index a056543..8633039 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -44,12 +44,11 @@ class AggregativeQuantifier(ABC, BaseQuantifier):
         learner has been trained outside the quantifier.
         :return: self
         """
-        classif_predictions = self.classification_fit(data, fit_classifier)
+        classif_predictions = self.classifier_fit_predict(data, fit_classifier)
         self.aggregation_fit(classif_predictions)
         return self
 
-    @abstractmethod
-    def classification_fit(self, data: LabelledCollection, fit_classifier=True):
+    def classifier_fit_predict(self, data: LabelledCollection, fit_classifier=True, predict_on=None):
         """
         Trains the classifier if requested (`fit_classifier=True`) and generates the necessary predictions to
         train the aggregation function.
@@ -57,11 +56,62 @@ class AggregativeQuantifier(ABC, BaseQuantifier):
         :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
         :param fit_classifier: whether to train the learner (default is True). Set to False if the
             learner has been trained outside the quantifier.
+        :param predict_on: specifies the set on which predictions need to be issued. This parameter can
+            be specified as None (default) to indicate that no predictions are needed; as a float in (0, 1)
+            to indicate the proportion of instances to be used for predictions (the remainder is used for
+            training); as an integer >1 to indicate that the predictions must be generated via k-fold
+            cross-validation, using this integer as k; or as a :class:`quapy.data.base.LabelledCollection`
+            on which to generate the predictions
+        :return: the predictions paired with their true labels, as a tuple `(labels, predictions)`, or
+            None if no predictions were requested
         """
-        ...
+        assert isinstance(fit_classifier, bool), 'unexpected type for "fit_classifier", must be boolean'
+
+        self._check_classifier()
+
+        if predict_on is None:
+            # default to the quantifier's val_split, if it declares one (as, e.g., ACC and PACC do)
+            predict_on = getattr(self, 'val_split', None)
+
+        if predict_on is None:
+            if fit_classifier:
+                self.classifier.fit(*data.Xy)
+            predictions = None
+
+        elif isinstance(predict_on, float):
+            if fit_classifier:
+                if not (0. < predict_on < 1.):
+                    raise ValueError(f'proportion {predict_on=} out of range, must be in (0,1)')
+                train, val = data.split_stratified(train_prop=(1 - predict_on))
+                self.classifier.fit(*train.Xy)
+                predictions = (val.y, self.classify(val.X))
+            else:
+                raise ValueError(f'wrong value for predict_on: since fit_classifier=False, '
+                                 f'the set on which predictions have to be issued must be '
+                                 f'explicitly indicated as a LabelledCollection')
+
+        elif isinstance(predict_on, LabelledCollection):
+            if fit_classifier:
+                self.classifier.fit(*data.Xy)
+            predictions = (predict_on.y, self.classify(predict_on.X))
+
+        elif isinstance(predict_on, int):
+            if fit_classifier:
+                if not predict_on > 1:
+                    raise ValueError(f'invalid value {predict_on} in fit. '
+                                     f'Specify an integer >1 for kFCV estimation.')
+                y_pred = cross_val_predict(self.classifier, *data.Xy, cv=predict_on,
+                                           n_jobs=self.n_jobs, method=self._classifier_method())
+                # the predictions for the aggregation function are generated via kFCV;
+                # the classifier is then re-fit on all the training data
+                self.classifier.fit(*data.Xy)
+                predictions = (data.y, y_pred)
+            else:
+                raise ValueError(f'wrong value for predict_on: since fit_classifier=False, '
+                                 f'the predictions cannot be generated via kFCV; the set on which '
+                                 f'predictions have to be issued must be explicitly indicated '
+                                 f'as a LabelledCollection')
+
+        else:
+            raise ValueError(
+                f'error: param "predict_on" ({type(predict_on)}) not understood; '
+                f'use either a float indicating the split proportion, an integer >1 indicating '
+                f'the number of folds for kFCV, or a LabelledCollection indicating the '
+                f'validation partition')
+
+        return predictions
 
     @abstractmethod
-    def aggregation_fit(self, classif_predictions):
+    def aggregation_fit(self, classif_predictions: tuple):
         """
         Trains the aggregation function.
 
@@ -99,6 +149,13 @@ class AggregativeQuantifier(ABC, BaseQuantifier):
         """
         return self.classifier.predict(instances)
 
+    def _classifier_method(self):
+        return 'predict'
+
+    def _check_classifier(self, adapt_if_necessary=False):
+        assert hasattr(self.classifier, 'predict')
+
     def quantify(self, instances):
         """
         Generate class prevalence estimates for the sample's instances by aggregating the label predictions generated
@@ -142,106 +199,20 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier, ABC):
 
     def classify(self, instances):
         return self.classifier.predict_proba(instances)
 
+    def _classifier_method(self):
+        return 'predict_proba'
 
-# Helper
-# ------------------------------------
-def _ensure_probabilistic(classifier):
-    if not hasattr(classifier, 'predict_proba'):
-        print(f'The learner {classifier.__class__.__name__} does not seem to be probabilistic. '
-              f'The learner will be calibrated.')
-        classifier = CalibratedClassifierCV(classifier, cv=5)
-    return classifier
-
-
-def _training_helper(classifier,
-                     data: LabelledCollection,
-                     fit_classifier: bool = True,
-                     ensure_probabilistic=False,
-                     val_split: Union[LabelledCollection, float] = None):
-    """
-    Training procedure common to all Aggregative Quantifiers.
-
-    :param classifier: the learner to be fit
-    :param data: the data on which to fit the learner. If requested, the data will be split before fitting the learner.
-    :param fit_classifier: whether or not to fit the learner (if False, then bypasses any action)
-    :param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the
-        learner is not probabilistic, then a CalibratedCV instance of it is trained)
-    :param val_split: if specified as a float, indicates the proportion of training instances that will define the
-        validation split (e.g., 0.3 for using 30% of the training set as validation data); if specified as a
-        LabelledCollection, represents the validation split itself
-    :return: the learner trained on the training set, and the unused data (a _LabelledCollection_ if train_val_split>0
-        or None otherwise) to be used as a validation set for any subsequent parameter fitting
-    """
-    if fit_classifier:
-        if ensure_probabilistic:
-            classifier = _ensure_probabilistic(classifier)
-        if val_split is not None:
-            if isinstance(val_split, float):
-                if not (0 < val_split < 1):
-                    raise ValueError(f'train/val split {val_split} out of range, must be in (0,1)')
-                train, unused = data.split_stratified(train_prop=1 - val_split)
-            elif isinstance(val_split, LabelledCollection):
-                train = data
-                unused = val_split
+    def _check_classifier(self, adapt_if_necessary=False):
+        if not hasattr(self.classifier, 'predict_proba'):
+            if adapt_if_necessary:
+                print(f'warning: The learner {self.classifier.__class__.__name__} does not seem to be '
+                      f'probabilistic. The learner will be calibrated (using CalibratedClassifierCV).')
+                self.classifier = CalibratedClassifierCV(self.classifier, cv=5)
             else:
-                raise ValueError(
-                    f'param "val_split" ({type(val_split)}) not understood; use either a float indicating the split '
-                    'proportion, or a LabelledCollection indicating the validation split')
-        else:
-            train, unused = data, None
-
-        if isinstance(classifier, BaseQuantifier):
-            classifier.fit(train)
-        else:
-            classifier.fit(*train.Xy)
-    else:
-        if ensure_probabilistic:
-            if not hasattr(classifier, 'predict_proba'):
-                raise AssertionError('error: the learner cannot be calibrated since fit_classifier is set to False')
-        unused = None
-        if isinstance(val_split, LabelledCollection):
-            unused = val_split
-
-    return classifier, unused
-
-
-def cross_generate_predictions(
-        data,
-        classifier,
-        val_split,
-        probabilistic,
-        fit_classifier,
-        n_jobs
-):
-
-    n_jobs = qp._get_njobs(n_jobs)
-
-    if isinstance(val_split, int):
-        assert fit_classifier == True, \
-            'the parameters for the adjustment cannot be estimated with kFCV with fit_classifier=False'
-
-        if probabilistic:
-            classifier = _ensure_probabilistic(classifier)
-            predict = 'predict_proba'
-        else:
-            predict = 'predict'
-        y_pred = cross_val_predict(classifier, *data.Xy, cv=val_split, n_jobs=n_jobs, method=predict)
-        class_count = data.counts()
-
-        # fit the learner on all data
-        classifier.fit(*data.Xy)
-        y = data.y
-        classes = data.classes_
-    else:
-        classifier, val_data = _training_helper(
-            classifier, data, fit_classifier, ensure_probabilistic=probabilistic, val_split=val_split
-        )
-        y_pred = classifier.predict_proba(val_data.instances) if probabilistic else classifier.predict(val_data.instances)
-        y = val_data.labels
-        classes = val_data.classes_
-        class_count = val_data.counts()
-
-    return classifier, y, y_pred, classes, class_count
+                raise AssertionError(f'error: The learner {self.classifier.__class__.__name__} does not '
+                                     f'seem to be probabilistic. '
+                                     f'The learner cannot be calibrated since fit_classifier is set to False')
 
 
 # Methods
@@ -257,19 +228,7 @@ class CC(AggregativeQuantifier):
     def __init__(self, classifier: BaseEstimator):
         self.classifier = classifier
 
-    def classification_fit(self, data: LabelledCollection, fit_classifier=True):
-        """
-        Trains the classifier unless `fit_classifier` is False, in which case, the classifier is assumed to
-        be already fit and there is nothing else to do.
-
-        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
-        :param fit_classifier: if False, the classifier is assumed to be fit
-        :return: self
-        """
-        self.classifier, _ = _training_helper(self.classifier, data, fit_classifier)
-        return None
-
-    def aggregation_fit(self, classif_predictions: np.ndarray):
+    def aggregation_fit(self, classif_predictions: tuple):
         """
         Nothing to do here!
 
@@ -307,33 +266,11 @@ class ACC(AggregativeQuantifier):
         self.val_split = val_split
         self.n_jobs = qp._get_njobs(n_jobs)
 
-    def classification_fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, int, LabelledCollection] = None):
+    def aggregation_fit(self, classif_predictions: tuple):
         """
-        Trains the classifier and generates, optionally through a cross-validation procedure, the predictions
-        needed for estimating the misclassification rates matrix.
+        Estimates the misclassification rates.
 
-        :param data: the training set
-        :param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
-        :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
-            validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
-            indicating the validation set itself, or an int indicating the number `k` of folds to be used in `k`-fold
-            cross validation to estimate the parameters
-        :return: self
-        """
-        if val_split is None:
-            val_split = self.val_split
-
-        self.classifier, true_labels, pred_labels, classes, class_count = cross_generate_predictions(
-            data, self.classifier, val_split, probabilistic=False, fit_classifier=fit_classifier, n_jobs=self.n_jobs
-        )
-
-        return (true_labels, pred_labels)
-
-    def aggregation_fit(self, classif_predictions):
-        """
-        Nothing to do here!
-
-        :param classif_predictions: this is actually None
+        :param classif_predictions: a tuple `(true_labels, pred_labels)` as returned by :meth:`classifier_fit_predict`
         """
         true_labels, pred_labels = classif_predictions
         self.cc = CC(self.classifier)
@@ -393,11 +330,7 @@ class PCC(AggregativeProbabilisticQuantifier):
     def __init__(self, classifier: BaseEstimator):
         self.classifier = classifier
 
-    def classification_fit(self, data: LabelledCollection, fit_classifier=True):
-        self.classifier, _ = _training_helper(self.classifier, data, fit_classifier, ensure_probabilistic=True)
-        return self
-
-    def aggregation_fit(self, classif_predictions: np.ndarray):
+    def aggregation_fit(self, classif_predictions: tuple):
         """
         Nothing to do here!
 
@@ -429,33 +362,11 @@ class PACC(AggregativeProbabilisticQuantifier):
         self.val_split = val_split
         self.n_jobs = qp._get_njobs(n_jobs)
 
-    def classification_fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, int, LabelledCollection] = None):
+    def aggregation_fit(self, classif_predictions: tuple):
         """
-        Trains the soft classifier and generates, optionally through a cross-validation procedure, the posterior
-        probabilities needed for estimating the misclassification rates matrix.
+        Estimates the misclassification rates.
 
-        :param data: the training set
-        :param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
-        :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
-            validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
-            indicating the validation set itself, or an int indicating the number `k` of folds to be used in `k`-fold
-            cross validation to estimate the parameters
-        :return: self
-        """
-        if val_split is None:
-            val_split = self.val_split
-
-        self.classifier, true_labels, posteriors, classes, class_count = cross_generate_predictions(
-            data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
-        )
-
-        return (true_labels, posteriors)
-
-    def aggregation_fit(self, classif_predictions):
-        """
-        Nothing to do here!
-
-        :param classif_predictions: this is actually None
+        :param classif_predictions: a tuple `(true_labels, posteriors)` as returned by :meth:`classifier_fit_predict`
         """
         true_labels, posteriors = classif_predictions
         self.pcc = PCC(self.classifier)
@@ -509,7 +420,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
         self.exact_train_prev = exact_train_prev
         self.recalib = recalib
 
-    def classification_fit(self, data: LabelledCollection, fit_classifier=True):
+    def classifier_fit_predict(self, data: LabelledCollection, fit_classifier=True):
         self.classifier, true_labels, posteriors, classes, class_count = cross_generate_predictions(
             data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
         )
@@ -842,7 +753,7 @@ class DMy(AggregativeProbabilisticQuantifier):
         distributions = np.cumsum(distributions, axis=1)
         return distributions
 
-    def classification_fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
+    def classifier_fit_predict(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
         """
         Trains the classifier (if requested) and generates the validation distributions out of the training data.
         The validation distributions have shape `(n, ch, nbins)`, with `n` the number of classes, `ch` the number of
diff --git a/quapy/method/base.py b/quapy/method/base.py
index e0363f1..f34acf6 100644
--- a/quapy/method/base.py
+++ b/quapy/method/base.py
@@ -63,7 +63,7 @@ def newOneVsAll(binary_quantifier, n_jobs=None):
     return OneVsAllGeneric(binary_quantifier, n_jobs)
 
 
-class OneVsAllGeneric(OneVsAll,BaseQuantifier):
+class OneVsAllGeneric(OneVsAll, BaseQuantifier):
     """
    Allows any binary quantifier to perform quantification on single-label datasets. The method maintains
    one binary quantifier for each class, and then l1-normalizes the outputs so that the class prevalence values sum up to 1.
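
Note (illustrative, not part of the patch): a minimal usage sketch of the refactored two-phase
API, assuming QuaPy's fetch_reviews loader and sklearn's LogisticRegression are available; both
are arbitrary choices for the example, and the method names are those introduced above.

    import quapy as qp
    from quapy.method.aggregative import ACC
    from sklearn.linear_model import LogisticRegression

    train, test = qp.datasets.fetch_reviews('kindle', tfidf=True).train_test

    acc = ACC(LogisticRegression())

    # one-shot fit: runs classifier_fit_predict() and then aggregation_fit(),
    # generating the predictions according to the quantifier's val_split
    acc.fit(train)

    # or the two phases explicitly: hold out 30% of the training set for the
    # predictions that feed the aggregation function...
    predictions = acc.classifier_fit_predict(train, fit_classifier=True, predict_on=0.3)
    acc.aggregation_fit(predictions)

    # ...or generate them via 5-fold cross-validation over all the training data
    predictions = acc.classifier_fit_predict(train, fit_classifier=True, predict_on=5)
    acc.aggregation_fit(predictions)

    # a held-out LabelledCollection also works, and is the only option when the
    # classifier comes already trained (fit_classifier=False)
    tr, val = train.split_stratified(train_prop=0.7)
    predictions = acc.classifier_fit_predict(tr, fit_classifier=True, predict_on=val)
    acc.aggregation_fit(predictions)

    estim_prevalence = acc.quantify(test.X)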