the heuristic exact_train_prev is performed via kFCV, using a new function qp.model_selection.cross_val_predict

2022-12-12 17:32:30 +01:00 · 2022-12-12 17:32:30 +01:00 · c20d9d5ea4
parent eb860e9678
commit c20d9d5ea4
3 changed files with 39 additions and 3 deletions
--- a/TODO.txt
+++ b/TODO.txt
@ -3,6 +3,8 @@ clean all the cumbersome methods that have to be implemented for new quantifiers
 make truly parallel the GridSearchQ
 make more examples in the "examples" directory
 merge with master, because I had to fix some problems with QuaNet due to an issue notified via GitHub!
+added cross_val_predict in qp.model_selection (i.e., a cross_val_predict for quantification); it would be nice to
+    parallelize it


 Packaging:
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@ -3,7 +3,7 @@ from copy import deepcopy
 from typing import Callable, Union
 import numpy as np
 from joblib import Parallel, delayed
-from sklearn.base import BaseEstimator
+from sklearn.base import BaseEstimator, clone
 from sklearn.calibration import CalibratedClassifierCV
 from sklearn.metrics import confusion_matrix
 from sklearn.model_selection import StratifiedKFold, cross_val_predict
@ -503,7 +503,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
    :param learner: a sklearn's Estimator that generates a classifier
    :param exact_train_prev: set to True (default) for using, as the initial observation, the true training prevalence;
        or set to False for computing the training prevalence as an estimate, akin to PCC, i.e., as the expected
-        value of the posterior probabilities of the trianing documents as suggested in
+        value of the posterior probabilities of the training instances as suggested in
        `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_.
    """

@ -519,7 +519,12 @@ class EMQ(AggregativeProbabilisticQuantifier):
        if self.exact_train_prev:
            self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
        else:
-            self.train_prevalence = PCC(learner=self.learner).fit(data, fit_learner=False).quantify(data.X)
+            self.train_prevalence = qp.model_selection.cross_val_predict(
+                quantifier=PCC(clone(self.learner)),
+                data=data,
+                nfolds=3,
+                random_state=0
+            )
        return self

    def aggregate(self, classif_posteriors, epsilon=EPSILON):
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@ -2,6 +2,10 @@ import itertools
 import signal
 from copy import deepcopy
 from typing import Union, Callable
+
+import numpy as np
+from sklearn import clone
+
 import quapy as qp
 from quapy import evaluation
 from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol
@ -187,3 +191,28 @@ class GridSearchQ(BaseQuantifier):
        raise ValueError('best_model called before fit')


+
+
+def cross_val_predict(quantifier: BaseQuantifier, data: LabelledCollection, nfolds=3, random_state=0):
+    """
+    Akin to `scikit-learn's cross_val_predict <https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_predict.html>`_
+    but for quantification: the quantifier is refit on the training part of each fold, and the prevalence values it
+    estimates for the held-out parts are averaged, each one weighted by the relative size of its held-out part.
+    The quantifier received is fit in place (it is not cloned).
+
+    :param quantifier: a quantifier issuing class prevalence values
+    :param data: a labelled collection
+    :param nfolds: number of folds for k-fold cross validation generation
+    :param random_state: random seed for reproducibility
+    :return: a vector of class prevalence values
+    """
+
+    total_prev = np.zeros(shape=data.n_classes)
+    for train, test in data.kFCV(nfolds=nfolds, random_state=random_state):
+        quantifier.fit(train)
+        # len(test) rather than len(test.X): test.X may be a scipy sparse matrix, which does not support len()
+        total_prev += quantifier.quantify(test.X) * (len(test) / len(data))
+
+    return total_prev
+
+