when exact_train_prev is False, the training prevalence is now estimated via k-fold cross-validation (kFCV), using a new function qp.model_selection.cross_val_predict

2022-12-12 17:32:30 +01:00 · 2022-12-12 17:32:30 +01:00 · c20d9d5ea4
parent eb860e9678
commit c20d9d5ea4
3 changed files with 39 additions and 3 deletions
--- a/TODO.txt
+++ b/TODO.txt
@ -3,6 +3,8 @@ clean all the cumbersome methods that have to be implemented for new quantifiers
 make truly parallel the GridSearchQ
 make more examples in the "examples" directory
 merge with master, because I had to fix some problems with QuaNet due to an issue notified via GitHub!
 added cross_val_predict in qp.model_selection (i.e., a cross_val_predict for quantification) --would be nice to have
    it parallelized
 Packaging:
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@ -3,7 +3,7 @@ from copy import deepcopy
 from typing import Callable, Union
 import numpy as np
 from joblib import Parallel, delayed
-from sklearn.base import BaseEstimator
+from sklearn.base import BaseEstimator, clone
 from sklearn.calibration import CalibratedClassifierCV
 from sklearn.metrics import confusion_matrix
 from sklearn.model_selection import StratifiedKFold, cross_val_predict
@ -503,7 +503,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
    :param learner: a sklearn's Estimator that generates a classifier
    :param exact_train_prev: set to True (default) for using, as the initial observation, the true training prevalence;
        or set to False for computing the training prevalence as an estimate, akin to PCC, i.e., as the expected
-        value of the posterior probabilities of the trianing documents as suggested in
+        value of the posterior probabilities of the training instances as suggested in
        `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_:
    """
@ -519,7 +519,12 @@ class EMQ(AggregativeProbabilisticQuantifier):
        if self.exact_train_prev:
            self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
        else:
-            self.train_prevalence = PCC(learner=self.learner).fit(data, fit_learner=False).quantify(data.X)
+            self.train_prevalence = qp.model_selection.cross_val_predict(
                quantifier=PCC(clone(self.learner)),
                data=data,
                nfolds=3,
                random_state=0
            )
        return self
    def aggregate(self, classif_posteriors, epsilon=EPSILON):
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@ -2,6 +2,10 @@ import itertools
 import signal
 from copy import deepcopy
 from typing import Union, Callable
 import numpy as np
 from sklearn import clone
 import quapy as qp
 from quapy import evaluation
 from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol
@ -187,3 +191,28 @@ class GridSearchQ(BaseQuantifier):
        raise ValueError('best_model called before fit')
 def cross_val_predict(quantifier: BaseQuantifier, data: LabelledCollection, nfolds=3, random_state=0):
     """
     Akin to `scikit-learn's cross_val_predict <https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_predict.html>`_
     but for quantification.

     The quantifier is fitted on the training part of each fold and asked to quantify the corresponding
     test part; the returned vector is the sum of the per-fold estimates, each weighted by the fraction
     of instances of `data` that fall in that test part.

     :param quantifier: a quantifier issuing class prevalence values; it is (re-)fitted in place on
         every fold, so pass a copy if its current state has to be preserved
     :param data: a labelled collection
     :param nfolds: number of folds for k-fold cross validation generation
     :param random_state: random seed for reproducibility
     :return: a vector of class prevalence values
     """
     total_prev = np.zeros(shape=data.n_classes)
     n_instances = len(data)
     for train, test in data.kFCV(nfolds=nfolds, random_state=random_state):
         quantifier.fit(train)
         fold_prev = quantifier.quantify(test.X)
         # measure the fold through the collection itself (as done for `data`) and not through
         # len(test.X): scipy sparse matrices raise TypeError when passed to len()
         rel_size = len(test) / n_instances
         total_prev += fold_prev * rel_size
     return total_prev