when exact_train_prev is False, the training prevalence is now estimated via k-fold cross-validation (kFCV), using a new function qp.model_selection.cross_val_predict

This commit is contained in:
Alejandro Moreo Fernandez 2022-12-12 17:32:30 +01:00
parent eb860e9678
commit c20d9d5ea4
3 changed files with 39 additions and 3 deletions

View File

@ -3,6 +3,8 @@ clean all the cumbersome methods that have to be implemented for new quantifiers
make truly parallel the GridSearchQ
make more examples in the "examples" directory
merge with master, because I had to fix some problems with QuaNet due to an issue notified via GitHub!
added cross_val_predict in qp.model_selection (i.e., a cross_val_predict for quantification); it would be nice to
have it parallelized
Packaging:

View File

@ -3,7 +3,7 @@ from copy import deepcopy
from typing import Callable, Union
import numpy as np
from joblib import Parallel, delayed
from sklearn.base import BaseEstimator
from sklearn.base import BaseEstimator, clone
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_predict
@ -503,7 +503,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
:param learner: a sklearn's Estimator that generates a classifier
:param exact_train_prev: set to True (default) for using, as the initial observation, the true training prevalence;
or set to False for computing the training prevalence as an estimate, akin to PCC, i.e., as the expected
value of the posterior probabilities of the trianing documents as suggested in
value of the posterior probabilities of the training instances as suggested in
`Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_:
"""
@ -519,7 +519,12 @@ class EMQ(AggregativeProbabilisticQuantifier):
if self.exact_train_prev:
self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
else:
self.train_prevalence = PCC(learner=self.learner).fit(data, fit_learner=False).quantify(data.X)
self.train_prevalence = qp.model_selection.cross_val_predict(
quantifier=PCC(clone(self.learner)),
data=data,
nfolds=3,
random_state=0
)
return self
def aggregate(self, classif_posteriors, epsilon=EPSILON):

View File

@ -2,6 +2,10 @@ import itertools
import signal
from copy import deepcopy
from typing import Union, Callable
import numpy as np
from sklearn import clone
import quapy as qp
from quapy import evaluation
from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol
@ -187,3 +191,28 @@ class GridSearchQ(BaseQuantifier):
raise ValueError('best_model called before fit')
def cross_val_predict(quantifier: BaseQuantifier, data: LabelledCollection, nfolds=3, random_state=0):
    """
    Akin to `scikit-learn's cross_val_predict <https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_predict.html>`_
    but for quantification.

    For every fold, the quantifier is fitted on the training part and asked to quantify the instances of the
    held-out part. The value returned is the sum of the per-fold estimates, each one weighted by the fraction of
    `data` that fell in the corresponding held-out part.

    Note that no copy of `quantifier` is made: `fit` is invoked directly on the object passed in, once per fold.

    :param quantifier: a quantifier issuing class prevalence values
    :param data: a labelled collection
    :param nfolds: number of folds for k-fold cross validation generation
    :param random_state: random seed for reproducibility
    :return: a vector of class prevalence values
    """
    total_prev = np.zeros(shape=data.n_classes)
    n_instances = len(data)

    for train, test in data.kFCV(nfolds=nfolds, random_state=random_state):
        quantifier.fit(train)
        fold_prev = quantifier.quantify(test.X)
        # The fold size is taken from the collection (as done for `data`) rather than from `test.X`:
        # calling len() on a scipy sparse matrix of instances raises TypeError.
        rel_size = len(test) / n_instances
        total_prev += fold_prev * rel_size

    return total_prev