The heuristic used when exact_train_prev=False is now performed via kFCV, using a new function qp.model_selection.cross_val_predict
This commit is contained in:
parent eb860e9678
commit c20d9d5ea4
TODO.txt
@@ -3,6 +3,8 @@ clean all the cumbersome methods that have to be implemented for new quantifiers
 make truly parallel the GridSearchQ
 make more examples in the "examples" directory
+merge with master, because I had to fix some problems with QuaNet due to an issue notified via GitHub!
+added cross_val_predict in qp.model_selection (i.e., a cross_val_predict for quantification) --would be nice to have it parallelized


 Packaging:

@@ -3,7 +3,7 @@ from copy import deepcopy
 from typing import Callable, Union
 import numpy as np
 from joblib import Parallel, delayed
-from sklearn.base import BaseEstimator
+from sklearn.base import BaseEstimator, clone
 from sklearn.calibration import CalibratedClassifierCV
 from sklearn.metrics import confusion_matrix
 from sklearn.model_selection import StratifiedKFold, cross_val_predict

@@ -503,7 +503,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
     :param learner: a sklearn's Estimator that generates a classifier
     :param exact_train_prev: set to True (default) for using, as the initial observation, the true training prevalence;
         or set to False for computing the training prevalence as an estimate, akin to PCC, i.e., as the expected
-        value of the posterior probabilities of the trianing documents as suggested in
+        value of the posterior probabilities of the training instances as suggested in
         `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_:
     """
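
As the diffed docstring notes, the exact_train_prev=False estimate is PCC-like: the expected value of the classifier's posterior probabilities over the training instances. A minimal numpy sketch of that quantity (the posterior matrix below is made up for illustration):

    import numpy as np

    # (n_instances, n_classes) posteriors P(y=c|x) produced by the probabilistic classifier
    posteriors = np.asarray([[0.9, 0.1],
                             [0.6, 0.4],
                             [0.2, 0.8]])

    # PCC-style prevalence estimate: the expected value of the posteriors, i.e., their column-wise mean
    prev_estimate = posteriors.mean(axis=0)
    print(prev_estimate)  # [0.56666667 0.43333333]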

@@ -519,7 +519,12 @@ class EMQ(AggregativeProbabilisticQuantifier):
         if self.exact_train_prev:
             self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
         else:
-            self.train_prevalence = PCC(learner=self.learner).fit(data, fit_learner=False).quantify(data.X)
+            self.train_prevalence = qp.model_selection.cross_val_predict(
+                quantifier=PCC(clone(self.learner)),
+                data=data,
+                nfolds=3,
+                random_state=0
+            )
         return self

     def aggregate(self, classif_posteriors, epsilon=EPSILON):
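
In practice this hunk only changes how EMQ seeds the EM iterations when exact_train_prev=False: the starting prevalence is no longer a single PCC estimate computed on the very data the learner was fit on, but a 3-fold cross-validated PCC estimate obtained through qp.model_selection.cross_val_predict on a clone of the learner. A minimal usage sketch (the fetch_reviews loader and the LogisticRegression learner are illustrative choices, not part of this commit):

    import quapy as qp
    from quapy.method.aggregative import EMQ
    from sklearn.linear_model import LogisticRegression

    # any labelled dataset works; fetch_reviews is just a convenient loader (illustrative choice)
    dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)

    # exact_train_prev=False: the EM starting point is now a PCC estimate averaged over 3 folds
    emq = EMQ(learner=LogisticRegression(), exact_train_prev=False)
    emq.fit(dataset.training)

    print(emq.quantify(dataset.test.instances))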

@@ -2,6 +2,10 @@ import itertools
 import signal
 from copy import deepcopy
 from typing import Union, Callable

 import numpy as np
 from sklearn import clone

 import quapy as qp
 from quapy import evaluation
 from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol

@@ -187,3 +191,28 @@ class GridSearchQ(BaseQuantifier):
         raise ValueError('best_model called before fit')


+def cross_val_predict(quantifier: BaseQuantifier, data: LabelledCollection, nfolds=3, random_state=0):
+    """
+    Akin to `scikit-learn's cross_val_predict <https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_predict.html>`_
+    but for quantification.
+
+    :param quantifier: a quantifier issuing class prevalence values
+    :param data: a labelled collection
+    :param nfolds: number of folds for k-fold cross validation generation
+    :param random_state: random seed for reproducibility
+    :return: a vector of class prevalence values
+    """
+
+    total_prev = np.zeros(shape=data.n_classes)
+
+    for train, test in data.kFCV(nfolds=nfolds, random_state=random_state):
+        quantifier.fit(train)
+        fold_prev = quantifier.quantify(test.X)
+        rel_size = len(test.X)/len(data)
+        total_prev += fold_prev*rel_size
+
+    return total_prev
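
The new function fits the quantifier on each training fold, quantifies the corresponding held-out fold, and returns the fold-wise prevalence estimates averaged with weights proportional to fold size. A minimal usage sketch (the synthetic data and the LogisticRegression-based PCC are illustrative assumptions, not part of the commit):

    import numpy as np
    import quapy as qp
    from quapy.data import LabelledCollection
    from quapy.method.aggregative import PCC
    from sklearn.linear_model import LogisticRegression

    # toy labelled collection built from random features (for illustration only)
    X = np.random.rand(1000, 10)
    y = np.random.randint(0, 2, size=1000)
    data = LabelledCollection(X, y)

    # each fold is quantified by a PCC trained on the remaining folds;
    # the estimates are then averaged, weighted by fold size
    prev = qp.model_selection.cross_val_predict(PCC(LogisticRegression()), data, nfolds=3, random_state=0)
    print(prev)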