Adding the possibility to estimate the training prevalence, instead of using the true training prevalence, as a starting point in EMQ
This commit is contained in:
parent
643a19228b
commit
eb860e9678
|
@ -1,6 +1,8 @@
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
from sklearn.calibration import CalibratedClassifierCV
|
||||||
from sklearn.linear_model import LogisticRegression
|
from sklearn.linear_model import LogisticRegression
|
||||||
import quapy as qp
|
import quapy as qp
|
||||||
|
import quapy.functional as F
|
||||||
from data.datasets import LEQUA2022_SAMPLE_SIZE, fetch_lequa2022
|
from data.datasets import LEQUA2022_SAMPLE_SIZE, fetch_lequa2022
|
||||||
from evaluation import evaluation_report
|
from evaluation import evaluation_report
|
||||||
from method.aggregative import EMQ
|
from method.aggregative import EMQ
|
||||||
|
@ -14,7 +16,8 @@ qp.environ['SAMPLE_SIZE'] = LEQUA2022_SAMPLE_SIZE[task]
|
||||||
training, val_generator, test_generator = fetch_lequa2022(task=task)
|
training, val_generator, test_generator = fetch_lequa2022(task=task)
|
||||||
|
|
||||||
# define the quantifier
|
# define the quantifier
|
||||||
quantifier = EMQ(learner=LogisticRegression())
|
learner = CalibratedClassifierCV(LogisticRegression())
|
||||||
|
quantifier = EMQ(learner=learner)
|
||||||
|
|
||||||
# model selection
|
# model selection
|
||||||
param_grid = {'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]}
|
param_grid = {'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]}
|
||||||
|
@ -24,6 +27,10 @@ quantifier = model_selection.fit(training)
|
||||||
# evaluation
|
# evaluation
|
||||||
report = evaluation_report(quantifier, protocol=test_generator, error_metrics=['mae', 'mrae', 'mkld'], verbose=True)
|
report = evaluation_report(quantifier, protocol=test_generator, error_metrics=['mae', 'mrae', 'mkld'], verbose=True)
|
||||||
|
|
||||||
pd.set_option('display.max_columns', None)
|
# printing results
|
||||||
pd.set_option('display.width', 1000)
|
pd.set_option('display.expand_frame_repr', False)
|
||||||
|
report['estim-prev'] = report['estim-prev'].map(F.strprev)
|
||||||
print(report)
|
print(report)
|
||||||
|
|
||||||
|
print('Averaged values:')
|
||||||
|
print(report.mean())
|
||||||
|
|
|
@ -501,17 +501,25 @@ class EMQ(AggregativeProbabilisticQuantifier):
|
||||||
maximum-likelihood estimation, in a mutually recursive way, until convergence.
|
maximum-likelihood estimation, in a mutually recursive way, until convergence.
|
||||||
|
|
||||||
:param learner: a sklearn's Estimator that generates a classifier
|
:param learner: a sklearn's Estimator that generates a classifier
|
||||||
|
:param exact_train_prev: set to True (default) for using, as the initial observation, the true training prevalence;
|
||||||
|
or set to False for computing the training prevalence as an estimate, akin to PCC, i.e., as the expected
|
||||||
|
value of the posterior probabilities of the training documents as suggested in
|
||||||
|
`Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
MAX_ITER = 1000
|
MAX_ITER = 1000
|
||||||
EPSILON = 1e-4
|
EPSILON = 1e-4
|
||||||
|
|
||||||
def __init__(self, learner: BaseEstimator):
|
def __init__(self, learner: BaseEstimator, exact_train_prev=True):
|
||||||
self.learner = learner
|
self.learner = learner
|
||||||
|
self.exact_train_prev = exact_train_prev
|
||||||
|
|
||||||
def fit(self, data: LabelledCollection, fit_learner=True):
|
def fit(self, data: LabelledCollection, fit_learner=True):
|
||||||
self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
|
self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
|
||||||
|
if self.exact_train_prev:
|
||||||
self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
|
self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
|
||||||
|
else:
|
||||||
|
self.train_prevalence = PCC(learner=self.learner).fit(data, fit_learner=False).quantify(data.X)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def aggregate(self, classif_posteriors, epsilon=EPSILON):
|
def aggregate(self, classif_posteriors, epsilon=EPSILON):
|
||||||
|
|
Loading…
Reference in New Issue