Merge branch 'master' of github.com:HLT-ISTI/QuaPy

Alejandro Moreo Fernandez 2022-04-12 17:23:39 +02:00
commit 9f4a9cb3fd
1 changed file with 51 additions and 11 deletions


@@ -308,16 +308,27 @@ class ACC(AggregativeQuantifier):
self.learner, val_data = _training_helper(self.learner, data, fit_learner, val_split=val_split)
y_ = self.learner.predict(val_data.instances)
y = val_data.labels
class_count = val_data.counts()
self.cc = CC(self.learner)
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
# document that belongs to yj ends up being classified as belonging to yi
self.Pte_cond_estim_ = confusion_matrix(y, y_).T / class_count
self.Pte_cond_estim_ = self.getPteCondEstim(data.classes_, y, y_)
return self
@classmethod
def getPteCondEstim(cls, classes, y, y_):
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
# document that belongs to yj ends up being classified as belonging to yi
conf = confusion_matrix(y, y_, labels=classes).T
conf = conf.astype(float)
class_counts = conf.sum(axis=0)
for i, _ in enumerate(classes):
if class_counts[i] == 0:
conf[i, i] = 1
else:
conf[:, i] /= class_counts[i]
return conf
def classify(self, data):
return self.cc.classify(data)
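For intuition, here is a minimal standalone sketch (not part of the commit; the labels are made up) of the estimation the new getPteCondEstim performs: the confusion matrix is transposed so that columns index the true class, and every non-empty column is normalized by the number of validation documents of that class, so that conf[i, j] approximates P(classified as i | true class j).

import numpy as np
from sklearn.metrics import confusion_matrix

classes = np.array([0, 1])
y_true = np.array([0, 0, 0, 1, 1])   # hypothetical validation labels
y_pred = np.array([0, 1, 0, 1, 1])   # hypothetical classifier predictions

# rows index the predicted class, columns the true class (note the transpose)
conf = confusion_matrix(y_true, y_pred, labels=classes).T.astype(float)

class_counts = conf.sum(axis=0)      # number of documents per true class
for j, _ in enumerate(classes):
    if class_counts[j] == 0:
        conf[j, j] = 1               # unseen class: assume it is classified as itself
    else:
        conf[:, j] /= class_counts[j]

print(conf)                          # approx [[0.67, 0.], [0.33, 1.]]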
@@ -439,10 +450,23 @@ class PACC(AggregativeProbabilisticQuantifier):
for i, class_ in enumerate(classes):
confusion[i] = y_[y == class_].mean(axis=0)
self.Pte_cond_estim_ = confusion.T
self.Pte_cond_estim_ = self.getPteCondEstim(classes, y, y_)
return self
@classmethod
def getPteCondEstim(cls, classes, y, y_):
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
# document that belongs to yj ends up being classified as belonging to yi
n_classes = len(classes)
confusion = np.eye(n_classes)
for i, class_ in enumerate(classes):
idx = y == class_
if idx.any():
confusion[i] = y_[idx].mean(axis=0)
return confusion.T
def aggregate(self, classif_posteriors):
prevs_estim = self.pcc.aggregate(classif_posteriors)
return ACC.solve_adjustment(self.Pte_cond_estim_, prevs_estim)
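PACC builds the same kind of matrix from posterior probabilities instead of hard predictions: each row of the pre-transposed matrix is the mean posterior vector over the validation documents of one true class, and classes absent from the validation sample keep their identity row. A minimal sketch with made-up posteriors (not part of the commit):

import numpy as np

classes = np.array([0, 1])
y = np.array([0, 0, 1, 1])                 # hypothetical true labels
posteriors = np.array([[0.9, 0.1],         # hypothetical classifier posteriors, one row per document
                       [0.7, 0.3],
                       [0.2, 0.8],
                       [0.4, 0.6]])

confusion = np.eye(len(classes))           # identity fallback for classes unseen in validation
for i, class_ in enumerate(classes):
    idx = (y == class_)
    if idx.any():
        confusion[i] = posteriors[idx].mean(axis=0)

Pte_cond_estim = confusion.T               # column j holds the expected posterior given true class j
print(Pte_cond_estim)                      # [[0.8, 0.3], [0.2, 0.7]]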
@@ -458,15 +482,25 @@ class EMQ(AggregativeProbabilisticQuantifier):
EMQ consists of using the well-known `Expectation Maximization algorithm` to iteratively update the posterior
probabilities generated by a probabilistic classifier and the class prevalence estimates obtained via
maximum-likelihood estimation, in a mutually recursive way, until convergence.
The `transform_prior` callback allows you to introduce ad-hoc regularizations that are not part of the
original EMQ algorithm. For instance, the callback can enhance or diminish small class prevalences when
denser or sparser solutions should be promoted (an illustrative callback is sketched after this hunk).
The original method is described in:
Saerens, M., Latinne, P., and Decaestecker, C. (2002).
Adjusting the outputs of a classifier to new a priori probabilities: A simple procedure.
Neural Computation, 14(1): 21-41.
:param learner: a sklearn's Estimator that generates a classifier
:param transform_prior: an optional function :math:`R^c -> R^c` that transforms each intermediate estimate
"""
MAX_ITER = 1000
EPSILON = 1e-4
def __init__(self, learner: BaseEstimator):
def __init__(self, learner: BaseEstimator, transform_prior=None):
self.learner = learner
self.transform_prior = transform_prior
def fit(self, data: LabelledCollection, fit_learner=True):
self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
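As an illustration of the kind of callback the new transform_prior argument accepts (this helper is not part of the commit), a power transform with an exponent below 1 pushes the running prevalence estimate towards the uniform distribution (denser solutions), while an exponent above 1 sharpens it (sparser solutions):

import numpy as np

def power_prior(prev, alpha=0.5):
    # raise each prevalence to a power and renormalize so the result is still a distribution;
    # alpha < 1 promotes denser solutions, alpha > 1 promotes sparser ones
    prev = np.asarray(prev, dtype=float) ** alpha
    return prev / prev.sum()

# hypothetical usage with the constructor introduced in this commit:
# quantifier = EMQ(LogisticRegression(), transform_prior=power_prior)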
@@ -474,27 +508,28 @@ class EMQ(AggregativeProbabilisticQuantifier):
return self
def aggregate(self, classif_posteriors, epsilon=EPSILON):
priors, posteriors = self.EM(self.train_prevalence, classif_posteriors, epsilon)
priors, posteriors = self.EM(self.train_prevalence, classif_posteriors, epsilon, self.transform_prior)
return priors
def predict_proba(self, instances, epsilon=EPSILON):
classif_posteriors = self.learner.predict_proba(instances)
priors, posteriors = self.EM(self.train_prevalence, classif_posteriors, epsilon)
priors, posteriors = self.EM(self.train_prevalence, classif_posteriors, epsilon, self.transform_prior)
return posteriors
@classmethod
def EM(cls, tr_prev, posterior_probabilities, epsilon=EPSILON):
def EM(cls, tr_prev, posterior_probabilities, epsilon=EPSILON, transform_prior=None):
"""
Computes the `Expectation Maximization` routine.
:param tr_prev: array-like, the training prevalence
:param posterior_probabilities: `np.ndarray` of shape `(n_instances, n_classes,)` with the
posterior probabilities
:param epsilon: float, the threshold difference between the prevalence estimates of two consecutive
iterations, below which the loop stops
:param transform_prior: an optional function :math:`R^c -> R^c` that transforms each intermediate estimate
:return: a tuple with the estimated prevalence values (shape `(n_classes,)`) and
the corrected posterior probabilities (shape `(n_instances, n_classes,)`)
"""
Px = posterior_probabilities
Ptr = np.copy(tr_prev)
qs = np.copy(Ptr) # qs (the running estimate) is initialized as the training prevalence
@@ -515,12 +550,17 @@ class EMQ(AggregativeProbabilisticQuantifier):
qs_prev_ = qs
s += 1
# transformation of intermediate estimates
if transform_prior is not None and not converged:
qs = transform_prior(qs)
if not converged:
print('[warning] the method has reached the maximum number of iterations; it might not have converged')
return qs, ps
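Putting the fragments of this diff together, the following is a self-contained sketch of the EM routine described above, with the optional transform_prior hook applied to each intermediate estimate (an approximation for illustration, not the library's exact code):

import numpy as np

def em_sketch(tr_prev, posteriors, epsilon=1e-4, max_iter=1000, transform_prior=None):
    Px = posteriors                          # P(y|x) from the classifier, shape (n_instances, n_classes)
    Ptr = np.copy(tr_prev)                   # training prevalence
    qs = np.copy(Ptr)                        # running prevalence estimate, initialized at Ptr
    qs_prev_ = None
    for _ in range(max_iter):
        # E-step: reweight the posteriors by the ratio of current to training prevalence
        ps = (qs / Ptr) * Px
        ps /= ps.sum(axis=1, keepdims=True)
        # M-step: the new prevalence estimate is the mean of the corrected posteriors
        qs = ps.mean(axis=0)
        if qs_prev_ is not None and np.mean(np.abs(qs - qs_prev_)) < epsilon:
            return qs, ps
        qs_prev_ = qs
        # optional ad-hoc regularization of the intermediate estimate
        if transform_prior is not None:
            qs = transform_prior(qs)
    print('[warning] the method has reached the maximum number of iterations; it might not have converged')
    return qs, ps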
class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
"""
`Hellinger Distance y <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDy).
@@ -1121,4 +1161,4 @@ class OneVsAll(AggregativeQuantifier):
:return: boolean
"""
return self.binary_quantifier.probabilistic