ipacc test

This commit is contained in:
Alejandro Moreo Fernandez 2021-06-22 08:21:41 +02:00
parent 986e61620c
commit fe8010978d
1 changed files with 67 additions and 4 deletions

View File

@ -316,16 +316,21 @@ class PACC(AggregativeProbabilisticQuantifier):
self.pcc = PCC(self.learner)
self.Pte_cond_estim_ = self.ProbConfusionTable(classes, y, y_)
return self
@classmethod
def ProbConfusionTable(cls, classes, y, y_):
    """
    Estimate the probabilistic confusion table from true labels and posteriors.

    Entry (i, j) of the returned matrix is the estimate of P(yi|yj), that is,
    the probability that a document that belongs to yj ends up being
    classified as belonging to yi.

    :param classes: sequence of class labels; its order fixes the row/column
        order of the table
    :param y: array of labels, one per instance, compared against each entry
        of `classes`
    :param y_: array indexed by instance along axis 0 whose rows are averaged
        per class (presumably posterior probabilities of shape
        (n_instances, n_classes) -- confirm against the caller)
    :return: array of shape (n_classes, n_classes)
    """
    n_classes = len(classes)
    confusion = np.empty(shape=(n_classes, n_classes))
    for i, class_ in enumerate(classes):
        sel = y_[y == class_]
        # a class with no instances would make mean() return NaN (and emit a
        # warning); its row is set to zeros instead
        confusion[i] = sel.mean(axis=0) if sel.size > 0 else 0
    # rows were filled per true class; transpose so that columns index the
    # true class and rows index the predicted class
    return confusion.T
def aggregate(self, classif_posteriors):
prevs_estim = self.pcc.aggregate(classif_posteriors)
@ -391,6 +396,64 @@ class EMQ(AggregativeProbabilisticQuantifier):
return qs, ps
class IPACC(AggregativeProbabilisticQuantifier):
    """
    Experimental iterative quantifier that alternates an EM-style rescaling of
    the classifier posteriors with a prevalence estimate obtained by averaging
    a PCC-like estimate (mean of the posteriors) and a PACC-like estimate
    (that mean adjusted through a probabilistic confusion table).
    """

    # NOTE(review): MAX_ITER is declared here but never read by this class;
    # the loop in EM() is bounded by EMQ.MAX_ITER instead. Using this value (2)
    # would make the `s > 10` convergence test in EM() unreachable, so the
    # original bound is kept -- confirm which limit is intended.
    MAX_ITER = 2
    EPSILON = 1e-2

    def __init__(self, learner: BaseEstimator):
        self.learner = learner

    def fit(self, data: LabelledCollection, fit_learner=True):
        """
        Prepare the learner through `training_helper` and store the training
        prevalence computed from `data.labels`.

        :param data: the training collection
        :param fit_learner: forwarded to `training_helper`
        :return: self
        """
        self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
        # `self.classes_` is not assigned in this class; presumably provided by
        # the base class -- TODO confirm
        self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
        return self

    def aggregate(self, classif_posteriors, epsilon=EPSILON):
        """
        Run :meth:`EM` on the given posteriors and return the prevalence
        estimate (the first element of the pair returned by :meth:`EM`).
        """
        priors, _ = self.EM(self.train_prevalence, classif_posteriors, epsilon)
        return priors

    def predict_proba(self, instances, epsilon=EPSILON):
        """
        Compute the learner's posteriors for `instances`, run :meth:`EM` on
        them and return the rescaled posteriors (the second element of the
        pair returned by :meth:`EM`).
        """
        classif_posteriors = self.learner.predict_proba(instances)
        _, posteriors = self.EM(self.train_prevalence, classif_posteriors, epsilon)
        return posteriors

    @classmethod
    def EM(cls, tr_prev, posterior_probabilities, epsilon=EPSILON):
        """
        Iteratively re-estimate class prevalences and rescaled posteriors.

        :param tr_prev: array with the training prevalence; used both as the
            initial estimate and as the denominator of the E-step rescaling
        :param posterior_probabilities: 2-D array whose second axis indexes
            the classes (its `shape[1]` is taken as the number of classes)
        :param epsilon: threshold on `qp.error.mae` between two consecutive
            prevalence estimates below which the loop may stop
        :return: tuple `(qs, ps)` with the last prevalence estimate and the
            last rescaled posteriors
        """
        Px = posterior_probabilities
        Ptr = np.copy(tr_prev)
        qs = np.copy(Ptr)  # qs (the running estimate) is initialized as the training prevalence
        # bound up-front so that the final return is well defined even if the
        # loop body never executes
        ps = Px
        classes = np.arange(posterior_probabilities.shape[1])

        s, converged = 0, False
        qs_prev_ = None
        while not converged and s < EMQ.MAX_ITER:
            # E-step: ps is Ps(y|xi)
            ps_unnormalized = (qs / Ptr) * Px
            ps = ps_unnormalized / ps_unnormalized.sum(axis=1, keepdims=True)

            # M-step: the current most likely class of each instance plays the
            # role of its label when building the probabilistic confusion table
            y_belief = np.argmax(ps, axis=-1)
            p_conf_table = PACC.ProbConfusionTable(classes, y_belief, ps)
            pcc_estim = ps.mean(axis=0)
            pacc_estims = ACC.solve_adjustment(p_conf_table, pcc_estim)
            # the new estimate is the plain average of both estimates
            qs = pcc_estim * 0.5 + pacc_estims * 0.5

            # convergence is only accepted once more than 10 iterations have
            # been completed
            if qs_prev_ is not None and qp.error.mae(qs, qs_prev_) < epsilon and s > 10:
                converged = True

            qs_prev_ = qs
            s += 1

        if not converged:
            print('[warning] the method has reached the maximum number of iterations; it might have not converged')

        return qs, ps
class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
"""
Implementation of the method based on the Hellinger Distance y (HDy) proposed by