diff --git a/KDEy/kdey_devel.py b/KDEy/kdey_devel.py
index a0e3b26..fa3713b 100644
--- a/KDEy/kdey_devel.py
+++ b/KDEy/kdey_devel.py
@@ -13,6 +13,7 @@ import quapy.functional as F
 from sklearn.metrics.pairwise import rbf_kernel
 from scipy import optimize
 from tqdm import tqdm
+import quapy.functional as F

 epsilon = 1e-10

@@ -102,6 +103,7 @@ class KDEyMLauto(KDEyML):
         bounds = [(0.00001, 1)]
         r = optimize.minimize(neg_loglikelihood_bandwidth, x0=[current_bandwidth], method='SLSQP', bounds=bounds)
         print(f'iterations-bandwidth={r.nit}')
+        assert r.success, 'Process did not converge!'
         return r.x[0]

     def optim_minimize_both(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes):
@@ -120,6 +122,7 @@ class KDEyMLauto(KDEyML):
         prevalence_bandwidth = np.append(current_prev, current_bandwidth)
         r = optimize.minimize(neg_loglikelihood_bandwidth, x0=prevalence_bandwidth, method='SLSQP', bounds=bounds, constraints=constraints)
         print(f'iterations-both={r.nit}')
+        assert r.success, 'Process did not converge!'
         prev_band = r.x
         current_prevalence = prev_band[:-1]
         current_bandwidth = prev_band[-1]
@@ -141,6 +144,7 @@ class KDEyMLauto(KDEyML):
         prevalence_bandwidth = np.concatenate((current_prev, current_bandwidth))
         r = optimize.minimize(neg_loglikelihood_bandwidth, x0=prevalence_bandwidth, method='SLSQP', bounds=bounds, constraints=constraints)
         print(f'iterations-both-fine={r.nit}')
+        assert r.success, 'Process did not converge!'
         prev_band = r.x
         current_prevalence = prev_band[:n_classes]
         current_bandwidth = prev_band[n_classes:]
@@ -213,7 +217,7 @@ class KDEyMLauto(KDEyML):
         init_prev = np.full(fill_value=1 / n_classes, shape=(n_classes,))

         def neglikelihood_band(bandwidth):
-            mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth)
+            mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth[0])
             test_densities = [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities]

             def neg_loglikelihood_prev(prev):
@@ -225,10 +229,11 @@ class KDEyMLauto(KDEyML):

             return neglikelihood

-        bounds = [(0.0001, 1)]
+        bounds = [(0.0001, 0.2)]
         r = optimize.minimize(neglikelihood_band, x0=[0.001], method='SLSQP', bounds=bounds)

         best_band = r.x[0]
+        assert r.success, 'Process did not converge!'
         print(f'solved in nit={r.nit}')
         return best_band

@@ -247,8 +252,9 @@ def optim_minimize(loss: Callable, init_prev: np.ndarray, return_loss=False):
     # solutions are bounded to those contained in the unit-simplex
     bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
     constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
-    r = optimize.minimize(loss, x0=init_prev, method='SLSQP', bounds=bounds, constraints=constraints, tol=1e-10)
+    r = optimize.minimize(loss, x0=init_prev, method='SLSQP', bounds=bounds, constraints=constraints)
     # print(f'iterations-prevalence={r.nit}')
+    assert r.success, 'Process did not converge!'
     if return_loss:
         return r.x, r.fun
     else:
         return r.x
@@ -299,27 +305,36 @@ class KDEyMLauto2(KDEyML):

         if self.target == 'likelihood+':
             def neg_loglikelihood_band_(bandwidth):
+                bandwidth = bandwidth[0]
                 mix_densities = self.get_mixture_components(*train.Xy, train.classes_, bandwidth)

+                loss_accum = 0
+                for (sample, prevtrue) in prot():
+                    test_densities2 = [self.pdf(kde_i, sample) for kde_i in mix_densities]
-                for (sample, prev) in tqdm(prot(), total=repeats):
-                    test_densities = [self.pdf(kde_i, sample) for kde_i in mix_densities]
-
-                    def neg_loglikelihood_prev_(prev):
-                        test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
+                    def neg_loglikelihood_prev(prev):
+                        test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities2))
                         test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
-                        return -np.sum(test_loglikelihood)
+                        nll = -np.sum(test_loglikelihood)
+                        # print(f'\t\tprev={F.strprev(prev)} got {nll=}')
+                        return nll

-                    pred_prev, loss_val = optim_minimize(neg_loglikelihood_prev_, init_prev, return_loss=True)
-                    loss_accum += loss_val
+                    init_prev = np.full(fill_value=1 / n_classes, shape=(n_classes,))
+                    pred_prev, neglikelihood = optim_minimize(neg_loglikelihood_prev, init_prev, return_loss=True)
+                    # print(f'\t\tprev={F.strprev(pred_prev)} (true={F.strprev(prev)}) got {neglikelihood=}')
+                    loss_accum += neglikelihood

+                print(f'\t{bandwidth=:.8f} got {loss_accum=:.8f}')
                 return loss_accum

             bounds = [tuple((0.0001, 0.2))]
-            init_bandwidth = 0.05
-            r = optimize.minimize(neg_loglikelihood_band_, x0=[init_bandwidth], method='SLSQP', bounds=bounds)
+            init_bandwidth = 0.1
+            r = optimize.minimize(neg_loglikelihood_band_, x0=[init_bandwidth], method='Nelder-Mead', bounds=bounds, tol=1)
             best_band = r.x[0]
+            best_loss_val = r.fun
             nit = r.nit
+            assert r.success, 'Process did not converge!'
+            #found bandwidth=0.00994664 after nit=3 iterations (loss_val=-212247.24305)

         else:
             best_band = None
@@ -350,5 +365,24 @@ class KDEyMLauto2(KDEyML):
                     best_band = bandwidth
             nit=20

-        print(f'found bandwidth={best_band:.4f} after {nit=} iterations')  # (loss_val={best_loss_val:.5f})')
+        print(f'found bandwidth={best_band:.8f} after {nit=} iterations (loss_val={best_loss_val:.5f})')
         self.bandwidth_ = best_band
+
+
+class KDEyMLred(KDEyML):
+    def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, random_state=None, reduction=100, max_reduced=500):
+        self.classifier = qp._get_classifier(classifier)
+        self.val_split = val_split
+        self.bandwidth = KDEBase._check_bandwidth(bandwidth)
+        self.reduction = reduction
+        self.max_reduced = max_reduced
+        self.random_state = random_state
+
+    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+        n_classes = classif_predictions.n_classes
+        tr_length = min(self.reduction * n_classes, self.max_reduced)
+        if len(classif_predictions) > tr_length:
+            classif_predictions = classif_predictions.sampling(tr_length)
+        self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)
+        return self
+
diff --git a/KDEy/quantification_evaluation.py b/KDEy/quantification_evaluation.py
index 4ad3176..9cc8a8e 100644
--- a/KDEy/quantification_evaluation.py
+++ b/KDEy/quantification_evaluation.py
@@ -7,7 +7,7 @@ import numpy as np
 from sklearn.linear_model import LogisticRegression

 import quapy as qp
-from KDEy.kdey_devel import KDEyMLauto, KDEyMLauto2
+from KDEy.kdey_devel import KDEyMLauto, KDEyMLauto2, KDEyMLred
 from quapy.method.aggregative import PACC, EMQ, KDEyML
 from quapy.model_selection import GridSearchQ
 from quapy.protocol import UPP
@@ -35,10 +35,11 @@ METHODS = [
     ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)),
     ('EMQ', EMQ(newLR()), wrap_hyper(logreg_grid)),
     ('KDEy-ML', KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.logspace(-4, np.log10(0.2), 20)}}),
+    ('KDEy-MLred', KDEyMLred(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.logspace(-4, np.log10(0.2), 20)}}),
     ('KDEy-ML-scott', KDEyML(newLR(), bandwidth='scott'), wrap_hyper(logreg_grid)),
     ('KDEy-ML-silver', KDEyML(newLR(), bandwidth='silverman'), wrap_hyper(logreg_grid)),
     ('KDEy-ML-autoLike', KDEyMLauto2(newLR(), bandwidth='auto', target='likelihood'), wrap_hyper(logreg_grid)),
-    ('KDEy-ML-autoLike+', KDEyMLauto2(newLR(), bandwidth='auto', target='likelihood+'), wrap_hyper(logreg_grid)),
+    # ('KDEy-ML-autoLike+', KDEyMLauto2(newLR(), bandwidth='auto', target='likelihood+'), wrap_hyper(logreg_grid)),  <-- does not work
     ('KDEy-ML-autoAE', KDEyMLauto2(newLR(), bandwidth='auto', target='mae'), wrap_hyper(logreg_grid)),
     ('KDEy-ML-autoRAE', KDEyMLauto2(newLR(), bandwidth='auto', target='mrae'), wrap_hyper(logreg_grid)),
 ]
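Note on optim_minimize: the prevalence search is a standard simplex-constrained minimization, SLSQP with box bounds in [0, 1] plus an equality constraint forcing the coordinates to sum to 1, with r.success checked so that a silent convergence failure is not mistaken for an estimate. A minimal self-contained sketch of the same pattern; the quadratic toy loss is a hypothetical stand-in for the negative log-likelihood:

    import numpy as np
    from scipy import optimize

    def minimize_on_simplex(loss, n_classes):
        # uniform starting point inside the simplex
        init_prev = np.full(fill_value=1 / n_classes, shape=(n_classes,))
        bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0, 1]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
        r = optimize.minimize(loss, x0=init_prev, method='SLSQP', bounds=bounds, constraints=constraints)
        assert r.success, 'Process did not converge!'
        return r.x, r.fun

    # toy objective whose minimizer lies inside the simplex
    target = np.asarray([0.2, 0.5, 0.3])
    prev, val = minimize_on_simplex(lambda p: np.sum((p - target) ** 2), n_classes=3)
    print(prev)  # approx. [0.2, 0.5, 0.3], up to the solver tolerance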
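Note on the likelihood+ bandwidth search: every evaluation of neg_loglikelihood_band_ runs a full prevalence optimization per protocol sample, so the outer problem is a 1-D, bounded, derivative-free minimization of an expensive objective; Nelder-Mead with a loose tol keeps the number of evaluations small. A minimal sketch of the same optimize.minimize calling pattern, with a hypothetical parabola standing in for the real objective (Nelder-Mead accepts bounds from scipy>=1.7):

    import numpy as np
    from scipy import optimize

    def objective(bandwidth):
        b = bandwidth[0]  # scipy passes the current point as a 1-element array
        return 1e4 * (b - 0.05) ** 2  # hypothetical loss with its minimum at b=0.05

    bounds = [(0.0001, 0.2)]  # admissible bandwidth range
    r = optimize.minimize(objective, x0=[0.1], method='Nelder-Mead', bounds=bounds, tol=1)
    assert r.success, 'Process did not converge!'
    print(f'bandwidth={r.x[0]:.8f} after nit={r.nit} iterations')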
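Note on KDEyMLred: it caps the size of the KDE mixture by subsampling the classifier's validation posteriors down to min(reduction * n_classes, max_reduced) points before fitting the per-class KDEs, so later pdf evaluations stay cheap however large the training set is. A hypothetical usage sketch, assuming quapy's standard aggregative workflow (fit on a LabelledCollection, quantify on raw covariates); the synthetic data and hyperparameter values are illustrative only:

    import numpy as np
    import quapy as qp
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from KDEy.kdey_devel import KDEyMLred

    # synthetic 3-class data wrapped in quapy's LabelledCollection
    X, y = make_classification(n_samples=5000, n_classes=3, n_informative=8, random_state=0)
    train, test = qp.data.LabelledCollection(X, y).split_stratified(train_prop=0.6)

    # the per-class KDEs are fit on at most min(100 * 3, 500) validation posteriors
    model = KDEyMLred(LogisticRegression(), bandwidth=0.05, reduction=100, max_reduced=500)
    model.fit(train)
    estim_prev = model.quantify(test.X)
    print(f'estimated prevalence: {np.round(estim_prev, 4)}')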