reduction kdey

parent 4a3b18b3a3
commit da006ee89a
@@ -13,6 +13,7 @@ import quapy.functional as F
 from sklearn.metrics.pairwise import rbf_kernel
 from scipy import optimize
 from tqdm import tqdm
+import quapy.functional as F

 epsilon = 1e-10

@@ -102,6 +103,7 @@ class KDEyMLauto(KDEyML):
         bounds = [(0.00001, 1)]
         r = optimize.minimize(neg_loglikelihood_bandwidth, x0=[current_bandwidth], method='SLSQP', bounds=bounds)
         print(f'iterations-bandwidth={r.nit}')
+        assert r.success, 'Process did not converge!'
         return r.x[0]

     def optim_minimize_both(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes):
@@ -120,6 +122,7 @@ class KDEyMLauto(KDEyML):
         prevalence_bandwidth = np.append(current_prev, current_bandwidth)
         r = optimize.minimize(neg_loglikelihood_bandwidth, x0=prevalence_bandwidth, method='SLSQP', bounds=bounds, constraints=constraints)
         print(f'iterations-both={r.nit}')
+        assert r.success, 'Process did not converge!'
         prev_band = r.x
         current_prevalence = prev_band[:-1]
         current_bandwidth = prev_band[-1]
@@ -141,6 +144,7 @@ class KDEyMLauto(KDEyML):
         prevalence_bandwidth = np.concatenate((current_prev, current_bandwidth))
         r = optimize.minimize(neg_loglikelihood_bandwidth, x0=prevalence_bandwidth, method='SLSQP', bounds=bounds, constraints=constraints)
         print(f'iterations-both-fine={r.nit}')
+        assert r.success, 'Process did not converge!'
         prev_band = r.x
         current_prevalence = prev_band[:n_classes]
         current_bandwidth = prev_band[n_classes:]
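Note: the three hunks above add the same convergence guard after each SLSQP call. For reference, a minimal standalone sketch of how SciPy reports convergence (toy objective, not project code):

    from scipy import optimize

    # r is a scipy.optimize.OptimizeResult; r.success flags convergence
    # and r.message explains failures (e.g. 'Iteration limit reached')
    r = optimize.minimize(lambda x: (x[0] - 0.5) ** 2, x0=[0.1],
                          method='SLSQP', bounds=[(0, 1)])
    assert r.success, r.message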
@@ -213,7 +217,7 @@ class KDEyMLauto(KDEyML):
         init_prev = np.full(fill_value=1 / n_classes, shape=(n_classes,))

         def neglikelihood_band(bandwidth):
-            mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth)
+            mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth[0])
             test_densities = [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities]

             def neg_loglikelihood_prev(prev):
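Note: the change from `bandwidth` to `bandwidth[0]` reflects that `optimize.minimize` always passes the current solution to the objective as a 1-D ndarray, even for a single variable, so indexing recovers the scalar. A toy sketch (not project code):

    import numpy as np
    from scipy import optimize

    def objective(x):
        b = x[0]  # x has shape (1,); unpack the scalar bandwidth
        return (b - 0.05) ** 2

    r = optimize.minimize(objective, x0=[0.001], method='SLSQP', bounds=[(0.0001, 1)])
    print(r.x[0])  # ~0.05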
@@ -225,10 +229,11 @@ class KDEyMLauto(KDEyML):

             return neglikelihood

-        bounds = [(0.0001, 1)]
+        bounds = [(0.0001, 0.2)]
         r = optimize.minimize(neglikelihood_band, x0=[0.001], method='SLSQP', bounds=bounds)

         best_band = r.x[0]
+        assert r.success, 'Process did not converge!'
         print(f'solved in nit={r.nit}')
         return best_band

@@ -247,8 +252,9 @@ def optim_minimize(loss: Callable, init_prev: np.ndarray, return_loss=False):
     # solutions are bounded to those contained in the unit-simplex
     bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
     constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
-    r = optimize.minimize(loss, x0=init_prev, method='SLSQP', bounds=bounds, constraints=constraints, tol=1e-10)
+    r = optimize.minimize(loss, x0=init_prev, method='SLSQP', bounds=bounds, constraints=constraints)
     # print(f'iterations-prevalence={r.nit}')
+    assert r.success, 'Process did not converge!'
     if return_loss:
         return r.x, r.fun
     else:
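Note: dropping `tol=1e-10` reverts `optim_minimize` to SLSQP's default tolerance. The helper searches the unit simplex: box bounds keep each prevalence in [0, 1] and the equality constraint forces the components to sum to 1. A self-contained toy sketch of the same pattern (hypothetical loss, not project code):

    import numpy as np
    from scipy import optimize

    target = np.asarray([0.2, 0.3, 0.5])  # toy optimum inside the simplex

    def loss(prev):
        return np.sum((prev - target) ** 2)

    n_classes = 3
    init_prev = np.full(n_classes, 1 / n_classes)
    bounds = tuple((0, 1) for _ in range(n_classes))          # values in [0,1]
    constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # sum to 1
    r = optimize.minimize(loss, x0=init_prev, method='SLSQP',
                          bounds=bounds, constraints=constraints)
    print(r.x)  # ~[0.2, 0.3, 0.5]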
@@ -299,27 +305,36 @@ class KDEyMLauto2(KDEyML):

         if self.target == 'likelihood+':
             def neg_loglikelihood_band_(bandwidth):
+                bandwidth = bandwidth[0]
                 mix_densities = self.get_mixture_components(*train.Xy, train.classes_, bandwidth)

                 loss_accum = 0
+                for (sample, prevtrue) in prot():
+                    test_densities2 = [self.pdf(kde_i, sample) for kde_i in mix_densities]

-                for (sample, prev) in tqdm(prot(), total=repeats):
-                    test_densities = [self.pdf(kde_i, sample) for kde_i in mix_densities]
-
-                    def neg_loglikelihood_prev_(prev):
-                        test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
+                    def neg_loglikelihood_prev(prev):
+                        test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities2))
                         test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
-                        return -np.sum(test_loglikelihood)
+                        nll = -np.sum(test_loglikelihood)
+                        # print(f'\t\tprev={F.strprev(prev)} got {nll=}')
+                        return nll

-                    pred_prev, loss_val = optim_minimize(neg_loglikelihood_prev_, init_prev, return_loss=True)
-                    loss_accum += loss_val
+                    init_prev = np.full(fill_value=1 / n_classes, shape=(n_classes,))
+                    pred_prev, neglikelihood = optim_minimize(neg_loglikelihood_prev, init_prev, return_loss=True)
+                    # print(f'\t\tprev={F.strprev(pred_prev)} (true={F.strprev(prev)}) got {neglikelihood=}')
+                    loss_accum += neglikelihood

+                print(f'\t{bandwidth=:.8f} got {loss_accum=:.8f}')
                 return loss_accum

             bounds = [tuple((0.0001, 0.2))]
-            init_bandwidth = 0.05
-            r = optimize.minimize(neg_loglikelihood_band_, x0=[init_bandwidth], method='SLSQP', bounds=bounds)
+            init_bandwidth = 0.1
+            r = optimize.minimize(neg_loglikelihood_band_, x0=[init_bandwidth], method='Nelder-Mead', bounds=bounds, tol=1)
             best_band = r.x[0]
+            best_loss_val = r.fun
             nit = r.nit
+            assert r.success, 'Process did not converge!'
+            # found bandwidth=0.00994664 after nit=3 iterations loss_val=-212247.24305)

         else:
             best_band = None
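Note: the outer bandwidth search switches from SLSQP to gradient-free Nelder-Mead with a coarse `tol=1`: every evaluation of `neg_loglikelihood_band_` re-runs a full prevalence fit on each protocol sample, so objective evaluations are expensive and the loose tolerance caps the iteration count. A minimal sketch of this nested pattern with a hypothetical stand-in objective (bounds support for Nelder-Mead requires SciPy >= 1.7):

    import numpy as np
    from scipy import optimize

    def outer_objective(x):
        bandwidth = x[0]
        # stand-in for the accumulated negative log-likelihood over samples;
        # the real code loops over prot() and calls optim_minimize per sample
        return (np.log10(bandwidth) + 2) ** 2

    r = optimize.minimize(outer_objective, x0=[0.1], method='Nelder-Mead',
                          bounds=[(0.0001, 0.2)], tol=1)
    print(r.x[0], r.nit)  # coarse optimum found in a handful of iterations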
@@ -350,5 +365,24 @@ class KDEyMLauto2(KDEyML):
             best_band = bandwidth
             nit=20

-        print(f'found bandwidth={best_band:.4f} after {nit=} iterations')  # (loss_val={best_loss_val:.5f})')
+        print(f'found bandwidth={best_band:.8f} after {nit=} iterations loss_val={best_loss_val:.5f})')
         self.bandwidth_ = best_band
+
+
+class KDEyMLred(KDEyML):
+    def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, random_state=None, reduction=100, max_reduced=500):
+        self.classifier = qp._get_classifier(classifier)
+        self.val_split = val_split
+        self.bandwidth = KDEBase._check_bandwidth(bandwidth)
+        self.reduction = reduction
+        self.max_reduced = max_reduced
+        self.random_state = random_state
+
+    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+        n_classes = classif_predictions.n_classes
+        tr_length = min(self.reduction * n_classes, self.max_reduced)
+        if len(classif_predictions) > tr_length:
+            classif_predictions = classif_predictions.sampling(tr_length)
+        self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)
+        return self
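Note: `KDEyMLred` behaves like `KDEyML` but, in `aggregation_fit`, subsamples the classifier predictions to at most min(reduction * n_classes, max_reduced) instances before fitting the per-class KDEs, reducing the cost of density estimation. A hypothetical usage sketch on synthetic data, assuming the surrounding QuaPy API:

    import numpy as np
    from quapy.data import LabelledCollection
    from sklearn.linear_model import LogisticRegression
    from KDEy.kdey_devel import KDEyMLred

    # toy 3-class data standing in for a real dataset
    X = np.vstack([np.random.randn(400, 2) + center
                   for center in ([0, 0], [4, 0], [0, 4])])
    y = np.repeat([0, 1, 2], 400)
    train, test = LabelledCollection(X, y).split_stratified(0.7)

    model = KDEyMLred(LogisticRegression(), bandwidth=0.1, reduction=100, max_reduced=500)
    model.fit(train)               # KDEs fitted on at most min(100*3, 500)=300 instances
    print(model.quantify(test.X))  # estimated class prevalences

The remaining two hunks update the benchmarking script, which imports the new class from KDEy.kdey_devel.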
@@ -7,7 +7,7 @@ import numpy as np
 from sklearn.linear_model import LogisticRegression

 import quapy as qp
-from KDEy.kdey_devel import KDEyMLauto, KDEyMLauto2
+from KDEy.kdey_devel import KDEyMLauto, KDEyMLauto2, KDEyMLred
 from quapy.method.aggregative import PACC, EMQ, KDEyML
 from quapy.model_selection import GridSearchQ
 from quapy.protocol import UPP
@@ -35,10 +35,11 @@ METHODS = [
     ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)),
     ('EMQ', EMQ(newLR()), wrap_hyper(logreg_grid)),
     ('KDEy-ML', KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.logspace(-4, np.log10(0.2), 20)}}),
+    ('KDEy-MLred', KDEyMLred(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.logspace(-4, np.log10(0.2), 20)}}),
     ('KDEy-ML-scott', KDEyML(newLR(), bandwidth='scott'), wrap_hyper(logreg_grid)),
     ('KDEy-ML-silver', KDEyML(newLR(), bandwidth='silverman'), wrap_hyper(logreg_grid)),
     ('KDEy-ML-autoLike', KDEyMLauto2(newLR(), bandwidth='auto', target='likelihood'), wrap_hyper(logreg_grid)),
-    ('KDEy-ML-autoLike+', KDEyMLauto2(newLR(), bandwidth='auto', target='likelihood+'), wrap_hyper(logreg_grid)),
+    # ('KDEy-ML-autoLike+', KDEyMLauto2(newLR(), bandwidth='auto', target='likelihood+'), wrap_hyper(logreg_grid)),  <-- does not work
     ('KDEy-ML-autoAE', KDEyMLauto2(newLR(), bandwidth='auto', target='mae'), wrap_hyper(logreg_grid)),
     ('KDEy-ML-autoRAE', KDEyMLauto2(newLR(), bandwidth='auto', target='mrae'), wrap_hyper(logreg_grid)),
 ]
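Note: the grid `np.logspace(-4, np.log10(0.2), 20)` spans bandwidths from 0.0001 to 0.2 on a logarithmic scale, matching the (0.0001, 0.2) bounds used by the automatic bandwidth search above:

    import numpy as np

    grid = np.logspace(-4, np.log10(0.2), 20)
    print(grid[0], grid[-1])  # 0.0001 ... 0.2 (20 log-spaced values)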