forked from moreo/QuaPy
launching new KDEy-HD with importance sampling
parent ac88dc8961
commit dcfd61ae5b
@@ -8,7 +8,7 @@ from distribution_matching.method_dirichlety import DIRy
from sklearn.linear_model import LogisticRegression
from method_kdey_closed_efficient import KDEyclosed_efficient

METHODS = ['ACC', 'PACC', 'HDy-OvA', 'DM-T', 'DM-HD', 'KDEy-DMhd3', 'DM-CS', 'KDEy-closed++', 'DIR', 'EMQ', 'KDEy-ML'] #['ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DMhd3', 'KDEy-closed++', 'EMQ', 'KDEy-ML'] #, 'KDEy-DMhd2'] #, 'KDEy-DMhd2', 'DM-HD'] 'KDEy-DMjs', 'KDEy-DM', 'KDEy-ML+', 'KDEy-DMhd3+', 'EMQ-C',
METHODS = ['ACC', 'PACC', 'HDy-OvA', 'DM-T', 'DM-HD', 'KDEy-DMhd3', 'KDEy-DMhd4', 'DM-CS', 'KDEy-closed++', 'DIR', 'EMQ', 'KDEy-ML'] #['ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DMhd3', 'KDEy-closed++', 'EMQ', 'KDEy-ML'] #, 'KDEy-DMhd2'] #, 'KDEy-DMhd2', 'DM-HD'] 'KDEy-DMjs', 'KDEy-DM', 'KDEy-ML+', 'KDEy-DMhd3+', 'EMQ-C',
BIN_METHODS = [x.replace('-OvA', '') for x in METHODS]
@@ -129,6 +129,13 @@ def new_method(method, **lr_kwargs):
        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
        param_grid = {**method_params, **hyper_LR}
        quantifier = KDEy(lr, target='min_divergence', divergence='HD', montecarlo_trials=5000, val_split=10)
    elif method in ['KDEy-DMhd4']:
        # This is the new version in which we apply importance sampling, i.e., we compute:
        # D(p_a||q) = 1/N sum_x f(p(x)/q(x)) * (q(x)/r(x))
        # where x ~iid r, with r = p_u, and u = (1/n, 1/n, ..., 1/n) the uniform vector
        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
        param_grid = {**method_params, **hyper_LR}
        quantifier = KDEy(lr, target='min_divergence', divergence='HD', montecarlo_trials=5000, val_split=10)
    elif method == 'DM-HD':
        method_params = {
            'nbins': [4,8,16,32],
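The identity behind the comment above is the standard importance-sampling rewrite of an f-divergence, D_f(p_a||q) = E_{x~q}[f(p_a(x)/q(x))] = E_{x~r}[f(p_a(x)/q(x)) * q(x)/r(x)]. A minimal sketch of this estimator, with simple 1-D Gaussians standing in for the KDE mixtures (all names here are illustrative and do not come from the repository):

import numpy as np
from scipy.stats import norm

def f_squared_hellinger(t):
    return (np.sqrt(t) - 1) ** 2

p = norm(loc=0.0, scale=1.0)    # plays the role of the mixture p_a
q = norm(loc=0.5, scale=1.0)    # plays the role of the test density
r = norm(loc=0.25, scale=2.0)   # proposal density (the role of the uniform-mixture reference)

x = r.rvs(size=5000, random_state=0)          # x ~ iid r
ratio = p.pdf(x) / q.pdf(x)
weights = q.pdf(x) / r.pdf(x)                 # importance weights q(x)/r(x)
estimate = np.mean(f_squared_hellinger(ratio) * weights)
print(estimate)  # Monte Carlo approximation of E_{x~q}[f(p_a(x)/q(x))]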
@@ -14,9 +14,10 @@ from quapy.data import LabelledCollection
from quapy.protocol import APP, UPP
from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
    DistributionMatching, _get_divergence
import quapy.functional as F
import scipy
from scipy import optimize
from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
#from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional

# TODO: optimize the bandwidth automatically
@@ -97,11 +98,7 @@ class KDEy(AggregativeProbabilisticQuantifier):
        if self.engine == 'scipy':
            return kde(posteriors[:, :-1].T)
        elif self.engine == 'sklearn':
            #print('pdf...')
            densities = np.exp(kde.score_samples(posteriors))
            #print('[pdf done]')
            return densities
            #return np.exp(kde.score_samples(posteriors))
            return np.exp(kde.score_samples(posteriors))

    def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
        """
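For reference, the sklearn branch of pdf relies on KernelDensity.score_samples returning log-densities, which is why the result is exponentiated. A small self-contained sketch with toy data on the probability simplex (not code from the commit):

import numpy as np
from sklearn.neighbors import KernelDensity

posteriors = np.random.RandomState(0).dirichlet(alpha=[1, 1, 1], size=100)  # toy posterior vectors
kde = KernelDensity(bandwidth=0.1).fit(posteriors)
densities = np.exp(kde.score_samples(posteriors))  # score_samples returns log p(x)
print(densities[:3])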
@@ -130,6 +127,12 @@ class KDEy(AggregativeProbabilisticQuantifier):
            self.samples = qp.functional.uniform_prevalence_sampling(n_classes=data.n_classes, size=self.montecarlo_trials)
            self.sample_densities = [self.pdf(kde_i, self.samples) for kde_i in self.val_densities]
        elif self.target == 'min_divergence':
            N = self.montecarlo_trials
            rs = self.random_state
            self.reference_samples = np.vstack([kde_i.sample(N, random_state=rs) for kde_i in self.val_densities])
            self.reference_classwise_densities = np.asarray([self.pdf(kde_j, self.reference_samples) for kde_j in self.val_densities])
            self.reference_density = np.mean(self.reference_classwise_densities, axis=0)  # equiv. to (uniform @ self.reference_classwise_densities)
        elif self.target == 'min_divergence_deprecated':  # the version of the first draft, with n*N presampled, then alpha*N chosen per class
            self.class_samples = [kde_i.sample(self.montecarlo_trials, random_state=self.random_state) for kde_i in self.val_densities]
            self.class_sample_densities = {}
            for ci, samples_i in enumerate(self.class_samples):
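The reference (proposal) density used for importance sampling is the uniform mixture of the class-wise KDEs, so its value at the pooled samples is simply the mean of the class-wise densities, as computed above. A hypothetical sketch with toy 2-D data (attribute-like names mirror the code, but the example is not taken from the repository):

import numpy as np
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
class_data = [rng.normal(loc=c, scale=1.0, size=(50, 2)) for c in range(3)]
val_densities = [KernelDensity(bandwidth=0.2).fit(Xi) for Xi in class_data]   # one KDE per class

N = 1000
reference_samples = np.vstack([kde_i.sample(N, random_state=0) for kde_i in val_densities])
reference_classwise_densities = np.asarray(
    [np.exp(kde_j.score_samples(reference_samples)) for kde_j in val_densities])
reference_density = reference_classwise_densities.mean(axis=0)   # r(x) = (1/n) * sum_j p_j(x)
print(reference_samples.shape, reference_classwise_densities.shape, reference_density.shape)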
@@ -148,66 +151,40 @@ class KDEy(AggregativeProbabilisticQuantifier):
            raise ValueError('unknown target')

    # this is the variant I have in the current results, which I think is bugged
    # def _target_divergence_depr(self, posteriors):
    #     # in this variant we evaluate the divergence using a Montecarlo approach
    #     n_classes = len(self.val_densities)
    #
    #     test_kde = self.get_kde_function(posteriors)
    #     test_likelihood = self.pdf(test_kde, self.samples)
    #
    #     divergence = _get_divergence(self.divergence)
    #
    #     def match(prev):
    #         val_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, self.sample_densities))
    #         return divergence(val_likelihood, test_likelihood)
    #
    #     # the initial point is set as the uniform distribution
    #     uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
    #
    #     # solutions are bounded to those contained in the unit-simplex
    #     bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
    #     constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
    #     r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
    #     return r.x

    def _target_divergence_uniform(self, posteriors):
        # new version in which we retain all n*N examples (sampled from a mixture with uniform parameter), and then
        # apply importance sampling (IS). In this version we compute D(p_alpha||q) with IS, and not D(q||p_alpha) as
        # in the first draft
    def _target_divergence(self, posteriors):
        # in this variant we evaluate the divergence using a Montecarlo approach
        n_classes = len(self.val_densities)

        test_kde = self.get_kde_function(posteriors)
        test_likelihood = self.pdf(test_kde, self.samples)
        test_densities = self.pdf(test_kde, self.reference_samples)

        def f_squared_hellinger(t):
            return (np.sqrt(t) - 1)**2

        def f_jensen_shannon(t):
            return -(t+1)*np.log((t+1)/2) + t*np.log(t)

        def fdivergence(pi, qi, f, eps=1e-10):
            spi = pi+eps
            sqi = qi+eps
            return np.mean(f(spi/sqi)*sqi)

        def f_squared_hellinger(u):
            return (np.sqrt(u)-1)**2

        # todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway
        if self.divergence.lower() == 'hd':
            f = f_squared_hellinger
        elif self.divergence.lower() == 'js':
            f = f_jensen_shannon
        else:
            raise ValueError('only squared HD is currently implemented')

        def match(prev):
            val_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, self.sample_densities))
            return fdivergence(val_likelihood, test_likelihood, f)

        # the initial point is set as the uniform distribution
        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
        epsilon = 1e-10
        qs = test_densities + epsilon
        rs = self.reference_density + epsilon

        # solutions are bounded to those contained in the unit-simplex
        bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
        return r.x

    def _target_divergence(self, posteriors):
        def divergence(prev):
            ps = prev @ self.reference_classwise_densities + epsilon
            return np.mean( f(ps/qs) * (qs/rs) )

        return F.optim_minimize(divergence, n_classes)


    # the first version we explain in the draft, choosing alpha*N from a pool of N for each class and w/o importance sampling
    def _target_divergence_deprecated(self, posteriors):
        # in this variant we evaluate the divergence using a Montecarlo approach
        n_classes = len(self.val_densities)
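Putting the pieces of the new min_divergence target together: the divergence of the candidate mixture p_alpha from the test density q is estimated with the importance weights q/r and minimized over the probability simplex. The sketch below uses random arrays as stand-ins for the pre-computed densities, so the numbers are meaningless; it only illustrates the mechanics and is not code from the commit:

import numpy as np
from scipy import optimize

rng = np.random.RandomState(0)
n_classes, N = 3, 5000
# class-wise densities p_i(x), test density q(x) and reference r(x), all evaluated
# at the n_classes*N points drawn from the reference (random stand-ins here)
reference_classwise_densities = rng.gamma(2.0, 1.0, size=(n_classes, n_classes * N))
reference_density = reference_classwise_densities.mean(axis=0)
test_densities = rng.gamma(2.0, 1.0, size=n_classes * N)

eps = 1e-10
qs = test_densities + eps
rs = reference_density + eps
f = lambda t: (np.sqrt(t) - 1) ** 2          # squared-Hellinger generator

def divergence(prev):
    ps = prev @ reference_classwise_densities + eps   # p_alpha(x) = sum_i alpha_i * p_i(x)
    return np.mean(f(ps / qs) * (qs / rs))            # E_{x~r}[f(p_alpha/q) * q/r]

uniform = np.full(n_classes, 1 / n_classes)
bounds = tuple((0, 1) for _ in range(n_classes))
constraints = {'type': 'eq', 'fun': lambda x: 1 - sum(x)}
result = optimize.minimize(divergence, x0=uniform, method='SLSQP', bounds=bounds, constraints=constraints)
print(result.x)   # prevalence vector on the simplex minimizing the estimate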
@@ -233,7 +210,7 @@ class KDEy(AggregativeProbabilisticQuantifier):
        fdivergence = squared_hellinger

        def match(prev):
        def dissimilarity(prev):
            # choose the samples according to the prevalence vector
            # e.g., prev = [0.5, 0.3, 0.2] will draw 50% from KDE_0, 30% from KDE_1, and 20% from KDE_2
            # the points are already pre-sampled and the densities are pre-computed, so that now all that remains
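The selection step described in the comment above (the deprecated, pre-importance-sampling variant) amounts to taking round(prev_i * N) points from each pre-sampled class pool. A toy sketch with hypothetical names, not taken from the commit:

import numpy as np

N = 1000
rng = np.random.RandomState(0)
class_samples = [rng.normal(loc=c, size=(N, 2)) for c in range(3)]   # pre-sampled pool per class KDE

def select_by_prevalence(prev, class_samples):
    takes = [int(round(p_i * N)) for p_i in prev]
    return np.vstack([pool[:k] for pool, k in zip(class_samples, takes)])

mixture_sample = select_by_prevalence([0.5, 0.3, 0.2], class_samples)
print(mixture_sample.shape)   # (1000, 2): 500 + 300 + 200 points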
@@ -255,14 +232,38 @@ class KDEy(AggregativeProbabilisticQuantifier):
            # then I am computing D(Test||Val), so it should be E_{x ~ Val}[f(Test(x)/Val(x))]
            return fdivergence(test_likelihood, val_likelihood)

        # the initial point is set as the uniform distribution
        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
        return F.optim_minimize(dissimilarity, n_classes)

        # solutions are bounded to those contained in the unit-simplex
        bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
        return r.x
    # this version explores the entire simplex, and then applies importance sampling. We have not really tested it in depth but
    # it seems not to be promising
    def _target_divergence_uniform(self, posteriors):
        # in this variant we evaluate the divergence using a Montecarlo approach
        n_classes = len(self.val_densities)

        test_kde = self.get_kde_function(posteriors)
        test_likelihood = self.pdf(test_kde, self.samples)

        def f_squared_hellinger(t):
            return (np.sqrt(t) - 1)**2

        def f_jensen_shannon(t):
            return -(t+1)*np.log((t+1)/2) + t*np.log(t)

        def fdivergence(pi, qi, f, eps=1e-10):
            spi = pi+eps
            sqi = qi+eps
            return np.mean(f(spi/sqi)*sqi)

        if self.divergence.lower() == 'hd':
            f = f_squared_hellinger
        elif self.divergence.lower() == 'js':
            f = f_jensen_shannon

        def dissimilarity(prev):
            val_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, self.sample_densities))
            return fdivergence(val_likelihood, test_likelihood, f)

        return F.optim_minimize(dissimilarity, n_classes)

    def _target_likelihood(self, posteriors, eps=0.000001):
        """
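A quick numeric check of the two generator functions used in this method (not part of the commit): with f_squared_hellinger, the expectation E_q[f(p/q)] reduces to sum_x (sqrt(p)-sqrt(q))^2, and with f_jensen_shannon it reduces to KL(p||m)+KL(q||m) with m=(p+q)/2, i.e., twice the Jensen-Shannon divergence under the usual 1/2 convention:

import numpy as np

p = np.array([0.2, 0.5, 0.3])
q = np.array([0.4, 0.4, 0.2])
t = p / q

f_hd = lambda t: (np.sqrt(t) - 1) ** 2
f_js = lambda t: -(t + 1) * np.log((t + 1) / 2) + t * np.log(t)

m = (p + q) / 2
print(np.sum(q * f_hd(t)), np.sum((np.sqrt(p) - np.sqrt(q)) ** 2))                   # identical values
print(np.sum(q * f_js(t)), np.sum(p * np.log(p / m)) + np.sum(q * np.log(q / m)))    # identical values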
@@ -278,24 +279,9 @@ class KDEy(AggregativeProbabilisticQuantifier):

        #return lambda posteriors: sum(prev_i * self.pdf(kde_i, posteriors) for kde_i, prev_i in zip(self.val_densities, prev))
        def neg_loglikelihood(prev):
            #print('-neg_likelihood')
            #val_pdf = self.val_pdf(prev)
            #test_likelihood = val_pdf(posteriors)
            test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
            test_loglikelihood = np.log(test_mixture_likelihood + eps)
            neg_log_likelihood = -np.sum(test_loglikelihood)
            #print('-neg_likelihood [done!]')
            return neg_log_likelihood
            #return -np.prod(test_likelihood)
            return -np.sum(test_loglikelihood)

        # the initial point is set as the uniform distribution
        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))

        # solutions are bounded to those contained in the unit-simplex
        bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
        #print('searching for alpha')
        r = optimize.minimize(neg_loglikelihood, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
        #print('[optimization ended]')
        return r.x
        return F.optim_minimize(neg_loglikelihood, n_classes)
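The likelihood-based target reduces to a small constrained optimization: minimize the negative log-likelihood of the test points under the mixture p_alpha = sum_i alpha_i * p_i, with alpha constrained to the simplex. A self-contained sketch with random densities as placeholders (illustrative only, not code from the commit):

import numpy as np
from scipy import optimize

rng = np.random.RandomState(0)
n_classes, n_test = 3, 500
test_densities = rng.gamma(2.0, 1.0, size=(n_classes, n_test))   # stand-in for p_i(x) at each test point
eps = 1e-6

def neg_loglikelihood(prev):
    mixture = prev @ test_densities            # p_alpha(x) for every test point
    return -np.sum(np.log(mixture + eps))

uniform = np.full(n_classes, 1 / n_classes)
bounds = tuple((0, 1) for _ in range(n_classes))
constraints = {'type': 'eq', 'fun': lambda x: 1 - sum(x)}
result = optimize.minimize(neg_loglikelihood, x0=uniform, method='SLSQP', bounds=bounds, constraints=constraints)
print(result.x)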
@@ -298,3 +298,23 @@ def check_prevalence_vector(p, raise_exception=False, toleranze=1e-08):
    return True


def optim_minimize(loss, n_classes):
    """
    Searches for the optimal prevalence values, i.e., an `n_classes`-dimensional vector of the (`n_classes`-1)-simplex
    that yields the smallest loss. This optimization is carried out by means of a constrained search using scipy's
    SLSQP routine.

    :param loss: (callable) the function to minimize
    :param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector
    :return: (ndarray) the best prevalence vector found
    """
    from scipy import optimize

    # the initial point is set as the uniform distribution
    uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))

    # solutions are bounded to those contained in the unit-simplex
    bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
    constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
    r = optimize.minimize(loss, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
    return r.x
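Usage of the new helper is straightforward: any scalar loss over prevalence vectors can be plugged in. The snippet below mirrors the routine above so it runs standalone (the toy loss and target vector are illustrative, not part of the library):

import numpy as np
from scipy import optimize

def optim_minimize(loss, n_classes):
    # constrained SLSQP search over the unit simplex, as in the function above
    uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
    bounds = tuple((0, 1) for _ in range(n_classes))
    constraints = {'type': 'eq', 'fun': lambda x: 1 - sum(x)}
    r = optimize.minimize(loss, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
    return r.x

target = np.array([0.7, 0.2, 0.1])
loss = lambda prev: np.sum((prev - target) ** 2)       # toy loss: squared distance to a known prevalence
print(optim_minimize(loss, n_classes=3))               # ~[0.7, 0.2, 0.1]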