experiment

2025-09-27 17:41:12 +02:00 · 2025-09-27 17:41:12 +02:00 · c3fd92efde
parent 636e33318f
commit c3fd92efde
4 changed files with 242 additions and 8 deletions
--- a/KDEyAitchison/commons.py
+++ b/KDEyAitchison/commons.py
@ -0,0 +1,52 @@
+import numpy as np
+import pandas as pd
+
+from quapy.method.aggregative import EMQ, KDEyML
+from sklearn.linear_model import LogisticRegression
+
+METHODS = ['EMQ',
+           # 'KDEy-ML',
+           'KDEy-MLA'
+           ]
+
+
+# common hyperparameterss
+hyper_LR = {
+    'classifier__C': np.logspace(-3, 3, 7),
+    'classifier__class_weight': ['balanced', None]
+}
+
+hyper_kde = {
+    'bandwidth': np.linspace(0.01, 0.2, 20)
+}
+
+hyper_kde_aitchison = {
+    'bandwidth': np.linspace(0.01, 2, 100)
+}
+
+# instances a new quantifier based on a string name
+def new_method(method, **lr_kwargs):
+    lr = LogisticRegression(**lr_kwargs)
+
+    if method == 'KDEy-ML':
+        param_grid = {**hyper_kde, **hyper_LR}
+        quantifier = KDEyML(lr, kernel='gaussian')
+    elif method == 'KDEy-MLA':
+        param_grid = {**hyper_kde_aitchison, **hyper_LR}
+        quantifier = KDEyML(lr, kernel='aitchison')
+    elif method == 'EMQ':
+        param_grid = hyper_LR
+        quantifier = EMQ(lr)
+    else:
+        raise NotImplementedError('unknown method', method)
+
+    return param_grid, quantifier
+
+
+def show_results(result_path):
+    df = pd.read_csv(result_path+'.csv', sep='\t')
+
+    pd.set_option('display.max_columns', None)
+    pd.set_option('display.max_rows', None)
+    pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"])
+    print(pv)
--- a/KDEyAitchison/show_results.py
+++ b/KDEyAitchison/show_results.py
@ -0,0 +1,34 @@
+import pickle
+import os
+import sys
+
+import pandas as pd
+
+import quapy as qp
+from quapy.model_selection import GridSearchQ
+from quapy.protocol import UPP
+from commons import METHODS, new_method, show_results
+
+
+SEED = 1
+
+
+
+if __name__ == '__main__':
+    print(qp.datasets.UCI_MULTICLASS_DATASETS)
+    for optim in ['mae']:
+        result_dir = f'results/ucimulti/{optim}'
+
+        for method in METHODS:
+            print()
+            global_result_path = f'{result_dir}/{method}'
+            print(f'Method\tDataset\tMAE\tMRAE\tKLD')
+            for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
+                print(dataset)
+                local_result_path = global_result_path + '_' + dataset
+                if os.path.exists(local_result_path + '.dataframe'):
+                    report = pd.read_csv(local_result_path+'.dataframe')
+                    print(f'{method}\t{dataset}\t{report["mae"].mean():.5f}')
+                else:
+                    print(dataset, 'not found')
+
--- a/KDEyAitchison/ucimulti_experiments.py
+++ b/KDEyAitchison/ucimulti_experiments.py
@ -0,0 +1,94 @@
+import pickle
+import os
+import sys
+
+import pandas as pd
+
+import quapy as qp
+from quapy.model_selection import GridSearchQ
+from quapy.protocol import UPP
+from commons import METHODS, new_method, show_results
+
+
+SEED = 1
+
+
+
+if __name__ == '__main__':
+
+    qp.environ['SAMPLE_SIZE'] = 500
+    qp.environ['N_JOBS'] = -1
+    n_bags_val = 250
+    n_bags_test = 1000
+    for optim in ['mae']:
+        result_dir = f'results/ucimulti/{optim}'
+
+        os.makedirs(result_dir, exist_ok=True)
+
+        for method in METHODS:
+
+            print('Init method', method)
+
+            global_result_path = f'{result_dir}/{method}'
+            # show_results(global_result_path)
+            # sys.exit(0)
+
+            if not os.path.exists(global_result_path + '.csv'):
+                with open(global_result_path + '.csv', 'wt') as csv:
+                    csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
+
+            with open(global_result_path + '.csv', 'at') as csv:
+
+                for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
+
+                    print('init', dataset)
+
+                    local_result_path = global_result_path + '_' + dataset
+                    if os.path.exists(local_result_path + '.dataframe'):
+                        print(f'result file {local_result_path}.dataframe already exist; skipping')
+                        report = pd.read_csv(local_result_path+'.dataframe')
+                        print(report["mae"].mean())
+                        # data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
+                        # csv.write(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n')
+                        continue
+
+                    with qp.util.temp_seed(SEED):
+
+                        param_grid, quantifier = new_method(method, max_iter=3000)
+
+                        data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
+
+                        # model selection
+                        train, test = data.train_test
+                        train, val = train.split_stratified(random_state=SEED)
+
+                        protocol = UPP(val, repeats=n_bags_val)
+                        modsel = GridSearchQ(
+                            quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=True, error=optim
+                        )
+
+                        try:
+                            modsel.fit(*train.Xy)
+
+                            print(f'best params {modsel.best_params_}')
+                            print(f'best score {modsel.best_score_}')
+                            pickle.dump(
+                                (modsel.best_params_, modsel.best_score_,),
+                                open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
+
+                            quantifier = modsel.best_model()
+                        except:
+                            print('something went wrong... trying to fit the default model')
+                            quantifier.fit(*train.Xy)
+
+
+                        protocol = UPP(test, repeats=n_bags_test)
+                        report = qp.evaluation.evaluation_report(
+                            quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True
+                        )
+                        report.to_csv(f'{local_result_path}.dataframe')
+                        print(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n')
+                        csv.write(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n')
+                        csv.flush()
+
+        show_results(global_result_path)
--- a/quapy/method/_kdey.py
+++ b/quapy/method/_kdey.py
@ -11,12 +11,28 @@ import quapy.functional as F
 from sklearn.metrics.pairwise import rbf_kernel


+# class KDE(KernelDensity):
+#
+#     KERNELS = ['gaussian', 'aitchison']
+#
+#     def __init__(self, bandwidth, kernel):
+#         assert kernel in KDE.KERNELS, f'unknown {kernel=}'
+#         self.bandwidth = bandwidth
+#         self.kernel = kernel
+#
+#     def
+
+
+
+
 class KDEBase:
    """
    Common ancestor for KDE-based methods. Implements some common routines.
    """

    BANDWIDTH_METHOD = ['scott', 'silverman']
+    KERNELS = ['gaussian', 'aitchison']
+

    @classmethod
    def _check_bandwidth(cls, bandwidth):
@ -30,31 +46,62 @@ class KDEBase:
            f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
        if isinstance(bandwidth, float):
            assert 0 < bandwidth < 1,  \
-                "the bandwith for KDEy should be in (0,1), since this method models the unit simplex"
+                "the bandwidth for KDEy should be in (0,1), since this method models the unit simplex"
        return bandwidth

-    def get_kde_function(self, X, bandwidth):
+    @classmethod
+    def _check_kernel(cls, kernel):
+        """
+        Checks that the kernel parameter is correct
+
+        :param kernel: str
+        :return: the validated kernel
+        """
+        assert kernel in KDEBase.KERNELS, f'unknown {kernel=}'
+        return kernel
+
+    @classmethod
+    def clr_transform(cls, P, eps=1e-7):
+        """
+        Centered-Log Ratio (CLR) transform.
+        P: array (n_samples, n_classes), every row is a point in the probability simplex
+        eps: smoothing, to avoid log(0)
+        """
+        X_safe = np.clip(P, eps, None)
+        X_safe /= X_safe.sum(axis=1, keepdims=True)  # renormalize
+        gm = np.exp(np.mean(np.log(X_safe), axis=1, keepdims=True))
+        return np.log(X_safe / gm)
+
+    def get_kde_function(self, X, bandwidth, kernel):
        """
        Wraps the KDE function from scikit-learn.

        :param X: data for which the density function is to be estimated
        :param bandwidth: the bandwidth of the kernel
+        :param kernel: the kernel (valid ones are in KDEBase.KERNELS)
        :return: a scikit-learn's KernelDensity object
        """
+        if kernel == 'aitchison':
+            X = KDEBase.clr_transform(X)
+
        return KernelDensity(bandwidth=bandwidth).fit(X)

-    def pdf(self, kde, X):
+    def pdf(self, kde, X, kernel):
        """
        Wraps the density evalution of scikit-learn's KDE. Scikit-learn returns log-scores (s), so this
        function returns :math:`e^{s}`

        :param kde: a previously fit KDE function
        :param X: the data for which the density is to be estimated
+        :param kernel: the kernel (valid ones are in KDEBase.KERNELS)
        :return: np.ndarray with the densities
        """
+        if kernel == 'aitchison':
+            X = KDEBase.clr_transform(X)
+
        return np.exp(kde.score_samples(X))

-    def get_mixture_components(self, X, y, classes, bandwidth):
+    def get_mixture_components(self, X, y, classes, bandwidth, kernel):
        """
        Returns an array containing the mixture components, i.e., the KDE functions for each class.

@ -62,6 +109,7 @@ class KDEBase:
        :param y: the class labels
        :param n_classes: integer, the number of classes
        :param bandwidth: float, the bandwidth of the kernel
+        :param kernel: the kernel (valid ones are in KDEBase.KERNELS)
        :return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates
        """
        class_cond_X = []
@ -69,8 +117,12 @@ class KDEBase:
            selX = X[y==cat]
            if selX.size==0:
                selX = [F.uniform_prevalence(len(classes))]
+
+            # if kernel == 'aitchison':
+            # this is already done within get_kde_function
+            #     selX = KDEBase.clr_transform(selX)
            class_cond_X.append(selX)
-        return [self.get_kde_function(X_cond_yi, bandwidth) for X_cond_yi in class_cond_X]
+        return [self.get_kde_function(X_cond_yi, bandwidth, kernel) for X_cond_yi in class_cond_X]


 class KDEyML(AggregativeSoftQuantifier, KDEBase):
@ -109,17 +161,19 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
        are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
        for `k`); or as a tuple (X,y) defining the specific set of data to use for validation.
    :param bandwidth: float, the bandwidth of the Kernel
+    :param kernel: kernel of KDE, valid ones are in KDEBase.KERNELS
    :param random_state: a seed to be set before fitting any base quantifier (default None)
    """

-    def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1,
+    def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1, kernel='gaussian',
                 random_state=None):
        super().__init__(classifier, fit_classifier, val_split)
        self.bandwidth = KDEBase._check_bandwidth(bandwidth)
+        self.kernel = self._check_kernel(kernel)
        self.random_state=random_state

    def aggregation_fit(self, classif_predictions, labels):
-        self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth)
+        self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth, self.kernel)
        return self

    def aggregate(self, posteriors: np.ndarray):
@ -133,7 +187,7 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
        with qp.util.temp_seed(self.random_state):
            epsilon = 1e-10
            n_classes = len(self.mix_densities)
-            test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities]
+            test_densities = [self.pdf(kde_i, posteriors, self.kernel) for kde_i in self.mix_densities]

            def neg_loglikelihood(prev):
                test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))