experiment

2025-09-27 17:41:12 +02:00 · 2025-09-27 17:41:12 +02:00 · c3fd92efde
parent 636e33318f
commit c3fd92efde
4 changed files with 242 additions and 8 deletions
--- a/KDEyAitchison/commons.py
+++ b/KDEyAitchison/commons.py
@ -0,0 +1,52 @@
 import numpy as np
 import pandas as pd
 from quapy.method.aggregative import EMQ, KDEyML
 from sklearn.linear_model import LogisticRegression
 METHODS = ['EMQ',
           # 'KDEy-ML',
           'KDEy-MLA'
           ]
 # common hyperparameters
 hyper_LR = {
    'classifier__C': np.logspace(-3, 3, 7),
    'classifier__class_weight': ['balanced', None]
 }
 hyper_kde = {
    'bandwidth': np.linspace(0.01, 0.2, 20)
 }
 hyper_kde_aitchison = {
    'bandwidth': np.linspace(0.01, 2, 100)
 }
 # instantiates a new quantifier based on a string name
 def new_method(method, **lr_kwargs):
    """
    Instantiates a quantifier, together with the hyperparameter grid to explore for it, from a method name.

    :param method: str, one of 'KDEy-ML' (KDEyML with the gaussian kernel), 'KDEy-MLA' (KDEyML with the
        aitchison kernel), or 'EMQ'
    :param lr_kwargs: keyword arguments forwarded to the constructor of LogisticRegression
    :return: a tuple `(param_grid, quantifier)`; `param_grid` is always a new dict
    :raises NotImplementedError: if `method` is not one of the names listed above
    """
    lr = LogisticRegression(**lr_kwargs)
    if method == 'KDEy-ML':
        param_grid = {**hyper_kde, **hyper_LR}
        quantifier = KDEyML(lr, kernel='gaussian')
    elif method == 'KDEy-MLA':
        param_grid = {**hyper_kde_aitchison, **hyper_LR}
        quantifier = KDEyML(lr, kernel='aitchison')
    elif method == 'EMQ':
        # copy the grid, as the other branches do, so that a caller modifying the returned dict
        # cannot alter the module-level hyper_LR shared by all methods
        param_grid = dict(hyper_LR)
        quantifier = EMQ(lr)
    else:
        raise NotImplementedError(f'unknown method {method!r}')
    return param_grid, quantifier
 def show_results(result_path):
    """
    Prints a pivot table (datasets as rows, methods as columns) with the MAE and MRAE values
    stored in a results file.

    :param result_path: str, path to the tab-separated results file, without the '.csv' extension
    :return: None
    """
    df = pd.read_csv(result_path + '.csv', sep='\t')
    pv = df.pivot_table(index='Dataset', columns='Method', values=['MAE', 'MRAE'])
    # lift the display limits only while printing, so that the global pandas options
    # of the calling program are left untouched
    with pd.option_context('display.max_columns', None, 'display.max_rows', None):
        print(pv)
--- a/KDEyAitchison/show_results.py
+++ b/KDEyAitchison/show_results.py
@ -0,0 +1,34 @@
 import pickle
 import os
 import sys
 import pandas as pd
 import quapy as qp
 from quapy.model_selection import GridSearchQ
 from quapy.protocol import UPP
 from commons import METHODS, new_method, show_results
 SEED = 1
 if __name__ == '__main__':
    # Prints, for every optimization criterion and every method, the mean errors found in the
    # per-dataset evaluation reports ('<result_dir>/<method>_<dataset>.dataframe').
    print(qp.datasets.UCI_MULTICLASS_DATASETS)
    for optim in ['mae']:
        result_dir = f'results/ucimulti/{optim}'
        for method in METHODS:
            print()
            global_result_path = f'{result_dir}/{method}'
            print('Method\tDataset\tMAE\tMRAE\tKLD')
            for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
                print(dataset)
                local_result_path = global_result_path + '_' + dataset
                if os.path.exists(local_result_path + '.dataframe'):
                    report = pd.read_csv(local_result_path+'.dataframe')
                    # one cell per column announced in the header; a metric that is missing
                    # from the report is shown as 'n/a' instead of aborting the listing
                    cells = [
                        f'{report[metric].mean():.5f}' if metric in report.columns else 'n/a'
                        for metric in ('mae', 'mrae', 'kld')
                    ]
                    print('\t'.join([method, dataset] + cells))
                else:
                    print(dataset, 'not found')
--- a/KDEyAitchison/ucimulti_experiments.py
+++ b/KDEyAitchison/ucimulti_experiments.py
@ -0,0 +1,94 @@
 import pickle
 import os
 import sys
 import pandas as pd
 import quapy as qp
 from quapy.model_selection import GridSearchQ
 from quapy.protocol import UPP
 from commons import METHODS, new_method, show_results
 SEED = 1
 if __name__ == '__main__':
    # Runs model selection and evaluation for every method in METHODS on every UCI multiclass dataset.
    # For each (method, dataset) pair it writes the evaluation report ('<path>.dataframe') and, when model
    # selection succeeds, the best hyperparameters ('<path>.hyper.pkl'); one summary row per pair is
    # appended to the csv file of the method. Pairs whose report already exists are skipped.
    qp.environ['SAMPLE_SIZE'] = 500
    qp.environ['N_JOBS'] = -1
    n_bags_val = 250
    n_bags_test = 1000
    for optim in ['mae']:
        result_dir = f'results/ucimulti/{optim}'
        os.makedirs(result_dir, exist_ok=True)
        for method in METHODS:
            print('Init method', method)
            global_result_path = f'{result_dir}/{method}'
            # show_results(global_result_path)
            # sys.exit(0)
            # the header is written only the first time the csv file is created
            if not os.path.exists(global_result_path + '.csv'):
                with open(global_result_path + '.csv', 'wt') as csv:
                    csv.write('Method\tDataset\tMAE\tMRAE\tKLD\n')
            with open(global_result_path + '.csv', 'at') as csv:
                for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
                    print('init', dataset)
                    local_result_path = global_result_path + '_' + dataset
                    if os.path.exists(local_result_path + '.dataframe'):
                        print(f'result file {local_result_path}.dataframe already exist; skipping')
                        report = pd.read_csv(local_result_path+'.dataframe')
                        print(report["mae"].mean())
                        # data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
                        # csv.write(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n')
                        continue
                    with qp.util.temp_seed(SEED):
                        param_grid, quantifier = new_method(method, max_iter=3000)
                        data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
                        # model selection
                        train, test = data.train_test
                        train, val = train.split_stratified(random_state=SEED)
                        protocol = UPP(val, repeats=n_bags_val)
                        modsel = GridSearchQ(
                            quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=True, error=optim
                        )
                        try:
                            modsel.fit(*train.Xy)
                            print(f'best params {modsel.best_params_}')
                            print(f'best score {modsel.best_score_}')
                            # the context manager guarantees that the pickle file is flushed and closed
                            with open(f'{local_result_path}.hyper.pkl', 'wb') as hyper_file:
                                pickle.dump(
                                    (modsel.best_params_, modsel.best_score_,),
                                    hyper_file, pickle.HIGHEST_PROTOCOL)
                            quantifier = modsel.best_model()
                        except Exception as e:
                            # best-effort fallback: Exception (instead of a bare except) lets
                            # KeyboardInterrupt and SystemExit stop the experiment, and the cause is reported
                            print('something went wrong... trying to fit the default model')
                            print(f'\tcause: {e!r}')
                            quantifier.fit(*train.Xy)
                        protocol = UPP(test, repeats=n_bags_test)
                        report = qp.evaluation.evaluation_report(
                            quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True
                        )
                        report.to_csv(f'{local_result_path}.dataframe')
                        print(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n')
                        csv.write(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n')
                        csv.flush()
        # NOTE(review): this call sits after the loop over the methods, so only the results of the
        # last method are shown -- confirm that this is intended
        show_results(global_result_path)
--- a/quapy/method/_kdey.py
+++ b/quapy/method/_kdey.py
@ -11,12 +11,28 @@ import quapy.functional as F
 from sklearn.metrics.pairwise import rbf_kernel
 # class KDE(KernelDensity):
 #
 #     KERNELS = ['gaussian', 'aitchison']
 #
 #     def __init__(self, bandwidth, kernel):
 #         assert kernel in KDE.KERNELS, f'unknown {kernel=}'
 #         self.bandwidth = bandwidth
 #         self.kernel = kernel
 #
 #     def
 class KDEBase:
    """
    Common ancestor for KDE-based methods. Implements some common routines.
    """
    BANDWIDTH_METHOD = ['scott', 'silverman']
    KERNELS = ['gaussian', 'aitchison']
    @classmethod
    def _check_bandwidth(cls, bandwidth):
@ -30,31 +46,62 @@ class KDEBase:
            f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
        if isinstance(bandwidth, float):
            assert 0 < bandwidth < 1,  \
-                "the bandwith for KDEy should be in (0,1), since this method models the unit simplex"
+                "the bandwidth for KDEy should be in (0,1), since this method models the unit simplex"
        return bandwidth
-    def get_kde_function(self, X, bandwidth):
+    @classmethod
    def _check_kernel(cls, kernel):
        """
        Checks that the kernel parameter is correct
        :param kernel: str
        :return: the validated kernel
        """
        assert kernel in KDEBase.KERNELS, f'unknown {kernel=}'
        return kernel
    @classmethod
    def clr_transform(cls, P, eps=1e-7):
        """
        Centered-Log Ratio (CLR) transform.
        P: array (n_samples, n_classes), every row is a point in the probability simplex
        eps: smoothing, to avoid log(0)
        """
        X_safe = np.clip(P, eps, None)
        X_safe /= X_safe.sum(axis=1, keepdims=True)  # renormalize
        gm = np.exp(np.mean(np.log(X_safe), axis=1, keepdims=True))
        return np.log(X_safe / gm)
    def get_kde_function(self, X, bandwidth, kernel):
        """
        Fits a scikit-learn KernelDensity estimator on the given data. With the 'aitchison' kernel,
        the data is CLR-transformed before the estimator is fit.

        :param X: data for which the density function is to be estimated
        :param bandwidth: the bandwidth of the kernel
        :param kernel: the kernel (valid ones are in KDEBase.KERNELS)
        :return: a fitted scikit-learn's KernelDensity object
        """
        fit_data = KDEBase.clr_transform(X) if kernel == 'aitchison' else X
        density = KernelDensity(bandwidth=bandwidth)
        return density.fit(fit_data)
-    def pdf(self, kde, X):
+    def pdf(self, kde, X, kernel):
        """
        Wraps the density evaluation of scikit-learn's KDE. Scikit-learn returns log-scores (s), so this
        function returns :math:`e^{s}`
        :param kde: a previously fit KDE function
        :param X: the data for which the density is to be estimated
        :param kernel: the kernel (valid ones are in KDEBase.KERNELS)
        :return: np.ndarray with the densities
        """
        if kernel == 'aitchison':
            X = KDEBase.clr_transform(X)
        return np.exp(kde.score_samples(X))
-    def get_mixture_components(self, X, y, classes, bandwidth):
+    def get_mixture_components(self, X, y, classes, bandwidth, kernel):
        """
        Returns an array containing the mixture components, i.e., the KDE functions for each class.
@ -62,6 +109,7 @@ class KDEBase:
        :param y: the class labels
        :param classes: the classes; one mixture component (KDE) is returned for each of them
        :param bandwidth: float, the bandwidth of the kernel
        :param kernel: the kernel (valid ones are in KDEBase.KERNELS)
        :return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates
        """
        class_cond_X = []
@ -69,8 +117,12 @@ class KDEBase:
            selX = X[y==cat]
            if selX.size==0:
                selX = [F.uniform_prevalence(len(classes))]
            # if kernel == 'aitchison':
            # this is already done within get_kde_function
            #     selX = KDEBase.clr_transform(selX)
            class_cond_X.append(selX)
-        return [self.get_kde_function(X_cond_yi, bandwidth) for X_cond_yi in class_cond_X]
+        return [self.get_kde_function(X_cond_yi, bandwidth, kernel) for X_cond_yi in class_cond_X]
 class KDEyML(AggregativeSoftQuantifier, KDEBase):
@ -109,17 +161,19 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
        are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
        for `k`); or as a tuple (X,y) defining the specific set of data to use for validation.
    :param bandwidth: float, the bandwidth of the Kernel
    :param kernel: kernel of KDE, valid ones are in KDEBase.KERNELS
    :param random_state: a seed to be set before fitting any base quantifier (default None)
    """
-    def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1,
+    def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1, kernel='gaussian',
                 random_state=None):
        super().__init__(classifier, fit_classifier, val_split)
        self.bandwidth = KDEBase._check_bandwidth(bandwidth)
        self.kernel = self._check_kernel(kernel)
        self.random_state=random_state
    def aggregation_fit(self, classif_predictions, labels):
-        self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth)
+        self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth, self.kernel)
        return self
    def aggregate(self, posteriors: np.ndarray):
@ -133,7 +187,7 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
        with qp.util.temp_seed(self.random_state):
            epsilon = 1e-10
            n_classes = len(self.mix_densities)
-            test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities]
+            test_densities = [self.pdf(kde_i, posteriors, self.kernel) for kde_i in self.mix_densities]
            def neg_loglikelihood(prev):
                test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))