diff --git a/KDEyAitchison/commons.py b/KDEyAitchison/commons.py
new file mode 100644
index 0000000..0beaea0
--- /dev/null
+++ b/KDEyAitchison/commons.py
@@ -0,0 +1,52 @@
+import numpy as np
+import pandas as pd
+
+from quapy.method.aggregative import EMQ, KDEyML
+from sklearn.linear_model import LogisticRegression
+
+METHODS = ['EMQ',
+           # 'KDEy-ML',
+           'KDEy-MLA'
+           ]
+
+
+# common hyperparameters
+hyper_LR = {
+    'classifier__C': np.logspace(-3, 3, 7),
+    'classifier__class_weight': ['balanced', None]
+}
+
+hyper_kde = {
+    'bandwidth': np.linspace(0.01, 0.2, 20)
+}
+
+hyper_kde_aitchison = {
+    'bandwidth': np.linspace(0.01, 2, 100)
+}
+
+# instantiates a new quantifier (and its hyperparameter grid) from a string name
+def new_method(method, **lr_kwargs):
+    lr = LogisticRegression(**lr_kwargs)
+
+    if method == 'KDEy-ML':
+        param_grid = {**hyper_kde, **hyper_LR}
+        quantifier = KDEyML(lr, kernel='gaussian')
+    elif method == 'KDEy-MLA':
+        param_grid = {**hyper_kde_aitchison, **hyper_LR}
+        quantifier = KDEyML(lr, kernel='aitchison')
+    elif method == 'EMQ':
+        param_grid = hyper_LR
+        quantifier = EMQ(lr)
+    else:
+        raise NotImplementedError('unknown method', method)
+
+    return param_grid, quantifier
+
+
+def show_results(result_path):
+    df = pd.read_csv(result_path + '.csv', sep='\t')
+
+    pd.set_option('display.max_columns', None)
+    pd.set_option('display.max_rows', None)
+    pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"])
+    print(pv)
\ No newline at end of file
diff --git a/KDEyAitchison/show_results.py b/KDEyAitchison/show_results.py
new file mode 100644
index 0000000..db63e64
--- /dev/null
+++ b/KDEyAitchison/show_results.py
@@ -0,0 +1,26 @@
+import os
+
+import pandas as pd
+
+import quapy as qp
+
+from commons import METHODS
+
+
+if __name__ == '__main__':
+    print(qp.datasets.UCI_MULTICLASS_DATASETS)
+    for optim in ['mae']:
+        result_dir = f'results/ucimulti/{optim}'
+
+        for method in METHODS:
+            print()
+            global_result_path = f'{result_dir}/{method}'
+            print('Method\tDataset\tMAE\tMRAE\tKLD')
+            for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
+                local_result_path = global_result_path + '_' + dataset
+                if os.path.exists(local_result_path + '.dataframe'):
+                    report = pd.read_csv(local_result_path + '.dataframe')
+                    print(f'{method}\t{dataset}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}')
+                else:
+                    print(dataset, 'not found')
+
diff --git a/KDEyAitchison/ucimulti_experiments.py b/KDEyAitchison/ucimulti_experiments.py
new file mode 100644
index 0000000..b882380
--- /dev/null
+++ b/KDEyAitchison/ucimulti_experiments.py
@@ -0,0 +1,94 @@
+import pickle
+import os
+import sys
+
+import pandas as pd
+
+import quapy as qp
+from quapy.model_selection import GridSearchQ
+from quapy.protocol import UPP
+from commons import METHODS, new_method, show_results
+
+
+SEED = 1
+
+
+if __name__ == '__main__':
+
+    qp.environ['SAMPLE_SIZE'] = 500
+    qp.environ['N_JOBS'] = -1
+    n_bags_val = 250
+    n_bags_test = 1000
+    for optim in ['mae']:
+        result_dir = f'results/ucimulti/{optim}'
+
+        os.makedirs(result_dir, exist_ok=True)
+
+        for method in METHODS:
+
+            print('Init method', method)
+
+            global_result_path = f'{result_dir}/{method}'
+            # show_results(global_result_path)
+            # sys.exit(0)
+
+            if not os.path.exists(global_result_path + '.csv'):
+                with open(global_result_path + '.csv', 'wt') as csv:
+                    csv.write('Method\tDataset\tMAE\tMRAE\tKLD\n')
+
+            with open(global_result_path +
+                      '.csv', 'at') as csv:
+
+                for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
+
+                    print('init', dataset)
+
+                    local_result_path = global_result_path + '_' + dataset
+                    if os.path.exists(local_result_path + '.dataframe'):
+                        print(f'result file {local_result_path}.dataframe already exists; skipping')
+                        report = pd.read_csv(local_result_path + '.dataframe')
+                        print(report["mae"].mean())
+                        # data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
+                        # csv.write(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n')
+                        continue
+
+                    with qp.util.temp_seed(SEED):
+
+                        param_grid, quantifier = new_method(method, max_iter=3000)
+
+                        data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
+
+                        # model selection
+                        train, test = data.train_test
+                        train, val = train.split_stratified(random_state=SEED)
+
+                        protocol = UPP(val, repeats=n_bags_val)
+                        modsel = GridSearchQ(
+                            quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=True, error=optim
+                        )
+
+                        try:
+                            modsel.fit(*train.Xy)
+
+                            print(f'best params {modsel.best_params_}')
+                            print(f'best score {modsel.best_score_}')
+                            pickle.dump(
+                                (modsel.best_params_, modsel.best_score_,),
+                                open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
+
+                            quantifier = modsel.best_model()
+                        except Exception as e:
+                            print(f'something went wrong ({e}); fitting the default model instead')
+                            quantifier.fit(*train.Xy)
+
+
+                        protocol = UPP(test, repeats=n_bags_test)
+                        report = qp.evaluation.evaluation_report(
+                            quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True
+                        )
+                        report.to_csv(f'{local_result_path}.dataframe')
+                        print(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}')
+                        csv.write(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n')
+                        csv.flush()
+
+            show_results(global_result_path)
\ No newline at end of file
diff --git a/quapy/method/_kdey.py b/quapy/method/_kdey.py
index 4208e4d..5932c06 100644
--- a/quapy/method/_kdey.py
+++ b/quapy/method/_kdey.py
@@ -11,12 +11,14 @@ import quapy.functional as F
 from sklearn.metrics.pairwise import rbf_kernel
 
 
 class KDEBase:
     """
     Common ancestor for KDE-based methods. Implements some common routines.
     """
 
     BANDWIDTH_METHOD = ['scott', 'silverman']
+    KERNELS = ['gaussian', 'aitchison']
+
 
     @classmethod
     def _check_bandwidth(cls, bandwidth):
@@ -30,31 +32,62 @@ class KDEBase:
             f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
         if isinstance(bandwidth, float):
            assert 0 < bandwidth < 1, \
-                "the bandwith for KDEy should be in (0,1), since this method models the unit simplex"
+                "the bandwidth for KDEy should be in (0,1), since this method models the unit simplex"
         return bandwidth
 
-    def get_kde_function(self, X, bandwidth):
+    @classmethod
+    def _check_kernel(cls, kernel):
+        """
+        Checks that the kernel parameter is valid
+
+        :param kernel: str, the name of the kernel
+        :return: the validated kernel
+        """
+        assert kernel in KDEBase.KERNELS, f'unknown {kernel=}'
+        return kernel
+
+    @classmethod
+    def clr_transform(cls, P, eps=1e-7):
+        """
+        Centered log-ratio (CLR) transform.
+        :param P: array of shape (n_samples, n_classes); every row is a point in the probability simplex
+        :param eps: small smoothing constant that avoids taking log(0)
+        """
+        X_safe = np.clip(P, eps, None)
+        X_safe /= X_safe.sum(axis=1, keepdims=True)  # renormalize after clipping
+        gm = np.exp(np.mean(np.log(X_safe), axis=1, keepdims=True))  # per-row geometric mean
+        return np.log(X_safe / gm)
+
+    def get_kde_function(self, X, bandwidth, kernel):
         """
         Wraps the KDE function from scikit-learn.
 
         :param X: data for which the density function is to be estimated
         :param bandwidth: the bandwidth of the kernel
+        :param kernel: the kernel (valid ones are in KDEBase.KERNELS)
         :return: a scikit-learn's KernelDensity object
         """
+        if kernel == 'aitchison':
+            X = KDEBase.clr_transform(X)
+
         return KernelDensity(bandwidth=bandwidth).fit(X)
 
-    def pdf(self, kde, X):
+    def pdf(self, kde, X, kernel):
         """
         Wraps the density evaluation of scikit-learn's KDE. Scikit-learn returns log-scores (s), so this function returns :math:`e^{s}`
 
         :param kde: a previously fit KDE function
         :param X: the data for which the density is to be estimated
+        :param kernel: the kernel (valid ones are in KDEBase.KERNELS)
         :return: np.ndarray with the densities
         """
+        if kernel == 'aitchison':
+            X = KDEBase.clr_transform(X)
+
         return np.exp(kde.score_samples(X))
 
-    def get_mixture_components(self, X, y, classes, bandwidth):
+    def get_mixture_components(self, X, y, classes, bandwidth, kernel):
         """
         Returns an array containing the mixture components, i.e., the KDE functions for each class.
 
@@ -62,6 +95,7 @@ class KDEBase:
         :param y: the class labels
         :param classes: the list of classes
         :param bandwidth: float, the bandwidth of the kernel
+        :param kernel: the kernel (valid ones are in KDEBase.KERNELS)
         :return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates
         """
         class_cond_X = []
@@ -69,8 +103,9 @@ class KDEBase:
             selX = X[y==cat]
             if selX.size==0:
                 selX = [F.uniform_prevalence(len(classes))]
+            # note: for the 'aitchison' kernel, the CLR transform is applied within get_kde_function
             class_cond_X.append(selX)
-        return [self.get_kde_function(X_cond_yi, bandwidth) for X_cond_yi in class_cond_X]
+        return [self.get_kde_function(X_cond_yi, bandwidth, kernel) for X_cond_yi in class_cond_X]
 
 
 class KDEyML(AggregativeSoftQuantifier, KDEBase):
@@ -109,17 +144,21 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
         are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value for `k`);
         or as a tuple (X,y) defining the specific set of data to use for validation.
     :param bandwidth: float, the bandwidth of the kernel
+    :param kernel: str, the kernel of the KDE (valid ones are listed in KDEBase.KERNELS)
     :param random_state: a seed to be set before fitting any base quantifier (default None)
     """
 
-    def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1,
+    def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1, kernel='gaussian',
                  random_state=None):
         super().__init__(classifier, fit_classifier, val_split)
-        self.bandwidth = KDEBase._check_bandwidth(bandwidth)
+        self.kernel = self._check_kernel(kernel)
+        # the (0,1) range check in _check_bandwidth targets the unit simplex; in CLR space
+        # (the 'aitchison' kernel) larger bandwidths are valid, hence the check is skipped
+        self.bandwidth = bandwidth if self.kernel == 'aitchison' else KDEBase._check_bandwidth(bandwidth)
         self.random_state=random_state
 
     def aggregation_fit(self, classif_predictions, labels):
-        self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth)
+        self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth, self.kernel)
         return self
 
     def aggregate(self, posteriors: np.ndarray):
@@ -133,7 +172,7 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
         with qp.util.temp_seed(self.random_state):
             epsilon = 1e-10
             n_classes = len(self.mix_densities)
-            test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities]
+            test_densities = [self.pdf(kde_i, posteriors, self.kernel) for kde_i in self.mix_densities]
 
             def neg_loglikelihood(prev):
                 test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
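
Reviewer note: the snippet below is a minimal, self-contained sketch (not part of the patch) of what kernel='aitchison' does, namely mapping points on the probability simplex to centered log-ratio (CLR) space and fitting an ordinary Gaussian KDE there. Only numpy and scikit-learn are used; all names (clr, train, test) are local to the sketch and hypothetical, not part of quapy's API.

import numpy as np
from sklearn.neighbors import KernelDensity

def clr(P, eps=1e-7):
    # clip away zeros, renormalize the rows, then divide by the per-row geometric mean
    P = np.clip(P, eps, None)
    P = P / P.sum(axis=1, keepdims=True)
    gm = np.exp(np.log(P).mean(axis=1, keepdims=True))
    return np.log(P / gm)

rng = np.random.default_rng(0)
train = rng.dirichlet([2., 3., 5.], size=100)  # toy posteriors on the 3-class simplex
test = rng.dirichlet([2., 3., 5.], size=10)

kde = KernelDensity(bandwidth=0.5).fit(clr(train))  # Gaussian KDE in CLR space
densities = np.exp(kde.score_samples(clr(test)))    # e^(log-score), as in pdf() above
print(densities.shape)  # (10,)

# end-to-end, the same effect is obtained through the new parameter introduced by this patch:
#     KDEyML(LogisticRegression(), kernel='aitchison', bandwidth=0.5)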