diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..89cf11c --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "result_table"] + path = result_table + url = gitea@gitea-s2i2s.isti.cnr.it:moreo/result_table.git diff --git a/KDEy/constants.py b/KDEy/constants.py new file mode 100644 index 0000000..fafac2d --- /dev/null +++ b/KDEy/constants.py @@ -0,0 +1,2 @@ + +DEBUG = False \ No newline at end of file diff --git a/KDEy/experiments.py b/KDEy/experiments.py index 524e7a3..d952547 100644 --- a/KDEy/experiments.py +++ b/KDEy/experiments.py @@ -5,13 +5,14 @@ import numpy as np from sklearn.linear_model import LogisticRegression from os.path import join import quapy as qp +from quapy.method.aggregative import KDEyML from quapy.protocol import UPP -from kdey_devel import KDEyML -from utils import measuretime +from kdey_devel import KDEyMLauto +from utils import * +from constants import * +import quapy.functional as F -DEBUG = True - qp.environ["SAMPLE_SIZE"] = 100 if DEBUG else 500 val_repeats = 100 if DEBUG else 500 test_repeats = 100 if DEBUG else 500 @@ -27,7 +28,7 @@ if DEBUG: def datasets(): - dataset_list = qp.datasets.UCI_MULTICLASS_DATASETS + dataset_list = qp.datasets.UCI_MULTICLASS_DATASETS[:4] if DEBUG: dataset_list = dataset_list[:4] for dataset_name in dataset_list: @@ -58,11 +59,21 @@ def predict_b_modsel(dataset): # kdey.qua return modsel_choice +@measuretime +def predict_b_kdeymlauto(dataset): + # bandwidth chosen during model selection in validation + train, test = dataset.train_test + kdey = KDEyMLauto(random_state=0) + print(f'true-prevalence: {F.strprev(test.prevalence())}') + chosen_bandwidth, _ = kdey.chose_bandwidth(train, test.X) + auto_bandwidth = float(chosen_bandwidth) + return auto_bandwidth + def in_test_search(dataset, n_jobs=-1): train, test = dataset.train_test - print(f"testing KDEy in {dataset.name}") + print(f"generating true tests scores using KDEy in {dataset.name}") def experiment_job(bandwidth): kdey = KDEyML(bandwidth=bandwidth, random_state=0) @@ -76,50 +87,8 @@ def in_test_search(dataset, n_jobs=-1): return dataset_results, bandwidth_range -def plot_bandwidth(dataset_name, test_results, bandwidths, triplet_list_results): - import matplotlib.pyplot as plt - print("PLOT", dataset_name) - print(dataset_name) - plt.figure(figsize=(8, 6)) - - # show test results - plt.plot(bandwidths, test_results, marker='o') - - for (method_name, method_choice, method_time) in triplet_list_results: - plt.axvline(x=method_choice, linestyle='--', label=method_name) - - # Agregar etiquetas y título - plt.xlabel('Bandwidth') - plt.ylabel('MAE') - plt.title(dataset_name) - - # Mostrar la leyenda - plt.legend(loc='center left', bbox_to_anchor=(1, 0.5)) - - # Mostrar la gráfica - plt.grid(True) - - plotdir = './plots' - if DEBUG: - plotdir = './plots_debug' - os.makedirs(plotdir, exist_ok=True) - plt.tight_layout() - plt.savefig(f'{plotdir}/{dataset_name}.png') - plt.close() - -def error_table(dataset_name, test_results, bandwidth_range, triplet_list_results): - best_bandwidth = bandwidth_range[np.argmin(test_results)] - print(f'Method\tChoice\tAE\tTime') - for method_name, method_choice, took in triplet_list_results: - if method_choice in bandwidth_range: - index = np.where(bandwidth_range == method_choice)[0][0] - method_score = test_results[index] - else: - method_score = 1 - error = np.abs(best_bandwidth-method_score) - print(f'{method_name}\t{method_choice}\t{error}\t{took:.3}s') for dataset in datasets(): @@ -138,6 +107,8 @@ for dataset in datasets(): triplet_list_results = [] modsel_choice, modsel_time = qp.util.pickled_resource(join(result_path, 'modsel.pkl'), predict_b_modsel, dataset) triplet_list_results.append(('modsel', modsel_choice, modsel_time,)) + auto_choice, auto_time = qp.util.pickled_resource(join(result_path, 'auto.pkl'), predict_b_kdeymlauto, dataset) + triplet_list_results.append(('auto', auto_choice, auto_time,)) print(f'Dataset = {dataset.name}') print(modsel_choice) diff --git a/KDEy/kdey_devel.py b/KDEy/kdey_devel.py index a9b196b..4cc93cb 100644 --- a/KDEy/kdey_devel.py +++ b/KDEy/kdey_devel.py @@ -1,358 +1,171 @@ -from typing import Union +from typing import Union, Callable import numpy as np from sklearn.base import BaseEstimator from sklearn.neighbors import KernelDensity import quapy as qp from quapy.data import LabelledCollection -from quapy.method.aggregative import AggregativeSoftQuantifier +from quapy.method.aggregative import AggregativeSoftQuantifier, KDEyML import quapy.functional as F from sklearn.metrics.pairwise import rbf_kernel +from scipy import optimize -class KDEBase: - """ - Common ancestor for KDE-based methods. Implements some common routines. - """ - - BANDWIDTH_METHOD = ['scott', 'silverman'] - - @classmethod - def _check_bandwidth(cls, bandwidth): - """ - Checks that the bandwidth parameter is correct - - :param bandwidth: either a string (see BANDWIDTH_METHOD) or a float - :return: the bandwidth if the check is passed, or raises an exception for invalid values - """ - assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \ - f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values' - if isinstance(bandwidth, float): - assert 0 < bandwidth < 1, \ - "the bandwith for KDEy should be in (0,1), since this method models the unit simplex" - return bandwidth - - def get_kde_function(self, X, bandwidth): - """ - Wraps the KDE function from scikit-learn. - - :param X: data for which the density function is to be estimated - :param bandwidth: the bandwidth of the kernel - :return: a scikit-learn's KernelDensity object - """ - return KernelDensity(bandwidth=bandwidth).fit(X) - - def pdf(self, kde, X): - """ - Wraps the density evalution of scikit-learn's KDE. Scikit-learn returns log-scores (s), so this - function returns :math:`e^{s}` - - :param kde: a previously fit KDE function - :param X: the data for which the density is to be estimated - :return: np.ndarray with the densities - """ - return np.exp(kde.score_samples(X)) - - def get_mixture_components(self, X, y, classes, bandwidth): - """ - Returns an array containing the mixture components, i.e., the KDE functions for each class. - - :param X: the data containing the covariates - :param y: the class labels - :param n_classes: integer, the number of classes - :param bandwidth: float, the bandwidth of the kernel - :return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates - """ - class_cond_X = [] - for cat in classes: - selX = X[y == cat] - if selX.size == 0: - selX = [F.uniform_prevalence(len(classes))] - class_cond_X.append(selX) - return [self.get_kde_function(X_cond_yi, bandwidth) for X_cond_yi in class_cond_X] -class KDEyML(AggregativeSoftQuantifier, KDEBase): - """ - Kernel Density Estimation model for quantification (KDEy) relying on the Kullback-Leibler divergence (KLD) as - the divergence measure to be minimized. This method was first proposed in the paper - `Kernel Density Estimation for Multiclass Quantification `_, in which - the authors show that minimizing the distribution mathing criterion for KLD is akin to performing - maximum likelihood (ML). - - The distribution matching optimization problem comes down to solving: - - :math:`\\hat{\\alpha} = \\arg\\min_{\\alpha\\in\\Delta^{n-1}} \\mathcal{D}(\\boldsymbol{p}_{\\alpha}||q_{\\widetilde{U}})` - - where :math:`p_{\\alpha}` is the mixture of class-specific KDEs with mixture parameter (hence class prevalence) - :math:`\\alpha` defined by - - :math:`\\boldsymbol{p}_{\\alpha}(\\widetilde{x}) = \\sum_{i=1}^n \\alpha_i p_{\\widetilde{L}_i}(\\widetilde{x})` - - where :math:`p_X(\\boldsymbol{x}) = \\frac{1}{|X|} \\sum_{x_i\\in X} K\\left(\\frac{x-x_i}{h}\\right)` is the - KDE function that uses the datapoints in X as the kernel centers. - - In KDEy-ML, the divergence is taken to be the Kullback-Leibler Divergence. This is equivalent to solving: - :math:`\\hat{\\alpha} = \\arg\\min_{\\alpha\\in\\Delta^{n-1}} - - \\mathbb{E}_{q_{\\widetilde{U}}} \\left[ \\log \\boldsymbol{p}_{\\alpha}(\\widetilde{x}) \\right]` - - which corresponds to the maximum likelihood estimate. - - :param classifier: a sklearn's Estimator that generates a binary classifier. - :param val_split: specifies the data used for generating classifier predictions. This specification - can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to - be extracted from the training set; or as an integer (default 5), indicating that the predictions - are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value - for `k`); or as a collection defining the specific set of data to use for validation. - Alternatively, this set can be specified at fit time by indicating the exact set of data - on which the predictions are to be generated. - :param bandwidth: float, the bandwidth of the Kernel - :param random_state: a seed to be set before fitting any base quantifier (default None) - """ - - def __init__(self, classifier: BaseEstimator = None, val_split=5, bandwidth=0.1, random_state=None): +class KDEyMLauto(KDEyML): + def __init__(self, classifier: BaseEstimator = None, val_split=5, random_state=None, optim='two_steps'): self.classifier = qp._get_classifier(classifier) self.val_split = val_split - self.bandwidth = KDEBase._check_bandwidth(bandwidth) + self.bandwidth = None self.random_state = random_state + self.optim = optim - def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): - self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth) - return self + def chose_bandwidth(self, train, test_instances): + classif_predictions = self.classifier_fit_predict(train, fit_classifier=True, predict_on=self.val_split) + te_posteriors = self.classify(test_instances) + return self.transduce(classif_predictions, te_posteriors) - def aggregate(self, posteriors: np.ndarray): - """ - Searches for the mixture model parameter (the sought prevalence values) that maximizes the likelihood - of the data (i.e., that minimizes the negative log-likelihood) + def transduce(self, classif_predictions, te_posteriors): + tr_posteriors, tr_y = classif_predictions.Xy + classes = classif_predictions.classes_ + n_classes = len(classes) - :param posteriors: instances in the sample converted into posterior probabilities - :return: a vector of class prevalence estimates - """ + current_bandwidth = 0.05 + if self.optim == 'both_fine': + current_bandwidth = np.full(fill_value=current_bandwidth, shape=(n_classes,)) + current_prevalence = np.full(fill_value=1 / n_classes, shape=(n_classes,)) + + iterations = 0 + convergence = False with qp.util.temp_seed(self.random_state): - epsilon = 1e-10 - n_classes = len(self.mix_densities) - test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities] - def neg_loglikelihood(prev): - test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities)) - test_loglikelihood = np.log(test_mixture_likelihood + epsilon) - return -np.sum(test_loglikelihood) + while not convergence: + previous_bandwidth = current_bandwidth + previous_prevalence = current_prevalence - return F.optim_minimize(neg_loglikelihood, n_classes) + iterations += 1 + print(f'{iterations}:') + if self.optim == 'two_steps': + current_prevalence = self.optim_minimize_prevalence(current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes) + print(f'\testim-prev={F.strprev(current_prevalence)}') -class KDEyHD(AggregativeSoftQuantifier, KDEBase): - """ - Kernel Density Estimation model for quantification (KDEy) relying on the squared Hellinger Disntace (HD) as - the divergence measure to be minimized. This method was first proposed in the paper - `Kernel Density Estimation for Multiclass Quantification `_, in which - the authors proposed a Monte Carlo approach for minimizing the divergence. + current_bandwidth = self.optim_minimize_bandwidth(current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes) + print(f'\tbandwidth={current_bandwidth}') + if np.isclose(previous_bandwidth, current_bandwidth, atol=0.0001) and all( + np.isclose(previous_prevalence, current_prevalence, atol=0.0001)): + convergence = True + elif self.optim == 'both': + current_prevalence, current_bandwidth = self.optim_minimize_both(current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes) + if np.isclose(previous_bandwidth, current_bandwidth, atol=0.0001) and all(np.isclose(previous_prevalence, current_prevalence, atol=0.0001)): + convergence = True + elif self.optim == 'both_fine': + current_prevalence, current_bandwidth = self.optim_minimize_both_fine(current_bandwidth, current_prevalence, tr_posteriors, tr_y, + te_posteriors, classes) - The distribution matching optimization problem comes down to solving: + if all(np.isclose(previous_bandwidth, current_bandwidth, atol=0.0001)) and all(np.isclose(previous_prevalence, current_prevalence, atol=0.0001)): + convergence = True - :math:`\\hat{\\alpha} = \\arg\\min_{\\alpha\\in\\Delta^{n-1}} \\mathcal{D}(\\boldsymbol{p}_{\\alpha}||q_{\\widetilde{U}})` - - where :math:`p_{\\alpha}` is the mixture of class-specific KDEs with mixture parameter (hence class prevalence) - :math:`\\alpha` defined by - - :math:`\\boldsymbol{p}_{\\alpha}(\\widetilde{x}) = \\sum_{i=1}^n \\alpha_i p_{\\widetilde{L}_i}(\\widetilde{x})` - - where :math:`p_X(\\boldsymbol{x}) = \\frac{1}{|X|} \\sum_{x_i\\in X} K\\left(\\frac{x-x_i}{h}\\right)` is the - KDE function that uses the datapoints in X as the kernel centers. - - In KDEy-HD, the divergence is taken to be the squared Hellinger Distance, an f-divergence with corresponding - f-generator function given by: - - :math:`f(u)=(\\sqrt{u}-1)^2` - - The authors proposed a Monte Carlo solution that relies on importance sampling: - - :math:`\\hat{D}_f(p||q)= \\frac{1}{t} \\sum_{i=1}^t f\\left(\\frac{p(x_i)}{q(x_i)}\\right) \\frac{q(x_i)}{r(x_i)}` - - where the datapoints (trials) :math:`x_1,\\ldots,x_t\\sim_{\\mathrm{iid}} r` with :math:`r` the - uniform distribution. - - :param classifier: a sklearn's Estimator that generates a binary classifier. - :param val_split: specifies the data used for generating classifier predictions. This specification - can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to - be extracted from the training set; or as an integer (default 5), indicating that the predictions - are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value - for `k`); or as a collection defining the specific set of data to use for validation. - Alternatively, this set can be specified at fit time by indicating the exact set of data - on which the predictions are to be generated. - :param bandwidth: float, the bandwidth of the Kernel - :param random_state: a seed to be set before fitting any base quantifier (default None) - :param montecarlo_trials: number of Monte Carlo trials (default 10000) - """ - - def __init__(self, classifier: BaseEstimator = None, val_split=5, divergence: str = 'HD', - bandwidth=0.1, random_state=None, montecarlo_trials=10000): - - self.classifier = qp._get_classifier(classifier) - self.val_split = val_split - self.divergence = divergence - self.bandwidth = KDEBase._check_bandwidth(bandwidth) - self.random_state = random_state - self.montecarlo_trials = montecarlo_trials - - def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): - self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth) - - N = self.montecarlo_trials - rs = self.random_state - n = data.n_classes - self.reference_samples = np.vstack([kde_i.sample(N // n, random_state=rs) for kde_i in self.mix_densities]) - self.reference_classwise_densities = np.asarray( - [self.pdf(kde_j, self.reference_samples) for kde_j in self.mix_densities]) - self.reference_density = np.mean(self.reference_classwise_densities, - axis=0) # equiv. to (uniform @ self.reference_classwise_densities) - - return self - - def aggregate(self, posteriors: np.ndarray): - # we retain all n*N examples (sampled from a mixture with uniform parameter), and then - # apply importance sampling (IS). In this version we compute D(p_alpha||q) with IS - n_classes = len(self.mix_densities) - - test_kde = self.get_kde_function(posteriors, self.bandwidth) - test_densities = self.pdf(test_kde, self.reference_samples) - - def f_squared_hellinger(u): - return (np.sqrt(u) - 1) ** 2 - - # todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway - if self.divergence.lower() == 'hd': - f = f_squared_hellinger - else: - raise ValueError('only squared HD is currently implemented') + self.bandwidth = current_bandwidth + print('bandwidth=', current_bandwidth) + print('prevalence=', current_prevalence) + return current_prevalence + def optim_minimize_prevalence(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes): epsilon = 1e-10 - qs = test_densities + epsilon - rs = self.reference_density + epsilon - iw = qs / rs # importance weights - p_class = self.reference_classwise_densities + epsilon - fracs = p_class / qs + mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, current_bandwidth) + test_densities = [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities] - def divergence(prev): - # ps / qs = (prev @ p_class) / qs = prev @ (p_class / qs) = prev @ fracs - ps_div_qs = prev @ fracs - return np.mean(f(ps_div_qs) * iw) + def neg_loglikelihood_prev(prev): + test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities)) + test_loglikelihood = np.log(test_mixture_likelihood + epsilon) + return -np.sum(test_loglikelihood) - return F.optim_minimize(divergence, n_classes) + return optim_minimize(neg_loglikelihood_prev, current_prev) + def optim_minimize_bandwidth(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes): + epsilon = 1e-10 + def neg_loglikelihood_bandwidth(bandwidth): + mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth[0]) + test_densities = [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities] + test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(current_prev, test_densities)) + test_loglikelihood = np.log(test_mixture_likelihood + epsilon) + return -np.sum(test_loglikelihood) -class KDEyCS(AggregativeSoftQuantifier): - """ - Kernel Density Estimation model for quantification (KDEy) relying on the Cauchy-Schwarz divergence (CS) as - the divergence measure to be minimized. This method was first proposed in the paper - `Kernel Density Estimation for Multiclass Quantification `_, in which - the authors proposed a Monte Carlo approach for minimizing the divergence. + bounds = [(0.00001, 1)] + r = optimize.minimize(neg_loglikelihood_bandwidth, x0=[current_bandwidth], method='SLSQP', bounds=bounds) + print(f'iterations-bandwidth={r.nit}') + return r.x[0] - The distribution matching optimization problem comes down to solving: + def optim_minimize_both(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes): + epsilon = 1e-10 + n_classes = len(current_prev) + def neg_loglikelihood_bandwidth(prevalence_bandwidth): + bandwidth = prevalence_bandwidth[-1] + prevalence = prevalence_bandwidth[:-1] + mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth) + test_densities = [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities] + test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prevalence, test_densities)) + test_loglikelihood = np.log(test_mixture_likelihood + epsilon) + return -np.sum(test_loglikelihood) - :math:`\\hat{\\alpha} = \\arg\\min_{\\alpha\\in\\Delta^{n-1}} \\mathcal{D}(\\boldsymbol{p}_{\\alpha}||q_{\\widetilde{U}})` + bounds = [(0, 1) for _ in range(n_classes)] + [(0.00001, 1)] + constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x[:n_classes])}) + prevalence_bandwidth = np.append(current_prev, current_bandwidth) + r = optimize.minimize(neg_loglikelihood_bandwidth, x0=prevalence_bandwidth, method='SLSQP', bounds=bounds, constraints=constraints) + print(f'iterations-both={r.nit}') + prev_band = r.x + current_prevalence = prev_band[:-1] + current_bandwidth = prev_band[-1] + return current_prevalence, current_bandwidth - where :math:`p_{\\alpha}` is the mixture of class-specific KDEs with mixture parameter (hence class prevalence) - :math:`\\alpha` defined by + def optim_minimize_both_fine(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes): + epsilon = 1e-10 + n_classes = len(current_bandwidth) + def neg_loglikelihood_bandwidth(prevalence_bandwidth): + prevalence = prevalence_bandwidth[:n_classes] + bandwidth = prevalence_bandwidth[n_classes:] + mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth) + test_densities = [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities] + test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prevalence, test_densities)) + test_loglikelihood = np.log(test_mixture_likelihood + epsilon) + return -np.sum(test_loglikelihood) - :math:`\\boldsymbol{p}_{\\alpha}(\\widetilde{x}) = \\sum_{i=1}^n \\alpha_i p_{\\widetilde{L}_i}(\\widetilde{x})` - - where :math:`p_X(\\boldsymbol{x}) = \\frac{1}{|X|} \\sum_{x_i\\in X} K\\left(\\frac{x-x_i}{h}\\right)` is the - KDE function that uses the datapoints in X as the kernel centers. - - In KDEy-CS, the divergence is taken to be the Cauchy-Schwarz divergence given by: - - :math:`\\mathcal{D}_{\\mathrm{CS}}(p||q)=-\\log\\left(\\frac{\\int p(x)q(x)dx}{\\sqrt{\\int p(x)^2dx \\int q(x)^2dx}}\\right)` - - The authors showed that this distribution matching admits a closed-form solution - - :param classifier: a sklearn's Estimator that generates a binary classifier. - :param val_split: specifies the data used for generating classifier predictions. This specification - can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to - be extracted from the training set; or as an integer (default 5), indicating that the predictions - are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value - for `k`); or as a collection defining the specific set of data to use for validation. - Alternatively, this set can be specified at fit time by indicating the exact set of data - on which the predictions are to be generated. - :param bandwidth: float, the bandwidth of the Kernel - """ - - def __init__(self, classifier: BaseEstimator = None, val_split=5, bandwidth=0.1): - self.classifier = qp._get_classifier(classifier) - self.val_split = val_split - self.bandwidth = KDEBase._check_bandwidth(bandwidth) - - def gram_matrix_mix_sum(self, X, Y=None): - # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y)) - # to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are - # two "scalar matrices" (h^2)*I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth) - h = self.bandwidth - variance = 2 * (h ** 2) - nD = X.shape[1] - gamma = 1 / (2 * variance) - norm_factor = 1 / np.sqrt(((2 * np.pi) ** nD) * (variance ** (nD))) - gram = norm_factor * rbf_kernel(X, Y, gamma=gamma) - return gram.sum() + bounds = [(0, 1) for _ in range(n_classes)] + [(0.00001, 1) for _ in range(n_classes)] + constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x[:n_classes])}) + prevalence_bandwidth = np.concatenate((current_prev, current_bandwidth)) + r = optimize.minimize(neg_loglikelihood_bandwidth, x0=prevalence_bandwidth, method='SLSQP', bounds=bounds, constraints=constraints) + print(f'iterations-both-fine={r.nit}') + prev_band = r.x + current_prevalence = prev_band[:n_classes] + current_bandwidth = prev_band[n_classes:] + return current_prevalence, current_bandwidth def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): - - P, y = classif_predictions.Xy - n = data.n_classes - - assert all(sorted(np.unique(y)) == np.arange(n)), \ - 'label name gaps not allowed in current implementation' - - # counts_inv keeps track of the relative weight of each datapoint within its class - # (i.e., the weight in its KDE model) - counts_inv = 1 / (data.counts()) - - # tr_tr_sums corresponds to symbol \overline{B} in the paper - tr_tr_sums = np.zeros(shape=(n, n), dtype=float) - for i in range(n): - for j in range(n): - if i > j: - tr_tr_sums[i, j] = tr_tr_sums[j, i] - else: - block = self.gram_matrix_mix_sum(P[y == i], P[y == j] if i != j else None) - tr_tr_sums[i, j] = block - - # keep track of these data structures for the test phase - self.Ptr = P - self.ytr = y - self.tr_tr_sums = tr_tr_sums - self.counts_inv = counts_inv - + self.classif_predictions = classif_predictions return self def aggregate(self, posteriors: np.ndarray): - Ptr = self.Ptr - Pte = posteriors - y = self.ytr - tr_tr_sums = self.tr_tr_sums + return self.transduce(self.classif_predictions, posteriors) - M, nD = Pte.shape - Minv = (1 / M) # t in the paper - n = Ptr.shape[1] - # becomes a constant that does not affect the optimization, no need to compute it - # partC = 0.5*np.log(self.gram_matrix_mix_sum(Pte) * Kinv * Kinv) +def optim_minimize(loss: Callable, init_prev: np.ndarray): + """ + Searches for the optimal prevalence values, i.e., an `n_classes`-dimensional vector of the (`n_classes`-1)-simplex + that yields the smallest lost. This optimization is carried out by means of a constrained search using scipy's + SLSQP routine. - # tr_te_sums corresponds to \overline{a}*(1/Li)*(1/M) in the paper (note the constants - # are already aggregated to tr_te_sums, so these multiplications are not carried out - # at each iteration of the optimization phase) - tr_te_sums = np.zeros(shape=n, dtype=float) - for i in range(n): - tr_te_sums[i] = self.gram_matrix_mix_sum(Ptr[y == i], Pte) + :param loss: (callable) the function to minimize + :return: (ndarray) the best prevalence vector found + """ - def divergence(alpha): - # called \overline{r} in the paper - alpha_ratio = alpha * self.counts_inv - - # recall that tr_te_sums already accounts for the constant terms (1/Li)*(1/M) - partA = -np.log((alpha_ratio @ tr_te_sums) * Minv) - partB = 0.5 * np.log(alpha_ratio @ tr_tr_sums @ alpha_ratio) - return partA + partB # + partC - - return F.optim_minimize(divergence, n) + n_classes = len(init_prev) + # solutions are bounded to those contained in the unit-simplex + bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1] + constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1 + r = optimize.minimize(loss, x0=init_prev, method='SLSQP', bounds=bounds, constraints=constraints) + print(f'iterations-prevalence={r.nit}') + return r.x diff --git a/KDEy/quantification_evaluation.py b/KDEy/quantification_evaluation.py new file mode 100644 index 0000000..6bc2c43 --- /dev/null +++ b/KDEy/quantification_evaluation.py @@ -0,0 +1,156 @@ +import pickle +import os +from time import time +from collections import defaultdict + +import numpy as np +from sklearn.linear_model import LogisticRegression + +import quapy as qp +from KDEy.kdey_devel import KDEyMLauto +from quapy.method.aggregative import PACC, EMQ, KDEyML +from quapy.model_selection import GridSearchQ +from quapy.protocol import UPP +from pathlib import Path + +SEED = 1 + + +def newLR(): + return LogisticRegression(max_iter=3000) + + +# typical hyperparameters explored for Logistic Regression +logreg_grid = { + 'C': [1], + 'class_weight': [None] +} + + +def wrap_hyper(classifier_hyper_grid: dict): + return {'classifier__' + k: v for k, v in classifier_hyper_grid.items()} + + +METHODS = [ + ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)), + ('EMQ', EMQ(newLR()), wrap_hyper(logreg_grid)), + ('KDEy-ML', KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.linspace(0.01, 0.2, 20)}}), +] + + +""" +TKDEyML era primero bandwidth (init 0.05) y luego prevalence (init uniform) +TKDEyML2 era primero prevalence (init uniform) y luego bandwidth (init 0.05) +TKDEyML3 era primero prevalence (init uniform) y luego bandwidth (init 0.1) +TKDEyML4 es como ML2 pero max 5 iteraciones por optimización +""" +TRANSDUCTIVE_METHODS = [ + #('TKDEy-ML', KDEyMLauto(newLR()), None), + ('TKDEy-MLboth', KDEyMLauto(newLR(), optim='both'), None), + ('TKDEy-MLbothfine', KDEyMLauto(newLR(), optim='both_fine'), None), + ('TKDEy-ML2', KDEyMLauto(newLR()), None), + #('TKDEy-ML3', KDEyMLauto(newLR()), None), + #('TKDEy-ML4', KDEyMLauto(newLR()), None), +] + +def show_results(result_path): + import pandas as pd + df = pd.read_csv(result_path + '.csv', sep='\t') + pd.set_option('display.max_columns', None) + pd.set_option('display.max_rows', None) + pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE", "t_train"], margins=True) + print(pv) + + +def load_timings(result_path): + import pandas as pd + timings = defaultdict(lambda: {}) + if not Path(result_path + '.csv').exists(): + return timings + + df = pd.read_csv(result_path + '.csv', sep='\t') + return timings | df.pivot_table(index='Dataset', columns='Method', values='t_train').to_dict() + + +if __name__ == '__main__': + + qp.environ['SAMPLE_SIZE'] = 500 + qp.environ['N_JOBS'] = -1 + n_bags_val = 25 + n_bags_test = 100 + result_dir = f'results_quantification/ucimulti' + + os.makedirs(result_dir, exist_ok=True) + + global_result_path = f'{result_dir}/allmethods' + timings = load_timings(global_result_path) + with open(global_result_path + '.csv', 'wt') as csv: + csv.write(f'Method\tDataset\tMAE\tMRAE\tt_train\n') + + for method_name, quantifier, param_grid in METHODS + TRANSDUCTIVE_METHODS: + + print('Init method', method_name) + + with open(global_result_path + '.csv', 'at') as csv: + for dataset in qp.datasets.UCI_MULTICLASS_DATASETS[:4]: + print('init', dataset) + + local_result_path = os.path.join(Path(global_result_path).parent, + method_name + '_' + dataset + '.dataframe') + + if os.path.exists(local_result_path): + print(f'result file {local_result_path} already exist; skipping') + report = qp.util.load_report(local_result_path) + + else: + with qp.util.temp_seed(SEED): + + data = qp.datasets.fetch_UCIMulticlassDataset(dataset, verbose=True) + + if not method_name.startswith("TKDEy-ML"): + # model selection + train, test = data.train_test + train, val = train.split_stratified(random_state=SEED) + + protocol = UPP(val, repeats=n_bags_val) + modsel = GridSearchQ( + quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error='mae' + ) + + t_init = time() + try: + modsel.fit(train) + + print(f'best params {modsel.best_params_}') + print(f'best score {modsel.best_score_}') + + quantifier = modsel.best_model() + except: + print('something went wrong... trying to fit the default model') + quantifier.fit(train) + timings[method_name][dataset] = time() - t_init + + protocol = UPP(test, repeats=n_bags_test) + report = qp.evaluation.evaluation_report( + quantifier, protocol, error_metrics=['mae', 'mrae'], verbose=True + ) + report.to_csv(local_result_path) + else: + # model selection + train, test = data.train_test + t_init = time() + quantifier.fit(train) + timings[method_name][dataset] = time() - t_init + + protocol = UPP(test, repeats=n_bags_test) + report = qp.evaluation.evaluation_report( + quantifier, protocol, error_metrics=['mae', 'mrae'], verbose=True + ) + report.to_csv(local_result_path) + + means = report.mean(numeric_only=True) + csv.write( + f'{method_name}\t{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')#\t{timings[method_name][dataset]:.3f}\n') + csv.flush() + + show_results(global_result_path) \ No newline at end of file diff --git a/KDEy/utils.py b/KDEy/utils.py index c673378..382727e 100644 --- a/KDEy/utils.py +++ b/KDEy/utils.py @@ -1,5 +1,10 @@ import time from functools import wraps +import os +from os.path import join +from result_table.src.table import Table +import numpy as np +from constants import * def measuretime(func): @wraps(func) @@ -12,4 +17,65 @@ def measuretime(func): return (*result, time_it_took) else: return result, time_it_took - return wrapper \ No newline at end of file + return wrapper + + +def plot_bandwidth(dataset_name, test_results, bandwidths, triplet_list_results): + import matplotlib.pyplot as plt + + print("PLOT", dataset_name) + print(dataset_name) + + plt.figure(figsize=(8, 6)) + + # show test results + plt.plot(bandwidths, test_results, marker='o', color='k') + + colors = plt.cm.tab10(np.linspace(0, 1, len(triplet_list_results))) + for i, (method_name, method_choice, method_time) in enumerate(triplet_list_results): + plt.axvline(x=method_choice, linestyle='--', label=method_name, color=colors[i]) + + # Agregar etiquetas y título + plt.xlabel('Bandwidth') + plt.ylabel('MAE') + plt.title(dataset_name) + + # Mostrar la leyenda + plt.legend(loc='center left', bbox_to_anchor=(1, 0.5)) + + # Mostrar la gráfica + plt.grid(True) + + plotdir = './plots' + if DEBUG: + plotdir = './plots_debug' + os.makedirs(plotdir, exist_ok=True) + plt.tight_layout() + plt.savefig(f'{plotdir}/{dataset_name}.png') + plt.close() + +def error_table(dataset_name, test_results, bandwidth_range, triplet_list_results): + best_bandwidth = bandwidth_range[np.argmin(test_results)] + best_score = np.min(test_results) + print(f'Method\tChoice\tAE\tTime') + table=Table(name=dataset_name) + table.format.with_mean=False + table.format.with_rank_mean = False + table.format.show_std = False + for method_name, method_choice, took in triplet_list_results: + if method_choice in bandwidth_range: + index = np.where(bandwidth_range == method_choice)[0][0] + method_score = test_results[index] + else: + method_score = 1 + error = np.abs(best_score-method_score) + table.add(benchmark='Choice', method=method_name, v=method_choice) + table.add(benchmark='ScoreChoice', method=method_name, v=method_score) + table.add(benchmark='Best', method=method_name, v=best_bandwidth) + table.add(benchmark='ScoreBest', method=method_name, v=best_score) + table.add(benchmark='AE', method=method_name, v=error) + table.add(benchmark='Time', method=method_name, v=took) + outpath = './tables' + if DEBUG: + outpath = './tables_debug' + table.latexPDF(join(outpath, dataset_name+'.pdf'), transpose=True) \ No newline at end of file diff --git a/quapy/__init__.py b/quapy/__init__.py index db34bbb..44a23a4 100644 --- a/quapy/__init__.py +++ b/quapy/__init__.py @@ -14,7 +14,7 @@ from . import model_selection from . import classification import os -__version__ = '0.1.9' +__version__ = '0.1.10' environ = { 'SAMPLE_SIZE': None, diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index 5582a58..7d1823d 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -3,6 +3,7 @@ from contextlib import contextmanager import zipfile from os.path import join import pandas as pd +import sklearn.datasets from ucimlrepo import fetch_ucirepo from quapy.data.base import Dataset, LabelledCollection from quapy.data.preprocessing import text2tfidf, reduce_columns @@ -1004,3 +1005,49 @@ def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=No return train, test_gen else: return train_gen, test_gen + + +def syntheticUniformLabelledCollection(n_samples, n_features, n_classes, n_clusters_per_class=1, **kwargs): + """ + Generates a synthetic labelled collection with uniform priors and + of `n_samples` instances, `n_features` features, and `n_classes` classes. + The underlying generator relies on the function + `sklearn.datasets.make_classification`. Other options can be specified using the `kwargs`; + see the `scikit-learn documentation + `_ + for a full list of optional parameters. + + :param n_samples: number of instances + :param n_features: number of features + :param n_classes: number of classes + """ + X, y = sklearn.datasets.make_classification( + n_samples=n_samples, + n_features=n_features, + n_classes=n_classes, + n_clusters_per_class=n_clusters_per_class, + **kwargs + ) + + return LabelledCollection(X, y) + +def syntheticUniformDataset(n_samples, n_features, n_classes, test_split=0.3, **kwargs): + """ + Generates a synthetic Dataset with approximately uniform priors and + of `n_samples` instances, `n_features` features, and `n_classes` classes. + The underlying generator relies on the function + `sklearn.datasets.make_classification`. Other options can be specified using the `kwargs`; + see the `scikit-learn documentation + `_ + for a full list of optional parameters. + + :param n_samples: number of instances + :param n_features: number of features + :param n_classes: number of classes + :param test_split: proportion of test instances + """ + assert 0. < test_split < 1., "invalid proportion of test instances; the value must be in (0, 1)" + lc = syntheticUniformLabelledCollection(n_samples, n_features, n_classes, **kwargs) + training, test = lc.split_stratified(train_prop=1-test_split, random_state=kwargs.get('random_state', None)) + dataset = Dataset(training=training, test=test, name=f'synthetic(nF={n_features},nC={n_classes})') + return dataset \ No newline at end of file diff --git a/quapy/method/_kdey.py b/quapy/method/_kdey.py index e50c99c..f2b9465 100644 --- a/quapy/method/_kdey.py +++ b/quapy/method/_kdey.py @@ -66,11 +66,13 @@ class KDEBase: """ class_cond_X = [] for cat in classes: - selX = X[y==cat] - if selX.size==0: + selX = X[y == cat] + if selX.size == 0: selX = [F.uniform_prevalence(len(classes))] class_cond_X.append(selX) - return [self.get_kde_function(X_cond_yi, bandwidth) for X_cond_yi in class_cond_X] + if isinstance(bandwidth, float): + bandwidth = np.full(fill_value=bandwidth, shape=(len(classes),)) + return [self.get_kde_function(X_cond_yi, band_i) for X_cond_yi, band_i in zip(class_cond_X, bandwidth)] class KDEyML(AggregativeSoftQuantifier, KDEBase): @@ -188,7 +190,7 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase): def __init__(self, classifier: BaseEstimator=None, val_split=5, divergence: str='HD', bandwidth=0.1, random_state=None, montecarlo_trials=10000): - + self.classifier = qp._get_classifier(classifier) self.val_split = val_split self.divergence = divergence @@ -218,7 +220,7 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase): def f_squared_hellinger(u): return (np.sqrt(u)-1)**2 - + # todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway if self.divergence.lower() == 'hd': f = f_squared_hellinger @@ -283,7 +285,7 @@ class KDEyCS(AggregativeSoftQuantifier): def gram_matrix_mix_sum(self, X, Y=None): # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y)) - # to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are + # to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are # two "scalar matrices" (h^2)*I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth) h = self.bandwidth variance = 2 * (h**2) @@ -342,7 +344,7 @@ class KDEyCS(AggregativeSoftQuantifier): # at each iteration of the optimization phase) tr_te_sums = np.zeros(shape=n, dtype=float) for i in range(n): - tr_te_sums[i] = self.gram_matrix_mix_sum(Ptr[y==i], Pte) + tr_te_sums[i] = self.gram_matrix_mix_sum(Ptr[y==i], Pte) def divergence(alpha): # called \overline{r} in the paper diff --git a/result_table b/result_table new file mode 160000 index 0000000..c223c9f --- /dev/null +++ b/result_table @@ -0,0 +1 @@ +Subproject commit c223c9f1fe3c9708e8c5a5c56e438cdaaa857be4