From 23608f20382ed82a75e1afbbfff951167615ba59 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Thu, 4 Dec 2025 10:24:02 +0100 Subject: [PATCH] adding exploration in CLR --- BayesianKDEy/TODO.txt | 3 + BayesianKDEy/_bayeisan_kdey.py | 35 +++++-- BayesianKDEy/full_experiments.py | 128 ++++++++++++++---------- BayesianKDEy/generate_results.py | 103 +++++++++++++++++-- BayesianKDEy/single_experiment_debug.py | 95 ++++++++++++++++++ quapy/functional.py | 4 +- quapy/method/_kdey.py | 13 +-- quapy/method/confidence.py | 44 +++++++- quapy/model_selection.py | 2 +- 9 files changed, 343 insertions(+), 84 deletions(-) create mode 100644 BayesianKDEy/TODO.txt create mode 100644 BayesianKDEy/single_experiment_debug.py diff --git a/BayesianKDEy/TODO.txt b/BayesianKDEy/TODO.txt new file mode 100644 index 0000000..ac989dd --- /dev/null +++ b/BayesianKDEy/TODO.txt @@ -0,0 +1,3 @@ +- Add other methods that natively provide uncertainty quantification methods? +- Explore neighbourhood in the CLR space instead than in the simplex! +- \ No newline at end of file diff --git a/BayesianKDEy/_bayeisan_kdey.py b/BayesianKDEy/_bayeisan_kdey.py index d586bef..16401bf 100644 --- a/BayesianKDEy/_bayeisan_kdey.py +++ b/BayesianKDEy/_bayeisan_kdey.py @@ -1,7 +1,7 @@ from sklearn.base import BaseEstimator import numpy as np from quapy.method._kdey import KDEBase -from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC +from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC, CLRtransformation from quapy.method.aggregative import AggregativeSoftQuantifier from tqdm import tqdm import quapy.functional as F @@ -40,6 +40,8 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): mcmc_seed: int = 0, confidence_level: float = 0.95, region: str = 'intervals', + explore_CLR=False, + step_size=0.05, verbose: bool = False): if num_warmup <= 0: @@ -48,13 +50,15 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): raise ValueError(f'parameter {num_samples=} must be a positive integer') super().__init__(classifier, fit_classifier, val_split) - self.bandwidth = KDEBase._check_bandwidth(bandwidth) + self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel) self.kernel = self._check_kernel(kernel) self.num_warmup = num_warmup self.num_samples = num_samples self.mcmc_seed = mcmc_seed self.confidence_level = confidence_level self.region = region + self.explore_CLR = explore_CLR + self.step_size = step_size self.verbose = verbose def aggregation_fit(self, classif_predictions, labels): @@ -105,10 +109,19 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): def log_prior(prev): return 0 - def sample_neighbour(prev, step_size=0.05): + def sample_neighbour(prev, step_size): # random-walk Metropolis-Hastings - dir_noise = rng.normal(scale=step_size, size=len(prev)) - neighbour = F.normalize_prevalence(prev + dir_noise, method='mapsimplex') + d = len(prev) + if not self.explore_CLR: + dir_noise = rng.normal(scale=step_size/np.sqrt(d), size=d) + neighbour = F.normalize_prevalence(prev + dir_noise, method='mapsimplex') + else: + clr = CLRtransformation() + clr_point = clr(prev) + dir_noise = rng.normal(scale=step_size, size=d) + clr_neighbour = clr_point+dir_noise + neighbour = clr.inverse(clr_neighbour) + assert in_simplex(neighbour), 'wrong CLR transformation' return neighbour n_classes = X_probs.shape[1] @@ -116,9 +129,9 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): current_likelihood = log_likelihood(current_prev) + log_prior(current_prev) # Metropolis-Hastings with adaptive rate - step_size = 0.05 + step_size = self.step_size target_acceptance = 0.3 - adapt_rate = 0.01 + adapt_rate = 0.05 acceptance_history = [] samples = [] @@ -142,7 +155,13 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC): if i < self.num_warmup and i%10==0 and len(acceptance_history)>=100: recent_accept_rate = np.mean(acceptance_history[-100:]) step_size *= np.exp(adapt_rate * (recent_accept_rate - target_acceptance)) + # step_size = float(np.clip(step_size, min_step, max_step)) + print(f'acceptance-rate={recent_accept_rate*100:.3f}%, step-size={step_size:.5f}') # remove "warmup" initial iterations samples = np.asarray(samples[self.num_warmup:]) - return samples \ No newline at end of file + return samples + + +def in_simplex(x): + return np.all(x >= 0) and np.isclose(x.sum(), 1) \ No newline at end of file diff --git a/BayesianKDEy/full_experiments.py b/BayesianKDEy/full_experiments.py index 2109127..2e0c493 100644 --- a/BayesianKDEy/full_experiments.py +++ b/BayesianKDEy/full_experiments.py @@ -9,8 +9,9 @@ from sklearn.model_selection import GridSearchCV, StratifiedKFold from copy import deepcopy as cp import quapy as qp from BayesianKDEy._bayeisan_kdey import BayesianKDEy -from quapy.method.aggregative import DistributionMatchingY as DMy -from quapy.method.base import BinaryQuantifier +from build.lib.quapy.data import LabelledCollection +from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier +from quapy.method.base import BinaryQuantifier, BaseQuantifier from quapy.model_selection import GridSearchQ from quapy.data import Dataset # from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot @@ -24,73 +25,95 @@ from tqdm import tqdm from scipy.stats import dirichlet from collections import defaultdict from time import time -from sklearn.base import clone +from sklearn.base import clone, BaseEstimator -# def new_classifier(training): -# print('optimizing hyperparameters of Logistic Regression') -# mod_sel = GridSearchCV( -# estimator=LogisticRegression(), -# param_grid={ -# 'C': np.logspace(-4, 4, 9), -# 'class_weight': ['balanced', None] -# }, -# cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=0), -# n_jobs=-1, -# refit=False, -# ) -# mod_sel.fit(*training.Xy) -# # optim = LogisticRegression(**mod_sel.best_params_) -# print(f'Done: hyperparameters chosen={mod_sel.best_params_}') -# # calib = CalibratedClassifierCV(optim, cv=10, n_jobs=-1, ensemble=False).fit(*training.Xy) -# # return calib -# return LogisticRegression(**mod_sel.best_params_) +class KDEyCLR(KDEyML): + def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None): + super().__init__( + classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth, + random_state=random_state, kernel='aitchison' + ) -def methods(): +def methods__(): acc_hyper = {} hdy_hyper = {'nbins': [3,4,5,8,16,32]} - kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]} + kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2], 'classifier__C':[1]} wrap_hyper = lambda dic: {f'quantifier__{k}':v for k,v in dic.items()} # yield 'BootstrapACC', AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), wrap_hyper(acc_hyper) # yield 'BootstrapHDy', AggregativeBootstrap(DMy(LR(), divergence='HD'), n_test_samples=1000, random_state=0), wrap_hyper(hdy_hyper) - # yield 'BootstrapKDEy', AggregativeBootstrap(KDEyML(LR()), n_test_samples=1000, random_state=0), wrap_hyper(kdey_hyper) + yield 'BootstrapKDEy', AggregativeBootstrap(KDEyML(LR()), n_test_samples=1000, random_state=0), wrap_hyper(kdey_hyper) # yield 'BayesianACC', BayesianCC(LR(), mcmc_seed=0), acc_hyper - yield 'BayesianHDy', PQ(LR(), stan_seed=0), hdy_hyper + # yield 'BayesianHDy', PQ(LR(), stan_seed=0), hdy_hyper # yield 'BayesianKDEy', BayesianKDEy(LR(), mcmc_seed=0), kdey_hyper -def experiment(dataset: Dataset, method: WithConfidenceABC, grid: dict): +def methods(): + """ + Returns a tuple (name, quantifier, hyperparams, bayesian/bootstrap_constructor), where: + - name: is a str representing the name of the method (e.g., 'BayesianKDEy') + - quantifier: is the base model (e.g., KDEyML()) + - hyperparams: is a dictionary for the quantifier (e.g., {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}) + - bayesian/bootstrap_constructor: is a function that instantiates the bayesian o bootstrap method with the + quantifier with optimized hyperparameters + """ + acc_hyper = {} + hdy_hyper = {'nbins': [3,4,5,8,16,32]} + kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]} + kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]} + + yield 'BootstrapACC', ACC(LR()), acc_hyper, lambda hyper: AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), + yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0) + + yield 'BootstrapHDy', DMy(LR()), hdy_hyper, lambda hyper: AggregativeBootstrap(DMy(LR(), **hyper), n_test_samples=1000, random_state=0), + + yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), + yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), + yield 'BayesianKDEy*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, **hyper), + + +def model_selection(train: LabelledCollection, point_quantifier: AggregativeQuantifier, grid: dict): with qp.util.temp_seed(0): + print(f'performing model selection for {point_quantifier.__class__.__name__} with grid {grid}') # model selection - train, test = dataset.train_test - train_prevalence = train.prevalence() if len(grid)>0: train, val = train.split_stratified(train_prop=0.6, random_state=0) mod_sel = GridSearchQ( - model=method, + model=point_quantifier, param_grid=grid, protocol=qp.protocol.UPP(val, repeats=250, random_state=0), - refit=True, + refit=False, n_jobs=-1, verbose=True ).fit(*train.Xy) - optim_quantifier = mod_sel.best_model() best_params = mod_sel.best_params_ - best_score = mod_sel.best_score_ - tr_time = mod_sel.refit_time_ else: - t_init = time() - method.fit(*train.Xy) - tr_time = time() - t_init - best_params, best_score = {}, -1 - optim_quantifier = method + best_params = {} + + return best_params + + +def experiment(dataset: Dataset, point_quantifier: AggregativeQuantifier, method_name:str, grid: dict, withconf_constructor, hyper_choice_path: Path): + with qp.util.temp_seed(0): + + training, test = dataset.train_test + + # model selection + best_hyperparams = qp.util.pickled_resource( + hyper_choice_path, model_selection, training, cp(point_quantifier), grid + ) + + t_init = time() + withconf_quantifier = withconf_constructor(best_hyperparams).fit(*training.Xy) + tr_time = time() - t_init # test + train_prevalence = training.prevalence() results = defaultdict(list) - test_generator = UPP(test, repeats=500, random_state=0) + test_generator = UPP(test, repeats=100, random_state=0) for i, (sample_X, true_prevalence) in tqdm(enumerate(test_generator()), total=test_generator.total(), desc=f'{method_name} predictions'): t_init = time() - point_estimate, region = optim_quantifier.predict_conf(sample_X) + point_estimate, region = withconf_quantifier.predict_conf(sample_X) ttime = time()-t_init results['true-prevs'].append(true_prevalence) results['point-estim'].append(point_estimate) @@ -103,9 +126,8 @@ def experiment(dataset: Dataset, method: WithConfidenceABC, grid: dict): results['samples'].append(region.samples) report = { - 'optim_hyper': best_params, - 'optim_score': best_score, - 'refit_time': tr_time, + 'optim_hyper': best_hyperparams, + 'train_time': tr_time, 'train-prev': train_prevalence, 'results': {k:np.asarray(v) for k,v in results.items()} } @@ -134,26 +156,30 @@ if __name__ == '__main__': result_dir = Path('./results') - for setup in [binary, multiclass]: + for setup in [binary, multiclass]: # [binary, multiclass]: qp.environ['SAMPLE_SIZE'] = setup['sample_size'] for data_name in setup['datasets']: print(f'dataset={data_name}') - if data_name=='breast-cancer' or data_name.startswith("cmc") or data_name.startswith("ctg"): - print(f'skipping dataset: {data_name}') - continue + # if data_name=='breast-cancer' or data_name.startswith("cmc") or data_name.startswith("ctg"): + # print(f'skipping dataset: {data_name}') + # continue data = setup['fetch_fn'](data_name) is_binary = data.n_classes==2 result_subdir = result_dir / ('binary' if is_binary else 'multiclass') - for method_name, method, hyper_params in methods(): + hyper_subdir = result_dir / 'hyperparams' / ('binary' if is_binary else 'multiclass') + for method_name, method, hyper_params, withconf_constructor in methods(): if isinstance(method, BinaryQuantifier) and not is_binary: continue result_path = experiment_path(result_subdir, data_name, method_name) - report = qp.util.pickled_resource(result_path, experiment, data, method, hyper_params) + hyper_path = experiment_path(hyper_subdir, data_name, method.__class__.__name__) + report = qp.util.pickled_resource( + result_path, experiment, data, method, method_name, hyper_params, withconf_constructor, hyper_path + ) print(f'dataset={data_name}, ' f'method={method_name}: ' f'mae={report["results"]["ae"].mean():.3f}, ' - f'coverage={report["results"]["coverage"].mean():.3f}, ' - f'amplitude={report["results"]["amplitude"].mean():.3f}, ') + f'coverage={report["results"]["coverage"].mean():.5f}, ' + f'amplitude={report["results"]["amplitude"].mean():.5f}, ') diff --git a/BayesianKDEy/generate_results.py b/BayesianKDEy/generate_results.py index 6d13849..1cd309d 100644 --- a/BayesianKDEy/generate_results.py +++ b/BayesianKDEy/generate_results.py @@ -1,31 +1,112 @@ import pickle from collections import defaultdict +from joblib import Parallel, delayed +from tqdm import tqdm import pandas as pd from glob import glob from pathlib import Path +import quapy as qp +from quapy.method.confidence import ConfidenceEllipseSimplex, ConfidenceEllipseCLR + +pd.set_option('display.max_columns', None) +pd.set_option('display.width', 2000) +pd.set_option('display.max_rows', None) +pd.set_option("display.expand_frame_repr", False) +pd.set_option("display.precision", 4) +pd.set_option("display.float_format", "{:.4f}".format) + + +def compute_coverage_amplitude(region_constructor): + all_samples = results['samples'] + all_true_prevs = results['true-prevs'] + + def process_one(samples, true_prevs): + ellipse = region_constructor(samples) + return ellipse.coverage(true_prevs), ellipse.montecarlo_proportion() + + out = Parallel(n_jobs=3)( + delayed(process_one)(samples, true_prevs) + for samples, true_prevs in tqdm( + zip(all_samples, all_true_prevs), + total=len(all_samples), + desc='constructing ellipses' + ) + ) + + # unzip results + coverage, amplitude = zip(*out) + return list(coverage), list(amplitude) + + +def update_pickle(report, pickle_path, updated_dict:dict): + for k,v in updated_dict.items(): + report[k]=v + pickle.dump(report, open(pickle_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL) + for setup in ['binary', 'multiclass']: path = f'./results/{setup}/*.pkl' table = defaultdict(list) - for file in glob(path): + for file in tqdm(glob(path), desc='processing results', total=len(glob(path))): file = Path(file) dataset, method = file.name.replace('.pkl', '').split('__') report = pickle.load(open(file, 'rb')) results = report['results'] n_samples = len(results['ae']) - table['method'].extend([method] * n_samples) + table['method'].extend([method.replace('Bayesian','Ba').replace('Bootstrap', 'Bo')] * n_samples) table['dataset'].extend([dataset] * n_samples) table['ae'].extend(results['ae']) - table['coverage'].extend(results['coverage']) - table['amplitude'].extend(results['amplitude']) + table['c-CI'].extend(results['coverage']) + table['a-CI'].extend(results['amplitude']) + + if 'coverage-CE' not in report: + covCE, ampCE = compute_coverage_amplitude(ConfidenceEllipseSimplex) + covCLR, ampCLR = compute_coverage_amplitude(ConfidenceEllipseCLR) + + update_fields = { + 'coverage-CE': covCE, + 'amplitude-CE': ampCE, + 'coverage-CLR': covCLR, + 'amplitude-CLR': ampCLR + } + + update_pickle(report, file, update_fields) + + table['c-CE'].extend(report['coverage-CE']) + table['a-CE'].extend(report['amplitude-CE']) + + table['c-CLR'].extend(report['coverage-CLR']) + table['a-CLR'].extend(report['amplitude-CLR']) + - pd.set_option('display.max_columns', None) - pd.set_option('display.width', 1000) - pd.set_option('display.max_rows', None) df = pd.DataFrame(table) - pv = pd.pivot_table(df, index='dataset', columns='method', values=['ae', 'coverage', 'amplitude']) - print(f'{setup=}') - print(pv) - print() + + n_classes = {} + tr_size = {} + for dataset in df['dataset'].unique(): + fetch_fn = { + 'binary': qp.datasets.fetch_UCIBinaryDataset, + 'multiclass': qp.datasets.fetch_UCIMulticlassDataset + }[setup] + data = fetch_fn(dataset) + n_classes[dataset] = data.n_classes + tr_size[dataset] = len(data.training) + + # remove datasets with more than max_classes classes + max_classes = 30 + for data_name, n in n_classes.items(): + if n > max_classes: + df = df[df["dataset"] != data_name] + + for region in ['CI', 'CE', 'CLR']: + pv = pd.pivot_table( + df, index='dataset', columns='method', values=['ae', f'c-{region}', f'a-{region}'], margins=True + ) + pv['n_classes'] = pv.index.map(n_classes).astype('Int64') + pv['tr_size'] = pv.index.map(tr_size).astype('Int64') + pv = pv.drop(columns=[col for col in pv.columns if col[-1] == "All"]) + print(f'{setup=}') + print(pv) + print('-'*80) diff --git a/BayesianKDEy/single_experiment_debug.py b/BayesianKDEy/single_experiment_debug.py new file mode 100644 index 0000000..6cbe8a7 --- /dev/null +++ b/BayesianKDEy/single_experiment_debug.py @@ -0,0 +1,95 @@ +import os +import warnings +from os.path import join +from pathlib import Path + +from sklearn.calibration import CalibratedClassifierCV +from sklearn.linear_model import LogisticRegression as LR +from sklearn.model_selection import GridSearchCV, StratifiedKFold +from copy import deepcopy as cp +import quapy as qp +from BayesianKDEy._bayeisan_kdey import BayesianKDEy +from BayesianKDEy.full_experiments import experiment, experiment_path, KDEyCLR +from build.lib.quapy.data import LabelledCollection +from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier +from quapy.method.base import BinaryQuantifier, BaseQuantifier +from quapy.model_selection import GridSearchQ +from quapy.data import Dataset +# from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot +from quapy.method.confidence import ConfidenceIntervals, BayesianCC, PQ, WithConfidenceABC, AggregativeBootstrap +from quapy.functional import strprev +from quapy.method.aggregative import KDEyML, ACC +from quapy.protocol import UPP +import quapy.functional as F +import numpy as np +from tqdm import tqdm +from scipy.stats import dirichlet +from collections import defaultdict +from time import time +from sklearn.base import clone, BaseEstimator + + +def method(): + """ + Returns a tuple (name, quantifier, hyperparams, bayesian/bootstrap_constructor), where: + - name: is a str representing the name of the method (e.g., 'BayesianKDEy') + - quantifier: is the base model (e.g., KDEyML()) + - hyperparams: is a dictionary for the quantifier (e.g., {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}) + - bayesian/bootstrap_constructor: is a function that instantiates the bayesian o bootstrap method with the + quantifier with optimized hyperparameters + """ + acc_hyper = {} + hdy_hyper = {'nbins': [3,4,5,8,16,32]} + kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]} + kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]} + + wrap_hyper = lambda dic: {f'quantifier__{k}':v for k,v in dic.items()} + + # yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True), + # yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper), + return 'BayKDE*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, + explore_CLR=True, + step_size=.15, + # num_warmup = 5000, + # num_samples = 10_000, + # region='ellipse', + **hyper), + + + +if __name__ == '__main__': + + binary = { + 'datasets': qp.datasets.UCI_BINARY_DATASETS, + 'fetch_fn': qp.datasets.fetch_UCIBinaryDataset, + 'sample_size': 500 + } + + multiclass = { + 'datasets': qp.datasets.UCI_MULTICLASS_DATASETS, + 'fetch_fn': qp.datasets.fetch_UCIMulticlassDataset, + 'sample_size': 1000 + } + + result_dir = Path('./results') + + setup = multiclass + qp.environ['SAMPLE_SIZE'] = setup['sample_size'] + data_name = 'digits' + print(f'dataset={data_name}') + data = setup['fetch_fn'](data_name) + is_binary = data.n_classes==2 + hyper_subdir = result_dir / 'hyperparams' / ('binary' if is_binary else 'multiclass') + method_name, method, hyper_params, withconf_constructor = method() + hyper_path = experiment_path(hyper_subdir, data_name, method.__class__.__name__) + report = experiment(data, method, method_name, hyper_params, withconf_constructor, hyper_path) + + print(f'dataset={data_name}, ' + f'method={method_name}: ' + f'mae={report["results"]["ae"].mean():.3f}, ' + f'coverage={report["results"]["coverage"].mean():.5f}, ' + f'amplitude={report["results"]["amplitude"].mean():.5f}, ') + + + + diff --git a/quapy/functional.py b/quapy/functional.py index 408c62a..5f6b915 100644 --- a/quapy/functional.py +++ b/quapy/functional.py @@ -583,8 +583,8 @@ def solve_adjustment( """ Function that tries to solve for :math:`p` the equation :math:`q = M p`, where :math:`q` is the vector of `unadjusted counts` (as estimated, e.g., via classify and count) with :math:`q_i` an estimate of - :math:`P(\hat{Y}=y_i)`, and where :math:`M` is the matrix of `class-conditional rates` with :math:`M_{ij}` an - estimate of :math:`P(\hat{Y}=y_i|Y=y_j)`. + :math:`P(\\hat{Y}=y_i)`, and where :math:`M` is the matrix of `class-conditional rates` with :math:`M_{ij}` an + estimate of :math:`P(\\hat{Y}=y_i|Y=y_j)`. :param class_conditional_rates: array of shape `(n_classes, n_classes,)` with entry `(i,j)` being the estimate of :math:`P(\hat{Y}=y_i|Y=y_j)`, that is, the probability that an instance that belongs to class :math:`y_j` diff --git a/quapy/method/_kdey.py b/quapy/method/_kdey.py index 5f80f8d..8475ee7 100644 --- a/quapy/method/_kdey.py +++ b/quapy/method/_kdey.py @@ -33,7 +33,7 @@ class KDEBase: @classmethod - def _check_bandwidth(cls, bandwidth): + def _check_bandwidth(cls, bandwidth, kernel): """ Checks that the bandwidth parameter is correct @@ -43,8 +43,9 @@ class KDEBase: assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \ f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values' if isinstance(bandwidth, float): - assert 0 < bandwidth < 1, \ - "the bandwidth for KDEy should be in (0,1), since this method models the unit simplex" + assert kernel!='gaussian' or (0 < bandwidth < 1), \ + ("the bandwidth for a Gaussian kernel in KDEy should be in (0,1), " + "since this method models the unit simplex") return bandwidth @classmethod @@ -166,7 +167,7 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase): def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1, kernel='gaussian', random_state=None): super().__init__(classifier, fit_classifier, val_split) - self.bandwidth = KDEBase._check_bandwidth(bandwidth) + self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel) self.kernel = self._check_kernel(kernel) self.random_state=random_state @@ -246,7 +247,7 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase): super().__init__(classifier, fit_classifier, val_split) self.divergence = divergence - self.bandwidth = KDEBase._check_bandwidth(bandwidth) + self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel='gaussian') self.random_state=random_state self.montecarlo_trials = montecarlo_trials @@ -333,7 +334,7 @@ class KDEyCS(AggregativeSoftQuantifier): def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1): super().__init__(classifier, fit_classifier, val_split) - self.bandwidth = KDEBase._check_bandwidth(bandwidth) + self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel='gaussian') def gram_matrix_mix_sum(self, X, Y=None): # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y)) diff --git a/quapy/method/confidence.py b/quapy/method/confidence.py index b5ebf7e..c308dfb 100644 --- a/quapy/method/confidence.py +++ b/quapy/method/confidence.py @@ -1,4 +1,5 @@ import numpy as np +from joblib import Parallel, delayed from sklearn.base import BaseEstimator from sklearn.metrics import confusion_matrix @@ -13,6 +14,7 @@ from abc import ABC, abstractmethod from scipy.special import softmax, factorial import copy from functools import lru_cache +from tqdm import tqdm """ This module provides implementation of different types of confidence regions, and the implementation of Bootstrap @@ -399,7 +401,8 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier): n_test_samples=500, confidence_level=0.95, region='intervals', - random_state=None): + random_state=None, + verbose=False): assert isinstance(quantifier, AggregativeQuantifier), \ f'base quantifier does not seem to be an instance of {AggregativeQuantifier.__name__}' @@ -416,6 +419,7 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier): self.confidence_level = confidence_level self.region = region self.random_state = random_state + self.verbose = verbose def aggregation_fit(self, classif_predictions, labels): data = LabelledCollection(classif_predictions, labels, classes=self.classes_) @@ -441,6 +445,24 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier): prev_mean, self.confidence = self.aggregate_conf(classif_predictions) return prev_mean + def aggregate_conf_sequential__(self, classif_predictions: np.ndarray, confidence_level=None): + if confidence_level is None: + confidence_level = self.confidence_level + + n_samples = classif_predictions.shape[0] + prevs = [] + with qp.util.temp_seed(self.random_state): + for quantifier in self.quantifiers: + for i in tqdm(range(self.n_test_samples), desc='resampling', total=self.n_test_samples, disable=not self.verbose): + sample_i = resample(classif_predictions, n_samples=n_samples) + prev_i = quantifier.aggregate(sample_i) + prevs.append(prev_i) + + conf = WithConfidenceABC.construct_region(prevs, confidence_level, method=self.region) + prev_estim = conf.point_estimate() + + return prev_estim, conf + def aggregate_conf(self, classif_predictions: np.ndarray, confidence_level=None): if confidence_level is None: confidence_level = self.confidence_level @@ -449,10 +471,15 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier): prevs = [] with qp.util.temp_seed(self.random_state): for quantifier in self.quantifiers: - for i in range(self.n_test_samples): - sample_i = resample(classif_predictions, n_samples=n_samples) - prev_i = quantifier.aggregate(sample_i) - prevs.append(prev_i) + results = Parallel(n_jobs=-1)( + delayed(bootstrap_once)(i, classif_predictions, quantifier, n_samples) + for i in range(self.n_test_samples) + ) + prevs.extend(results) + # for i in tqdm(range(self.n_test_samples), desc='resampling', total=self.n_test_samples, disable=not self.verbose): + # sample_i = resample(classif_predictions, n_samples=n_samples) + # prev_i = quantifier.aggregate(sample_i) + # prevs.append(prev_i) conf = WithConfidenceABC.construct_region(prevs, confidence_level, method=self.region) prev_estim = conf.point_estimate() @@ -477,6 +504,13 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier): return self.quantifier._classifier_method() +def bootstrap_once(i, classif_predictions, quantifier, n_samples): + idx = np.random.randint(0, len(classif_predictions), n_samples) + sample = classif_predictions[idx] + prev = quantifier.aggregate(sample) + return prev + + class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC): """ `Bayesian quantification `_ method (by Albert Ziegler and Paweł Czyż), diff --git a/quapy/model_selection.py b/quapy/model_selection.py index 0937fa8..c357082 100644 --- a/quapy/model_selection.py +++ b/quapy/model_selection.py @@ -410,7 +410,7 @@ def group_params(param_grid: dict): """ classifier_params, quantifier_params = {}, {} for key, values in param_grid.items(): - if key.startswith('classifier__') or key == 'val_split': + if 'classifier__' in key or key == 'val_split': classifier_params[key] = values else: quantifier_params[key] = values