adding exploration in CLR

This commit is contained in:
Alejandro Moreo Fernandez 2025-12-04 10:24:02 +01:00
parent 881e1033f1
commit 23608f2038
9 changed files with 343 additions and 84 deletions

3
BayesianKDEy/TODO.txt Normal file
View File

@ -0,0 +1,3 @@
- Add other methods that natively provide uncertainty quantification?
- Explore the neighbourhood in the CLR space instead of in the simplex!
-

View File

@ -1,7 +1,7 @@
from sklearn.base import BaseEstimator from sklearn.base import BaseEstimator
import numpy as np import numpy as np
from quapy.method._kdey import KDEBase from quapy.method._kdey import KDEBase
from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC, CLRtransformation
from quapy.method.aggregative import AggregativeSoftQuantifier from quapy.method.aggregative import AggregativeSoftQuantifier
from tqdm import tqdm from tqdm import tqdm
import quapy.functional as F import quapy.functional as F
@ -40,6 +40,8 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
mcmc_seed: int = 0, mcmc_seed: int = 0,
confidence_level: float = 0.95, confidence_level: float = 0.95,
region: str = 'intervals', region: str = 'intervals',
explore_CLR=False,
step_size=0.05,
verbose: bool = False): verbose: bool = False):
if num_warmup <= 0: if num_warmup <= 0:
@ -48,13 +50,15 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
raise ValueError(f'parameter {num_samples=} must be a positive integer') raise ValueError(f'parameter {num_samples=} must be a positive integer')
super().__init__(classifier, fit_classifier, val_split) super().__init__(classifier, fit_classifier, val_split)
self.bandwidth = KDEBase._check_bandwidth(bandwidth) self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel)
self.kernel = self._check_kernel(kernel) self.kernel = self._check_kernel(kernel)
self.num_warmup = num_warmup self.num_warmup = num_warmup
self.num_samples = num_samples self.num_samples = num_samples
self.mcmc_seed = mcmc_seed self.mcmc_seed = mcmc_seed
self.confidence_level = confidence_level self.confidence_level = confidence_level
self.region = region self.region = region
self.explore_CLR = explore_CLR
self.step_size = step_size
self.verbose = verbose self.verbose = verbose
def aggregation_fit(self, classif_predictions, labels): def aggregation_fit(self, classif_predictions, labels):
@ -105,10 +109,19 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
def log_prior(prev): def log_prior(prev):
return 0 return 0
def sample_neighbour(prev, step_size=0.05): def sample_neighbour(prev, step_size):
# random-walk Metropolis-Hastings # random-walk Metropolis-Hastings
dir_noise = rng.normal(scale=step_size, size=len(prev)) d = len(prev)
if not self.explore_CLR:
dir_noise = rng.normal(scale=step_size/np.sqrt(d), size=d)
neighbour = F.normalize_prevalence(prev + dir_noise, method='mapsimplex') neighbour = F.normalize_prevalence(prev + dir_noise, method='mapsimplex')
else:
clr = CLRtransformation()
clr_point = clr(prev)
dir_noise = rng.normal(scale=step_size, size=d)
clr_neighbour = clr_point+dir_noise
neighbour = clr.inverse(clr_neighbour)
assert in_simplex(neighbour), 'wrong CLR transformation'
return neighbour return neighbour
n_classes = X_probs.shape[1] n_classes = X_probs.shape[1]
@ -116,9 +129,9 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
current_likelihood = log_likelihood(current_prev) + log_prior(current_prev) current_likelihood = log_likelihood(current_prev) + log_prior(current_prev)
# Metropolis-Hastings with adaptive rate # Metropolis-Hastings with adaptive rate
step_size = 0.05 step_size = self.step_size
target_acceptance = 0.3 target_acceptance = 0.3
adapt_rate = 0.01 adapt_rate = 0.05
acceptance_history = [] acceptance_history = []
samples = [] samples = []
@ -142,7 +155,13 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
if i < self.num_warmup and i%10==0 and len(acceptance_history)>=100: if i < self.num_warmup and i%10==0 and len(acceptance_history)>=100:
recent_accept_rate = np.mean(acceptance_history[-100:]) recent_accept_rate = np.mean(acceptance_history[-100:])
step_size *= np.exp(adapt_rate * (recent_accept_rate - target_acceptance)) step_size *= np.exp(adapt_rate * (recent_accept_rate - target_acceptance))
# step_size = float(np.clip(step_size, min_step, max_step))
print(f'acceptance-rate={recent_accept_rate*100:.3f}%, step-size={step_size:.5f}')
# remove "warmup" initial iterations # remove "warmup" initial iterations
samples = np.asarray(samples[self.num_warmup:]) samples = np.asarray(samples[self.num_warmup:])
return samples return samples
def in_simplex(x):
    """
    Checks whether a vector lies on the probability simplex.

    :param x: array-like with the prevalence values to check (a list, a tuple, or a np.ndarray)
    :return: bool, True if every component is non-negative and the components sum up to 1 (up to the
        default tolerance of `np.isclose`), False otherwise
    """
    # coercing to an array makes plain lists/tuples valid inputs (they have no `.sum()` method)
    x = np.asarray(x, dtype=float)
    # cast to a built-in bool so that callers do not receive a np.bool_
    return bool(np.all(x >= 0) and np.isclose(x.sum(), 1))

View File

@ -9,8 +9,9 @@ from sklearn.model_selection import GridSearchCV, StratifiedKFold
from copy import deepcopy as cp from copy import deepcopy as cp
import quapy as qp import quapy as qp
from BayesianKDEy._bayeisan_kdey import BayesianKDEy from BayesianKDEy._bayeisan_kdey import BayesianKDEy
from quapy.method.aggregative import DistributionMatchingY as DMy from build.lib.quapy.data import LabelledCollection
from quapy.method.base import BinaryQuantifier from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier
from quapy.method.base import BinaryQuantifier, BaseQuantifier
from quapy.model_selection import GridSearchQ from quapy.model_selection import GridSearchQ
from quapy.data import Dataset from quapy.data import Dataset
# from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot # from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot
@ -24,73 +25,95 @@ from tqdm import tqdm
from scipy.stats import dirichlet from scipy.stats import dirichlet
from collections import defaultdict from collections import defaultdict
from time import time from time import time
from sklearn.base import clone from sklearn.base import clone, BaseEstimator
# def new_classifier(training): class KDEyCLR(KDEyML):
# print('optimizing hyperparameters of Logistic Regression') def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None):
# mod_sel = GridSearchCV( super().__init__(
# estimator=LogisticRegression(), classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth,
# param_grid={ random_state=random_state, kernel='aitchison'
# 'C': np.logspace(-4, 4, 9), )
# 'class_weight': ['balanced', None]
# },
# cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=0),
# n_jobs=-1,
# refit=False,
# )
# mod_sel.fit(*training.Xy)
# # optim = LogisticRegression(**mod_sel.best_params_)
# print(f'Done: hyperparameters chosen={mod_sel.best_params_}')
# # calib = CalibratedClassifierCV(optim, cv=10, n_jobs=-1, ensemble=False).fit(*training.Xy)
# # return calib
# return LogisticRegression(**mod_sel.best_params_)
def methods(): def methods__():
acc_hyper = {} acc_hyper = {}
hdy_hyper = {'nbins': [3,4,5,8,16,32]} hdy_hyper = {'nbins': [3,4,5,8,16,32]}
kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]} kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2], 'classifier__C':[1]}
wrap_hyper = lambda dic: {f'quantifier__{k}':v for k,v in dic.items()} wrap_hyper = lambda dic: {f'quantifier__{k}':v for k,v in dic.items()}
# yield 'BootstrapACC', AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), wrap_hyper(acc_hyper) # yield 'BootstrapACC', AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), wrap_hyper(acc_hyper)
# yield 'BootstrapHDy', AggregativeBootstrap(DMy(LR(), divergence='HD'), n_test_samples=1000, random_state=0), wrap_hyper(hdy_hyper) # yield 'BootstrapHDy', AggregativeBootstrap(DMy(LR(), divergence='HD'), n_test_samples=1000, random_state=0), wrap_hyper(hdy_hyper)
# yield 'BootstrapKDEy', AggregativeBootstrap(KDEyML(LR()), n_test_samples=1000, random_state=0), wrap_hyper(kdey_hyper) yield 'BootstrapKDEy', AggregativeBootstrap(KDEyML(LR()), n_test_samples=1000, random_state=0), wrap_hyper(kdey_hyper)
# yield 'BayesianACC', BayesianCC(LR(), mcmc_seed=0), acc_hyper # yield 'BayesianACC', BayesianCC(LR(), mcmc_seed=0), acc_hyper
yield 'BayesianHDy', PQ(LR(), stan_seed=0), hdy_hyper # yield 'BayesianHDy', PQ(LR(), stan_seed=0), hdy_hyper
# yield 'BayesianKDEy', BayesianKDEy(LR(), mcmc_seed=0), kdey_hyper # yield 'BayesianKDEy', BayesianKDEy(LR(), mcmc_seed=0), kdey_hyper
def experiment(dataset: Dataset, method: WithConfidenceABC, grid: dict): def methods():
"""
Returns a tuple (name, quantifier, hyperparams, bayesian/bootstrap_constructor), where:
- name: is a str representing the name of the method (e.g., 'BayesianKDEy')
- quantifier: is the base model (e.g., KDEyML())
- hyperparams: is a dictionary for the quantifier (e.g., {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]})
- bayesian/bootstrap_constructor: is a function that instantiates the bayesian o bootstrap method with the
quantifier with optimized hyperparameters
"""
acc_hyper = {}
hdy_hyper = {'nbins': [3,4,5,8,16,32]}
kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}
kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]}
yield 'BootstrapACC', ACC(LR()), acc_hyper, lambda hyper: AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0),
yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0)
yield 'BootstrapHDy', DMy(LR()), hdy_hyper, lambda hyper: AggregativeBootstrap(DMy(LR(), **hyper), n_test_samples=1000, random_state=0),
yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True),
yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper),
yield 'BayesianKDEy*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, **hyper),
def model_selection(train: LabelledCollection, point_quantifier: AggregativeQuantifier, grid: dict):
with qp.util.temp_seed(0): with qp.util.temp_seed(0):
print(f'performing model selection for {point_quantifier.__class__.__name__} with grid {grid}')
# model selection # model selection
train, test = dataset.train_test
train_prevalence = train.prevalence()
if len(grid)>0: if len(grid)>0:
train, val = train.split_stratified(train_prop=0.6, random_state=0) train, val = train.split_stratified(train_prop=0.6, random_state=0)
mod_sel = GridSearchQ( mod_sel = GridSearchQ(
model=method, model=point_quantifier,
param_grid=grid, param_grid=grid,
protocol=qp.protocol.UPP(val, repeats=250, random_state=0), protocol=qp.protocol.UPP(val, repeats=250, random_state=0),
refit=True, refit=False,
n_jobs=-1, n_jobs=-1,
verbose=True verbose=True
).fit(*train.Xy) ).fit(*train.Xy)
optim_quantifier = mod_sel.best_model()
best_params = mod_sel.best_params_ best_params = mod_sel.best_params_
best_score = mod_sel.best_score_
tr_time = mod_sel.refit_time_
else: else:
best_params = {}
return best_params
def experiment(dataset: Dataset, point_quantifier: AggregativeQuantifier, method_name:str, grid: dict, withconf_constructor, hyper_choice_path: Path):
with qp.util.temp_seed(0):
training, test = dataset.train_test
# model selection
best_hyperparams = qp.util.pickled_resource(
hyper_choice_path, model_selection, training, cp(point_quantifier), grid
)
t_init = time() t_init = time()
method.fit(*train.Xy) withconf_quantifier = withconf_constructor(best_hyperparams).fit(*training.Xy)
tr_time = time() - t_init tr_time = time() - t_init
best_params, best_score = {}, -1
optim_quantifier = method
# test # test
train_prevalence = training.prevalence()
results = defaultdict(list) results = defaultdict(list)
test_generator = UPP(test, repeats=500, random_state=0) test_generator = UPP(test, repeats=100, random_state=0)
for i, (sample_X, true_prevalence) in tqdm(enumerate(test_generator()), total=test_generator.total(), desc=f'{method_name} predictions'): for i, (sample_X, true_prevalence) in tqdm(enumerate(test_generator()), total=test_generator.total(), desc=f'{method_name} predictions'):
t_init = time() t_init = time()
point_estimate, region = optim_quantifier.predict_conf(sample_X) point_estimate, region = withconf_quantifier.predict_conf(sample_X)
ttime = time()-t_init ttime = time()-t_init
results['true-prevs'].append(true_prevalence) results['true-prevs'].append(true_prevalence)
results['point-estim'].append(point_estimate) results['point-estim'].append(point_estimate)
@ -103,9 +126,8 @@ def experiment(dataset: Dataset, method: WithConfidenceABC, grid: dict):
results['samples'].append(region.samples) results['samples'].append(region.samples)
report = { report = {
'optim_hyper': best_params, 'optim_hyper': best_hyperparams,
'optim_score': best_score, 'train_time': tr_time,
'refit_time': tr_time,
'train-prev': train_prevalence, 'train-prev': train_prevalence,
'results': {k:np.asarray(v) for k,v in results.items()} 'results': {k:np.asarray(v) for k,v in results.items()}
} }
@ -134,26 +156,30 @@ if __name__ == '__main__':
result_dir = Path('./results') result_dir = Path('./results')
for setup in [binary, multiclass]: for setup in [binary, multiclass]: # [binary, multiclass]:
qp.environ['SAMPLE_SIZE'] = setup['sample_size'] qp.environ['SAMPLE_SIZE'] = setup['sample_size']
for data_name in setup['datasets']: for data_name in setup['datasets']:
print(f'dataset={data_name}') print(f'dataset={data_name}')
if data_name=='breast-cancer' or data_name.startswith("cmc") or data_name.startswith("ctg"): # if data_name=='breast-cancer' or data_name.startswith("cmc") or data_name.startswith("ctg"):
print(f'skipping dataset: {data_name}') # print(f'skipping dataset: {data_name}')
continue # continue
data = setup['fetch_fn'](data_name) data = setup['fetch_fn'](data_name)
is_binary = data.n_classes==2 is_binary = data.n_classes==2
result_subdir = result_dir / ('binary' if is_binary else 'multiclass') result_subdir = result_dir / ('binary' if is_binary else 'multiclass')
for method_name, method, hyper_params in methods(): hyper_subdir = result_dir / 'hyperparams' / ('binary' if is_binary else 'multiclass')
for method_name, method, hyper_params, withconf_constructor in methods():
if isinstance(method, BinaryQuantifier) and not is_binary: if isinstance(method, BinaryQuantifier) and not is_binary:
continue continue
result_path = experiment_path(result_subdir, data_name, method_name) result_path = experiment_path(result_subdir, data_name, method_name)
report = qp.util.pickled_resource(result_path, experiment, data, method, hyper_params) hyper_path = experiment_path(hyper_subdir, data_name, method.__class__.__name__)
report = qp.util.pickled_resource(
result_path, experiment, data, method, method_name, hyper_params, withconf_constructor, hyper_path
)
print(f'dataset={data_name}, ' print(f'dataset={data_name}, '
f'method={method_name}: ' f'method={method_name}: '
f'mae={report["results"]["ae"].mean():.3f}, ' f'mae={report["results"]["ae"].mean():.3f}, '
f'coverage={report["results"]["coverage"].mean():.3f}, ' f'coverage={report["results"]["coverage"].mean():.5f}, '
f'amplitude={report["results"]["amplitude"].mean():.3f}, ') f'amplitude={report["results"]["amplitude"].mean():.5f}, ')

View File

@ -1,31 +1,112 @@
import pickle import pickle
from collections import defaultdict from collections import defaultdict
from joblib import Parallel, delayed
from tqdm import tqdm
import pandas as pd import pandas as pd
from glob import glob from glob import glob
from pathlib import Path from pathlib import Path
import quapy as qp
from quapy.method.confidence import ConfidenceEllipseSimplex, ConfidenceEllipseCLR
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 2000)
pd.set_option('display.max_rows', None)
pd.set_option("display.expand_frame_repr", False)
pd.set_option("display.precision", 4)
pd.set_option("display.float_format", "{:.4f}".format)
def compute_coverage_amplitude(region_constructor):
    """
    Builds one confidence region per test sample and evaluates its coverage and its amplitude.

    Note that this function reads the module-level variable `results`, from which the entries 'samples' and
    'true-prevs' are taken.

    :param region_constructor: callable that receives the samples stored for one test sample and returns a
        region object exposing `coverage(true_prevs)` and `montecarlo_proportion()`
    :return: a tuple `(coverage, amplitude)` with two lists, each containing one value per test sample
    """
    def evaluate_region(sample_set, true_prev):
        region = region_constructor(sample_set)
        return region.coverage(true_prev), region.montecarlo_proportion()

    samples_per_test = results['samples']
    true_prev_per_test = results['true-prevs']

    paired_inputs = tqdm(
        zip(samples_per_test, true_prev_per_test),
        total=len(samples_per_test),
        desc='constructing ellipses'
    )
    jobs = (delayed(evaluate_region)(sample_set, true_prev) for sample_set, true_prev in paired_inputs)
    outcomes = Parallel(n_jobs=3)(jobs)

    # split the list of (coverage, amplitude) pairs into two parallel lists
    coverage, amplitude = zip(*outcomes)
    return list(coverage), list(amplitude)
def update_pickle(report, pickle_path, updated_dict:dict):
    """
    Adds (or overwrites) entries in a report and writes the updated report to disk.

    :param report: dict, the report to update; it is modified in place
    :param pickle_path: path of the pickle file in which the updated report is stored (the file is overwritten)
    :param updated_dict: dict with the entries to be added to the report
    """
    report.update(updated_dict)
    # the context manager guarantees that the file is flushed and closed, even if pickling fails
    with open(pickle_path, 'wb') as fout:
        pickle.dump(report, fout, protocol=pickle.HIGHEST_PROTOCOL)
for setup in ['binary', 'multiclass']: for setup in ['binary', 'multiclass']:
path = f'./results/{setup}/*.pkl' path = f'./results/{setup}/*.pkl'
table = defaultdict(list) table = defaultdict(list)
for file in glob(path): for file in tqdm(glob(path), desc='processing results', total=len(glob(path))):
file = Path(file) file = Path(file)
dataset, method = file.name.replace('.pkl', '').split('__') dataset, method = file.name.replace('.pkl', '').split('__')
report = pickle.load(open(file, 'rb')) report = pickle.load(open(file, 'rb'))
results = report['results'] results = report['results']
n_samples = len(results['ae']) n_samples = len(results['ae'])
table['method'].extend([method] * n_samples) table['method'].extend([method.replace('Bayesian','Ba').replace('Bootstrap', 'Bo')] * n_samples)
table['dataset'].extend([dataset] * n_samples) table['dataset'].extend([dataset] * n_samples)
table['ae'].extend(results['ae']) table['ae'].extend(results['ae'])
table['coverage'].extend(results['coverage']) table['c-CI'].extend(results['coverage'])
table['amplitude'].extend(results['amplitude']) table['a-CI'].extend(results['amplitude'])
if 'coverage-CE' not in report:
covCE, ampCE = compute_coverage_amplitude(ConfidenceEllipseSimplex)
covCLR, ampCLR = compute_coverage_amplitude(ConfidenceEllipseCLR)
update_fields = {
'coverage-CE': covCE,
'amplitude-CE': ampCE,
'coverage-CLR': covCLR,
'amplitude-CLR': ampCLR
}
update_pickle(report, file, update_fields)
table['c-CE'].extend(report['coverage-CE'])
table['a-CE'].extend(report['amplitude-CE'])
table['c-CLR'].extend(report['coverage-CLR'])
table['a-CLR'].extend(report['amplitude-CLR'])
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', None)
df = pd.DataFrame(table) df = pd.DataFrame(table)
pv = pd.pivot_table(df, index='dataset', columns='method', values=['ae', 'coverage', 'amplitude'])
n_classes = {}
tr_size = {}
for dataset in df['dataset'].unique():
fetch_fn = {
'binary': qp.datasets.fetch_UCIBinaryDataset,
'multiclass': qp.datasets.fetch_UCIMulticlassDataset
}[setup]
data = fetch_fn(dataset)
n_classes[dataset] = data.n_classes
tr_size[dataset] = len(data.training)
# remove datasets with more than max_classes classes
max_classes = 30
for data_name, n in n_classes.items():
if n > max_classes:
df = df[df["dataset"] != data_name]
for region in ['CI', 'CE', 'CLR']:
pv = pd.pivot_table(
df, index='dataset', columns='method', values=['ae', f'c-{region}', f'a-{region}'], margins=True
)
pv['n_classes'] = pv.index.map(n_classes).astype('Int64')
pv['tr_size'] = pv.index.map(tr_size).astype('Int64')
pv = pv.drop(columns=[col for col in pv.columns if col[-1] == "All"])
print(f'{setup=}') print(f'{setup=}')
print(pv) print(pv)
print() print('-'*80)

View File

@ -0,0 +1,95 @@
import os
import warnings
from os.path import join
from pathlib import Path
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from copy import deepcopy as cp
import quapy as qp
from BayesianKDEy._bayeisan_kdey import BayesianKDEy
from BayesianKDEy.full_experiments import experiment, experiment_path, KDEyCLR
# import from the package itself, not from the `build/lib` copy generated by setuptools, which only exists
# after a local build and may be out of date with respect to the sources
from quapy.data import LabelledCollection
from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier
from quapy.method.base import BinaryQuantifier, BaseQuantifier
from quapy.model_selection import GridSearchQ
from quapy.data import Dataset
# from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot
from quapy.method.confidence import ConfidenceIntervals, BayesianCC, PQ, WithConfidenceABC, AggregativeBootstrap
from quapy.functional import strprev
from quapy.method.aggregative import KDEyML, ACC
from quapy.protocol import UPP
import quapy.functional as F
import numpy as np
from tqdm import tqdm
from scipy.stats import dirichlet
from collections import defaultdict
from time import time
from sklearn.base import clone, BaseEstimator
def method():
    """
    Returns a tuple (name, quantifier, hyperparams, bayesian/bootstrap_constructor), where:
    - name: is a str representing the name of the method (e.g., 'BayesianKDEy')
    - quantifier: is the base model (e.g., KDEyML())
    - hyperparams: is a dictionary for the quantifier (e.g., {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]})
    - bayesian/bootstrap_constructor: is a function that instantiates the bayesian or bootstrap method with the
        quantifier with optimized hyperparameters
    """
    kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]}

    def withconf_constructor(hyper):
        # the MCMC exploration takes place in the CLR space; other settings that have been considered
        # for this constructor are num_warmup=5000, num_samples=10_000, and region='ellipse'
        return BayesianKDEy(
            kernel='aitchison',
            mcmc_seed=0,
            explore_CLR=True,
            step_size=.15,
            **hyper
        )

    return 'BayKDE*CLR', KDEyCLR(LR()), kdey_hyper_clr, withconf_constructor
if __name__ == '__main__':
    # single-dataset run of the method returned by `method()`; the report is not stored on disk,
    # only the hyperparameters chosen in model selection are passed a path (`hyper_path`)
    binary = {
        'datasets': qp.datasets.UCI_BINARY_DATASETS,
        'fetch_fn': qp.datasets.fetch_UCIBinaryDataset,
        'sample_size': 500
    }
    multiclass = {
        'datasets': qp.datasets.UCI_MULTICLASS_DATASETS,
        'fetch_fn': qp.datasets.fetch_UCIMulticlassDataset,
        'sample_size': 1000
    }

    result_dir = Path('./results')

    setup = multiclass
    qp.environ['SAMPLE_SIZE'] = setup['sample_size']
    data_name = 'digits'
    print(f'dataset={data_name}')

    data = setup['fetch_fn'](data_name)
    is_binary = data.n_classes==2
    hyper_subdir = result_dir / 'hyperparams' / ('binary' if is_binary else 'multiclass')

    # the base model is unpacked into `quantifier` so that the function `method` is not rebound
    method_name, quantifier, hyper_params, withconf_constructor = method()
    hyper_path = experiment_path(hyper_subdir, data_name, quantifier.__class__.__name__)
    report = experiment(data, quantifier, method_name, hyper_params, withconf_constructor, hyper_path)

    print(f'dataset={data_name}, '
          f'method={method_name}: '
          f'mae={report["results"]["ae"].mean():.3f}, '
          f'coverage={report["results"]["coverage"].mean():.5f}, '
          f'amplitude={report["results"]["amplitude"].mean():.5f}, ')

View File

@ -583,8 +583,8 @@ def solve_adjustment(
""" """
Function that tries to solve for :math:`p` the equation :math:`q = M p`, where :math:`q` is the vector of Function that tries to solve for :math:`p` the equation :math:`q = M p`, where :math:`q` is the vector of
`unadjusted counts` (as estimated, e.g., via classify and count) with :math:`q_i` an estimate of `unadjusted counts` (as estimated, e.g., via classify and count) with :math:`q_i` an estimate of
:math:`P(\hat{Y}=y_i)`, and where :math:`M` is the matrix of `class-conditional rates` with :math:`M_{ij}` an :math:`P(\\hat{Y}=y_i)`, and where :math:`M` is the matrix of `class-conditional rates` with :math:`M_{ij}` an
estimate of :math:`P(\hat{Y}=y_i|Y=y_j)`. estimate of :math:`P(\\hat{Y}=y_i|Y=y_j)`.
:param class_conditional_rates: array of shape `(n_classes, n_classes,)` with entry `(i,j)` being the estimate :param class_conditional_rates: array of shape `(n_classes, n_classes,)` with entry `(i,j)` being the estimate
of :math:`P(\hat{Y}=y_i|Y=y_j)`, that is, the probability that an instance that belongs to class :math:`y_j` of :math:`P(\hat{Y}=y_i|Y=y_j)`, that is, the probability that an instance that belongs to class :math:`y_j`

View File

@ -33,7 +33,7 @@ class KDEBase:
@classmethod @classmethod
def _check_bandwidth(cls, bandwidth): def _check_bandwidth(cls, bandwidth, kernel):
""" """
Checks that the bandwidth parameter is correct Checks that the bandwidth parameter is correct
@ -43,8 +43,9 @@ class KDEBase:
assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \ assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values' f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
if isinstance(bandwidth, float): if isinstance(bandwidth, float):
assert 0 < bandwidth < 1, \ assert kernel!='gaussian' or (0 < bandwidth < 1), \
"the bandwidth for KDEy should be in (0,1), since this method models the unit simplex" ("the bandwidth for a Gaussian kernel in KDEy should be in (0,1), "
"since this method models the unit simplex")
return bandwidth return bandwidth
@classmethod @classmethod
@ -166,7 +167,7 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1, kernel='gaussian', def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1, kernel='gaussian',
random_state=None): random_state=None):
super().__init__(classifier, fit_classifier, val_split) super().__init__(classifier, fit_classifier, val_split)
self.bandwidth = KDEBase._check_bandwidth(bandwidth) self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel)
self.kernel = self._check_kernel(kernel) self.kernel = self._check_kernel(kernel)
self.random_state=random_state self.random_state=random_state
@ -246,7 +247,7 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase):
super().__init__(classifier, fit_classifier, val_split) super().__init__(classifier, fit_classifier, val_split)
self.divergence = divergence self.divergence = divergence
self.bandwidth = KDEBase._check_bandwidth(bandwidth) self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel='gaussian')
self.random_state=random_state self.random_state=random_state
self.montecarlo_trials = montecarlo_trials self.montecarlo_trials = montecarlo_trials
@ -333,7 +334,7 @@ class KDEyCS(AggregativeSoftQuantifier):
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1): def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1):
super().__init__(classifier, fit_classifier, val_split) super().__init__(classifier, fit_classifier, val_split)
self.bandwidth = KDEBase._check_bandwidth(bandwidth) self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel='gaussian')
def gram_matrix_mix_sum(self, X, Y=None): def gram_matrix_mix_sum(self, X, Y=None):
# this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y)) # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))

View File

@ -1,4 +1,5 @@
import numpy as np import numpy as np
from joblib import Parallel, delayed
from sklearn.base import BaseEstimator from sklearn.base import BaseEstimator
from sklearn.metrics import confusion_matrix from sklearn.metrics import confusion_matrix
@ -13,6 +14,7 @@ from abc import ABC, abstractmethod
from scipy.special import softmax, factorial from scipy.special import softmax, factorial
import copy import copy
from functools import lru_cache from functools import lru_cache
from tqdm import tqdm
""" """
This module provides implementation of different types of confidence regions, and the implementation of Bootstrap This module provides implementation of different types of confidence regions, and the implementation of Bootstrap
@ -399,7 +401,8 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
n_test_samples=500, n_test_samples=500,
confidence_level=0.95, confidence_level=0.95,
region='intervals', region='intervals',
random_state=None): random_state=None,
verbose=False):
assert isinstance(quantifier, AggregativeQuantifier), \ assert isinstance(quantifier, AggregativeQuantifier), \
f'base quantifier does not seem to be an instance of {AggregativeQuantifier.__name__}' f'base quantifier does not seem to be an instance of {AggregativeQuantifier.__name__}'
@ -416,6 +419,7 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
self.confidence_level = confidence_level self.confidence_level = confidence_level
self.region = region self.region = region
self.random_state = random_state self.random_state = random_state
self.verbose = verbose
def aggregation_fit(self, classif_predictions, labels): def aggregation_fit(self, classif_predictions, labels):
data = LabelledCollection(classif_predictions, labels, classes=self.classes_) data = LabelledCollection(classif_predictions, labels, classes=self.classes_)
@ -441,6 +445,24 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
prev_mean, self.confidence = self.aggregate_conf(classif_predictions) prev_mean, self.confidence = self.aggregate_conf(classif_predictions)
return prev_mean return prev_mean
# Sequential (single-process) bootstrap over the classifier predictions: for every quantifier in
# `self.quantifiers`, `self.n_test_samples` resamples of `classif_predictions` are drawn, each resample is
# aggregated by the quantifier, and a confidence region is built from all the resulting prevalence estimates.
# :param classif_predictions: np.ndarray with the classifier predictions; the first axis indexes the instances
# :param confidence_level: confidence level of the region; if None, `self.confidence_level` is used
# :return: a tuple `(prev_estim, conf)` with the point estimate of the region and the region itself
def aggregate_conf_sequential__(self, classif_predictions: np.ndarray, confidence_level=None):
if confidence_level is None:
confidence_level = self.confidence_level
# every resample has as many instances as the original sample
n_samples = classif_predictions.shape[0]
prevs = []
# the resampling is carried out under the seed given by `self.random_state`
with qp.util.temp_seed(self.random_state):
for quantifier in self.quantifiers:
# the progress bar is only displayed when `self.verbose` is True
for i in tqdm(range(self.n_test_samples), desc='resampling', total=self.n_test_samples, disable=not self.verbose):
sample_i = resample(classif_predictions, n_samples=n_samples)
prev_i = quantifier.aggregate(sample_i)
prevs.append(prev_i)
# the type of region (`self.region`) is resolved by WithConfidenceABC.construct_region
conf = WithConfidenceABC.construct_region(prevs, confidence_level, method=self.region)
prev_estim = conf.point_estimate()
return prev_estim, conf
def aggregate_conf(self, classif_predictions: np.ndarray, confidence_level=None): def aggregate_conf(self, classif_predictions: np.ndarray, confidence_level=None):
if confidence_level is None: if confidence_level is None:
confidence_level = self.confidence_level confidence_level = self.confidence_level
@ -449,10 +471,15 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
prevs = [] prevs = []
with qp.util.temp_seed(self.random_state): with qp.util.temp_seed(self.random_state):
for quantifier in self.quantifiers: for quantifier in self.quantifiers:
for i in range(self.n_test_samples): results = Parallel(n_jobs=-1)(
sample_i = resample(classif_predictions, n_samples=n_samples) delayed(bootstrap_once)(i, classif_predictions, quantifier, n_samples)
prev_i = quantifier.aggregate(sample_i) for i in range(self.n_test_samples)
prevs.append(prev_i) )
prevs.extend(results)
# for i in tqdm(range(self.n_test_samples), desc='resampling', total=self.n_test_samples, disable=not self.verbose):
# sample_i = resample(classif_predictions, n_samples=n_samples)
# prev_i = quantifier.aggregate(sample_i)
# prevs.append(prev_i)
conf = WithConfidenceABC.construct_region(prevs, confidence_level, method=self.region) conf = WithConfidenceABC.construct_region(prevs, confidence_level, method=self.region)
prev_estim = conf.point_estimate() prev_estim = conf.point_estimate()
@ -477,6 +504,13 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
return self.quantifier._classifier_method() return self.quantifier._classifier_method()
def bootstrap_once(i, classif_predictions, quantifier, n_samples, random_state=None):
    """
    Draws one bootstrap resample (with replacement) of the classifier predictions and aggregates it.

    :param i: int, index of the bootstrap iteration; it is only used for deriving the seed of the iteration
        when `random_state` is given
    :param classif_predictions: np.ndarray with the classifier predictions; the first axis indexes the instances
    :param quantifier: object exposing a method `aggregate(predictions)`
    :param n_samples: int, number of instances to draw
    :param random_state: non-negative int or None (default). If None, the indexes are drawn from the global
        NumPy random state. NOTE(review): when this function runs in a separate worker process, the global
        state seeded in the parent process is presumably not the one in use, so results are not guaranteed to
        be replicable; passing an int makes the resample depend only on the pair `(random_state, i)`
    :return: the output of `quantifier.aggregate` for the resample
    """
    n_instances = len(classif_predictions)
    if random_state is None:
        idx = np.random.randint(0, n_instances, n_samples)
    else:
        # one independent and reproducible generator per bootstrap iteration
        rng = np.random.default_rng([random_state, i])
        idx = rng.integers(0, n_instances, n_samples)
    return quantifier.aggregate(classif_predictions[idx])
class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC): class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
""" """
`Bayesian quantification <https://arxiv.org/abs/2302.09159>`_ method (by Albert Ziegler and Paweł Czyż), `Bayesian quantification <https://arxiv.org/abs/2302.09159>`_ method (by Albert Ziegler and Paweł Czyż),

View File

@ -410,7 +410,7 @@ def group_params(param_grid: dict):
""" """
classifier_params, quantifier_params = {}, {} classifier_params, quantifier_params = {}, {}
for key, values in param_grid.items(): for key, values in param_grid.items():
if key.startswith('classifier__') or key == 'val_split': if 'classifier__' in key or key == 'val_split':
classifier_params[key] = values classifier_params[key] = values
else: else:
quantifier_params[key] = values quantifier_params[key] = values