adding exploration in CLR

This commit is contained in:
Alejandro Moreo Fernandez 2025-12-04 10:24:02 +01:00
parent 881e1033f1
commit 23608f2038
9 changed files with 343 additions and 84 deletions

BayesianKDEy/TODO.txt Normal file
View File

@ -0,0 +1,3 @@
- Add other methods that natively provide uncertainty quantification?
- Explore the neighbourhood in the CLR space instead of in the simplex!
-
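
For reference, the centered log-ratio (CLR) transformation mentioned in the second item maps the simplex to an unconstrained Euclidean space, where Gaussian random-walk steps are natural. A minimal sketch of the transform and its inverse (quapy's CLRtransformation may differ in details, e.g., in how it handles zeros):

import numpy as np

def clr(p, eps=1e-12):
    # simplex -> unconstrained CLR coordinates
    logp = np.log(p + eps)
    return logp - logp.mean()

def clr_inverse(z):
    # softmax maps CLR coordinates back onto the simplex
    e = np.exp(z - z.max())
    return e / e.sum()

p = np.array([0.7, 0.2, 0.1])
assert np.allclose(clr_inverse(clr(p)), p)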

BayesianKDEy/_bayeisan_kdey.py
View File

@ -1,7 +1,7 @@
from sklearn.base import BaseEstimator
import numpy as np
from quapy.method._kdey import KDEBase
from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC
from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC, CLRtransformation
from quapy.method.aggregative import AggregativeSoftQuantifier
from tqdm import tqdm
import quapy.functional as F
@ -40,6 +40,8 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
mcmc_seed: int = 0,
confidence_level: float = 0.95,
region: str = 'intervals',
explore_CLR=False,
step_size=0.05,
verbose: bool = False):
if num_warmup <= 0:
@ -48,13 +50,15 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
raise ValueError(f'parameter {num_samples=} must be a positive integer')
super().__init__(classifier, fit_classifier, val_split)
self.bandwidth = KDEBase._check_bandwidth(bandwidth)
self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel)
self.kernel = self._check_kernel(kernel)
self.num_warmup = num_warmup
self.num_samples = num_samples
self.mcmc_seed = mcmc_seed
self.confidence_level = confidence_level
self.region = region
self.explore_CLR = explore_CLR
self.step_size = step_size
self.verbose = verbose
def aggregation_fit(self, classif_predictions, labels):
@ -105,10 +109,19 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
def log_prior(prev):
return 0
def sample_neighbour(prev, step_size=0.05):
def sample_neighbour(prev, step_size):
# random-walk Metropolis-Hastings
dir_noise = rng.normal(scale=step_size, size=len(prev))
neighbour = F.normalize_prevalence(prev + dir_noise, method='mapsimplex')
d = len(prev)
if not self.explore_CLR:
# random walk in the simplex; the 1/sqrt(d) factor keeps the expected step length independent of the number of classes
dir_noise = rng.normal(scale=step_size/np.sqrt(d), size=d)
neighbour = F.normalize_prevalence(prev + dir_noise, method='mapsimplex')
else:
# random walk in centered log-ratio (CLR) space, mapped back to the simplex
clr = CLRtransformation()
clr_point = clr(prev)
dir_noise = rng.normal(scale=step_size, size=d)
clr_neighbour = clr_point + dir_noise
neighbour = clr.inverse(clr_neighbour)
assert in_simplex(neighbour), 'CLR inverse returned a point outside the simplex'
return neighbour
n_classes = X_probs.shape[1]
@ -116,9 +129,9 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
current_likelihood = log_likelihood(current_prev) + log_prior(current_prev)
# Metropolis-Hastings with adaptive rate
step_size = 0.05
step_size = self.step_size
target_acceptance = 0.3
adapt_rate = 0.01
adapt_rate = 0.05
acceptance_history = []
samples = []
@ -142,7 +155,13 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
if i < self.num_warmup and i%10==0 and len(acceptance_history)>=100:
recent_accept_rate = np.mean(acceptance_history[-100:])
step_size *= np.exp(adapt_rate * (recent_accept_rate - target_acceptance))
# step_size = float(np.clip(step_size, min_step, max_step))
if self.verbose:
print(f'acceptance-rate={recent_accept_rate*100:.3f}%, step-size={step_size:.5f}')
# remove "warmup" initial iterations
samples = np.asarray(samples[self.num_warmup:])
return samples
def in_simplex(x):
return np.all(x >= 0) and np.isclose(x.sum(), 1)
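
Putting the hunks above together: the sampler runs random-walk Metropolis-Hastings, optionally proposing in CLR space, and adapts the step size during warmup toward a ~30% acceptance rate via step_size *= exp(adapt_rate * (recent_rate - target)). A self-contained sketch with a toy Dirichlet target standing in for the KDE likelihood (note that, like the code above, it applies no Jacobian correction for the CLR change of variables, so strictly speaking the CLR walk targets a slightly reweighted density):

import numpy as np
from scipy.stats import dirichlet

rng = np.random.default_rng(0)
target = dirichlet([8., 3., 1.])  # toy stand-in for the KDE likelihood

def clr(p):
    logp = np.log(p)
    return logp - logp.mean()

def clr_inv(z):
    e = np.exp(z - z.max())
    return e / e.sum()

step_size, target_acceptance, adapt_rate = 0.15, 0.3, 0.05
num_warmup, num_samples = 1000, 2000
current = np.array([1/3, 1/3, 1/3])
accept_hist, samples = [], []
for i in range(num_warmup + num_samples):
    proposal = clr_inv(clr(current) + rng.normal(scale=step_size, size=3))
    log_ratio = target.logpdf(proposal) - target.logpdf(current)
    accepted = np.log(rng.uniform()) < log_ratio
    if accepted:
        current = proposal
    accept_hist.append(accepted)
    samples.append(current)
    # adapt only during warmup: widen the step when accepting too often, narrow it otherwise
    if i < num_warmup and i % 10 == 0 and len(accept_hist) >= 100:
        rate = np.mean(accept_hist[-100:])
        step_size *= np.exp(adapt_rate * (rate - target_acceptance))
samples = np.asarray(samples[num_warmup:])  # drop warmup draws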

BayesianKDEy/full_experiments.py
View File

@ -9,8 +9,9 @@ from sklearn.model_selection import GridSearchCV, StratifiedKFold
from copy import deepcopy as cp
import quapy as qp
from BayesianKDEy._bayeisan_kdey import BayesianKDEy
from quapy.method.aggregative import DistributionMatchingY as DMy
from quapy.method.base import BinaryQuantifier
from quapy.data import LabelledCollection
from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier
from quapy.method.base import BinaryQuantifier, BaseQuantifier
from quapy.model_selection import GridSearchQ
from quapy.data import Dataset
# from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot
@ -24,73 +25,95 @@ from tqdm import tqdm
from scipy.stats import dirichlet
from collections import defaultdict
from time import time
from sklearn.base import clone
from sklearn.base import clone, BaseEstimator
# def new_classifier(training):
# print('optimizing hyperparameters of Logistic Regression')
# mod_sel = GridSearchCV(
# estimator=LogisticRegression(),
# param_grid={
# 'C': np.logspace(-4, 4, 9),
# 'class_weight': ['balanced', None]
# },
# cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=0),
# n_jobs=-1,
# refit=False,
# )
# mod_sel.fit(*training.Xy)
# # optim = LogisticRegression(**mod_sel.best_params_)
# print(f'Done: hyperparameters chosen={mod_sel.best_params_}')
# # calib = CalibratedClassifierCV(optim, cv=10, n_jobs=-1, ensemble=False).fit(*training.Xy)
# # return calib
# return LogisticRegression(**mod_sel.best_params_)
class KDEyCLR(KDEyML):
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None):
super().__init__(
classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth,
random_state=random_state, kernel='aitchison'
)
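
KDEyCLR defined above is just KDEyML with the kernel pinned to 'aitchison', presumably a Gaussian KDE evaluated in CLR coordinates rather than on the raw simplex; this is why the CLR bandwidth grid later spans larger values ([0.05 .. 5] vs [0.001 .. 0.2]). A rough sketch under that assumed interpretation:

import numpy as np
from sklearn.neighbors import KernelDensity

def clr(P, eps=1e-12):
    logP = np.log(P + eps)
    return logP - logP.mean(axis=1, keepdims=True)

# CLR coordinates are unbounded, so bandwidths well above 1 remain meaningful,
# unlike on the simplex where pairwise distances are bounded
posteriors = np.random.default_rng(0).dirichlet([2., 2., 2.], size=200)
kde = KernelDensity(kernel='gaussian', bandwidth=1.0).fit(clr(posteriors))
log_density = kde.score_samples(clr(posteriors[:5]))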
def methods():
def methods__():
acc_hyper = {}
hdy_hyper = {'nbins': [3,4,5,8,16,32]}
kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}
kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2], 'classifier__C':[1]}
wrap_hyper = lambda dic: {f'quantifier__{k}':v for k,v in dic.items()}
# yield 'BootstrapACC', AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), wrap_hyper(acc_hyper)
# yield 'BootstrapHDy', AggregativeBootstrap(DMy(LR(), divergence='HD'), n_test_samples=1000, random_state=0), wrap_hyper(hdy_hyper)
# yield 'BootstrapKDEy', AggregativeBootstrap(KDEyML(LR()), n_test_samples=1000, random_state=0), wrap_hyper(kdey_hyper)
yield 'BootstrapKDEy', AggregativeBootstrap(KDEyML(LR()), n_test_samples=1000, random_state=0), wrap_hyper(kdey_hyper)
# yield 'BayesianACC', BayesianCC(LR(), mcmc_seed=0), acc_hyper
yield 'BayesianHDy', PQ(LR(), stan_seed=0), hdy_hyper
# yield 'BayesianHDy', PQ(LR(), stan_seed=0), hdy_hyper
# yield 'BayesianKDEy', BayesianKDEy(LR(), mcmc_seed=0), kdey_hyper
def experiment(dataset: Dataset, method: WithConfidenceABC, grid: dict):
def methods():
"""
Returns a tuple (name, quantifier, hyperparams, bayesian/bootstrap_constructor), where:
- name: is a str representing the name of the method (e.g., 'BayesianKDEy')
- quantifier: is the base model (e.g., KDEyML())
- hyperparams: is a dictionary for the quantifier (e.g., {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]})
- bayesian/bootstrap_constructor: is a function that instantiates the bayesian o bootstrap method with the
quantifier with optimized hyperparameters
"""
acc_hyper = {}
hdy_hyper = {'nbins': [3,4,5,8,16,32]}
kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}
kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]}
yield 'BootstrapACC', ACC(LR()), acc_hyper, lambda hyper: AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0),
yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0)
yield 'BootstrapHDy', DMy(LR()), hdy_hyper, lambda hyper: AggregativeBootstrap(DMy(LR(), **hyper), n_test_samples=1000, random_state=0),
yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True),
yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper),
yield 'BayesianKDEy*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, **hyper),
def model_selection(train: LabelledCollection, point_quantifier: AggregativeQuantifier, grid: dict):
with qp.util.temp_seed(0):
print(f'performing model selection for {point_quantifier.__class__.__name__} with grid {grid}')
# model selection
train, test = dataset.train_test
train_prevalence = train.prevalence()
if len(grid)>0:
train, val = train.split_stratified(train_prop=0.6, random_state=0)
mod_sel = GridSearchQ(
model=method,
model=point_quantifier,
param_grid=grid,
protocol=qp.protocol.UPP(val, repeats=250, random_state=0),
refit=True,
refit=False,
n_jobs=-1,
verbose=True
).fit(*train.Xy)
optim_quantifier = mod_sel.best_model()
best_params = mod_sel.best_params_
best_score = mod_sel.best_score_
tr_time = mod_sel.refit_time_
else:
t_init = time()
method.fit(*train.Xy)
tr_time = time() - t_init
best_params, best_score = {}, -1
optim_quantifier = method
best_params = {}
return best_params
def experiment(dataset: Dataset, point_quantifier: AggregativeQuantifier, method_name:str, grid: dict, withconf_constructor, hyper_choice_path: Path):
with qp.util.temp_seed(0):
training, test = dataset.train_test
# model selection
best_hyperparams = qp.util.pickled_resource(
hyper_choice_path, model_selection, training, cp(point_quantifier), grid
)
t_init = time()
withconf_quantifier = withconf_constructor(best_hyperparams).fit(*training.Xy)
tr_time = time() - t_init
# test
train_prevalence = training.prevalence()
results = defaultdict(list)
test_generator = UPP(test, repeats=500, random_state=0)
test_generator = UPP(test, repeats=100, random_state=0)
for i, (sample_X, true_prevalence) in tqdm(enumerate(test_generator()), total=test_generator.total(), desc=f'{method_name} predictions'):
t_init = time()
point_estimate, region = optim_quantifier.predict_conf(sample_X)
point_estimate, region = withconf_quantifier.predict_conf(sample_X)
ttime = time()-t_init
results['true-prevs'].append(true_prevalence)
results['point-estim'].append(point_estimate)
@ -103,9 +126,8 @@ def experiment(dataset: Dataset, method: WithConfidenceABC, grid: dict):
results['samples'].append(region.samples)
report = {
'optim_hyper': best_params,
'optim_score': best_score,
'refit_time': tr_time,
'optim_hyper': best_hyperparams,
'train_time': tr_time,
'train-prev': train_prevalence,
'results': {k:np.asarray(v) for k,v in results.items()}
}
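
Both the hyperparameter choice and the full report are now cached through qp.util.pickled_resource, which computes a resource once and reloads the pickle on subsequent runs. Roughly, its assumed semantics (see quapy.util for the real implementation):

import os, pickle

def pickled_resource(pickle_path, generation_func, *args):
    # sketch of the caching behaviour relied on above
    if pickle_path is not None and os.path.exists(pickle_path):
        return pickle.load(open(pickle_path, 'rb'))
    result = generation_func(*args)
    if pickle_path is not None:
        os.makedirs(os.path.dirname(pickle_path) or '.', exist_ok=True)
        pickle.dump(result, open(pickle_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
    return result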
@ -134,26 +156,30 @@ if __name__ == '__main__':
result_dir = Path('./results')
for setup in [binary, multiclass]:
for setup in [binary, multiclass]: # [binary, multiclass]:
qp.environ['SAMPLE_SIZE'] = setup['sample_size']
for data_name in setup['datasets']:
print(f'dataset={data_name}')
if data_name=='breast-cancer' or data_name.startswith("cmc") or data_name.startswith("ctg"):
print(f'skipping dataset: {data_name}')
continue
# if data_name=='breast-cancer' or data_name.startswith("cmc") or data_name.startswith("ctg"):
# print(f'skipping dataset: {data_name}')
# continue
data = setup['fetch_fn'](data_name)
is_binary = data.n_classes==2
result_subdir = result_dir / ('binary' if is_binary else 'multiclass')
for method_name, method, hyper_params in methods():
hyper_subdir = result_dir / 'hyperparams' / ('binary' if is_binary else 'multiclass')
for method_name, method, hyper_params, withconf_constructor in methods():
if isinstance(method, BinaryQuantifier) and not is_binary:
continue
result_path = experiment_path(result_subdir, data_name, method_name)
report = qp.util.pickled_resource(result_path, experiment, data, method, hyper_params)
hyper_path = experiment_path(hyper_subdir, data_name, method.__class__.__name__)
report = qp.util.pickled_resource(
result_path, experiment, data, method, method_name, hyper_params, withconf_constructor, hyper_path
)
print(f'dataset={data_name}, '
f'method={method_name}: '
f'mae={report["results"]["ae"].mean():.3f}, '
f'coverage={report["results"]["coverage"].mean():.3f}, '
f'amplitude={report["results"]["amplitude"].mean():.3f}, ')
f'coverage={report["results"]["coverage"].mean():.5f}, '
f'amplitude={report["results"]["amplitude"].mean():.5f}, ')

View File

@ -1,31 +1,112 @@
import pickle
from collections import defaultdict
from joblib import Parallel, delayed
from tqdm import tqdm
import pandas as pd
from glob import glob
from pathlib import Path
import quapy as qp
from quapy.method.confidence import ConfidenceEllipseSimplex, ConfidenceEllipseCLR
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 2000)
pd.set_option('display.max_rows', None)
pd.set_option("display.expand_frame_repr", False)
pd.set_option("display.precision", 4)
pd.set_option("display.float_format", "{:.4f}".format)
def compute_coverage_amplitude(region_constructor):
# note: `results` is read from the script's global scope (set in the loop below)
all_samples = results['samples']
all_true_prevs = results['true-prevs']
def process_one(samples, true_prevs):
ellipse = region_constructor(samples)
return ellipse.coverage(true_prevs), ellipse.montecarlo_proportion()
out = Parallel(n_jobs=3)(
delayed(process_one)(samples, true_prevs)
for samples, true_prevs in tqdm(
zip(all_samples, all_true_prevs),
total=len(all_samples),
desc='constructing ellipses'
)
)
# unzip results
coverage, amplitude = zip(*out)
return list(coverage), list(amplitude)
def update_pickle(report, pickle_path, updated_dict:dict):
for k,v in updated_dict.items():
report[k]=v
pickle.dump(report, open(pickle_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
for setup in ['binary', 'multiclass']:
path = f'./results/{setup}/*.pkl'
table = defaultdict(list)
for file in glob(path):
for file in tqdm(glob(path), desc='processing results', total=len(glob(path))):
file = Path(file)
dataset, method = file.name.replace('.pkl', '').split('__')
report = pickle.load(open(file, 'rb'))
results = report['results']
n_samples = len(results['ae'])
table['method'].extend([method] * n_samples)
table['method'].extend([method.replace('Bayesian','Ba').replace('Bootstrap', 'Bo')] * n_samples)
table['dataset'].extend([dataset] * n_samples)
table['ae'].extend(results['ae'])
table['coverage'].extend(results['coverage'])
table['amplitude'].extend(results['amplitude'])
table['c-CI'].extend(results['coverage'])
table['a-CI'].extend(results['amplitude'])
if 'coverage-CE' not in report:
covCE, ampCE = compute_coverage_amplitude(ConfidenceEllipseSimplex)
covCLR, ampCLR = compute_coverage_amplitude(ConfidenceEllipseCLR)
update_fields = {
'coverage-CE': covCE,
'amplitude-CE': ampCE,
'coverage-CLR': covCLR,
'amplitude-CLR': ampCLR
}
update_pickle(report, file, update_fields)
table['c-CE'].extend(report['coverage-CE'])
table['a-CE'].extend(report['amplitude-CE'])
table['c-CLR'].extend(report['coverage-CLR'])
table['a-CLR'].extend(report['amplitude-CLR'])
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', None)
df = pd.DataFrame(table)
pv = pd.pivot_table(df, index='dataset', columns='method', values=['ae', 'coverage', 'amplitude'])
print(f'{setup=}')
print(pv)
print()
n_classes = {}
tr_size = {}
for dataset in df['dataset'].unique():
fetch_fn = {
'binary': qp.datasets.fetch_UCIBinaryDataset,
'multiclass': qp.datasets.fetch_UCIMulticlassDataset
}[setup]
data = fetch_fn(dataset)
n_classes[dataset] = data.n_classes
tr_size[dataset] = len(data.training)
# remove datasets with more than max_classes classes
max_classes = 30
for data_name, n in n_classes.items():
if n > max_classes:
df = df[df["dataset"] != data_name]
for region in ['CI', 'CE', 'CLR']:
pv = pd.pivot_table(
df, index='dataset', columns='method', values=['ae', f'c-{region}', f'a-{region}'], margins=True
)
pv['n_classes'] = pv.index.map(n_classes).astype('Int64')
pv['tr_size'] = pv.index.map(tr_size).astype('Int64')
pv = pv.drop(columns=[col for col in pv.columns if col[-1] == "All"])
print(f'{setup=}')
print(pv)
print('-'*80)
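
For context, coverage for an ellipse-shaped region reduces to a Mahalanobis-distance test against a chi-squared quantile, and montecarlo_proportion estimates the region's amplitude as the fraction of uniformly drawn simplex points falling inside it. A rough standalone sketch under assumed construction (quapy's ConfidenceEllipseSimplex/ConfidenceEllipseCLR may differ):

import numpy as np
from scipy.stats import chi2

class SketchEllipse:
    def __init__(self, samples, confidence_level=0.95):
        samples = np.asarray(samples)
        self.mean = samples.mean(axis=0)
        # pinv because the covariance is singular on the simplex (rank n_classes-1)
        self.precision = np.linalg.pinv(np.cov(samples, rowvar=False))
        self.radius2 = chi2.ppf(confidence_level, df=samples.shape[1] - 1)

    def coverage(self, true_prev):
        d = true_prev - self.mean
        return float(d @ self.precision @ d <= self.radius2)

    def montecarlo_proportion(self, trials=10_000, rng=None):
        # amplitude = fraction of the simplex covered by the ellipse
        rng = np.random.default_rng(0) if rng is None else rng
        points = rng.dirichlet(np.ones(len(self.mean)), size=trials)
        diffs = points - self.mean
        dists = np.einsum('ij,jk,ik->i', diffs, self.precision, diffs)
        return (dists <= self.radius2).mean()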

View File

@ -0,0 +1,95 @@
import os
import warnings
from os.path import join
from pathlib import Path
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from copy import deepcopy as cp
import quapy as qp
from BayesianKDEy._bayeisan_kdey import BayesianKDEy
from BayesianKDEy.full_experiments import experiment, experiment_path, KDEyCLR
from quapy.data import LabelledCollection
from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier
from quapy.method.base import BinaryQuantifier, BaseQuantifier
from quapy.model_selection import GridSearchQ
from quapy.data import Dataset
# from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot
from quapy.method.confidence import ConfidenceIntervals, BayesianCC, PQ, WithConfidenceABC, AggregativeBootstrap
from quapy.functional import strprev
from quapy.method.aggregative import KDEyML, ACC
from quapy.protocol import UPP
import quapy.functional as F
import numpy as np
from tqdm import tqdm
from scipy.stats import dirichlet
from collections import defaultdict
from time import time
from sklearn.base import clone, BaseEstimator
def method():
"""
Returns a tuple (name, quantifier, hyperparams, bayesian/bootstrap_constructor), where:
- name: is a str representing the name of the method (e.g., 'BayesianKDEy')
- quantifier: is the base model (e.g., KDEyML())
- hyperparams: is a dictionary for the quantifier (e.g., {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]})
- bayesian/bootstrap_constructor: is a function that instantiates the bayesian o bootstrap method with the
quantifier with optimized hyperparameters
"""
acc_hyper = {}
hdy_hyper = {'nbins': [3,4,5,8,16,32]}
kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}
kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]}
wrap_hyper = lambda dic: {f'quantifier__{k}':v for k,v in dic.items()}
# yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True),
# yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper),
return 'BayKDE*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0,
explore_CLR=True,
step_size=.15,
# num_warmup = 5000,
# num_samples = 10_000,
# region='ellipse',
**hyper),
if __name__ == '__main__':
binary = {
'datasets': qp.datasets.UCI_BINARY_DATASETS,
'fetch_fn': qp.datasets.fetch_UCIBinaryDataset,
'sample_size': 500
}
multiclass = {
'datasets': qp.datasets.UCI_MULTICLASS_DATASETS,
'fetch_fn': qp.datasets.fetch_UCIMulticlassDataset,
'sample_size': 1000
}
result_dir = Path('./results')
setup = multiclass
qp.environ['SAMPLE_SIZE'] = setup['sample_size']
data_name = 'digits'
print(f'dataset={data_name}')
data = setup['fetch_fn'](data_name)
is_binary = data.n_classes==2
hyper_subdir = result_dir / 'hyperparams' / ('binary' if is_binary else 'multiclass')
method_name, method, hyper_params, withconf_constructor = method()
hyper_path = experiment_path(hyper_subdir, data_name, method.__class__.__name__)
report = experiment(data, method, method_name, hyper_params, withconf_constructor, hyper_path)
print(f'dataset={data_name}, '
f'method={method_name}: '
f'mae={report["results"]["ae"].mean():.3f}, '
f'coverage={report["results"]["coverage"].mean():.5f}, '
f'amplitude={report["results"]["amplitude"].mean():.5f}, ')

quapy/functional.py
View File

@ -583,8 +583,8 @@ def solve_adjustment(
"""
Function that tries to solve for :math:`p` the equation :math:`q = M p`, where :math:`q` is the vector of
`unadjusted counts` (as estimated, e.g., via classify and count) with :math:`q_i` an estimate of
:math:`P(\hat{Y}=y_i)`, and where :math:`M` is the matrix of `class-conditional rates` with :math:`M_{ij}` an
estimate of :math:`P(\hat{Y}=y_i|Y=y_j)`.
:math:`P(\\hat{Y}=y_i)`, and where :math:`M` is the matrix of `class-conditional rates` with :math:`M_{ij}` an
estimate of :math:`P(\\hat{Y}=y_i|Y=y_j)`.
:param class_conditional_rates: array of shape `(n_classes, n_classes,)` with entry `(i,j)` being the estimate
of :math:`P(\hat{Y}=y_i|Y=y_j)`, that is, the probability that an instance that belongs to class :math:`y_j`

quapy/method/_kdey.py
View File

@ -33,7 +33,7 @@ class KDEBase:
@classmethod
def _check_bandwidth(cls, bandwidth):
def _check_bandwidth(cls, bandwidth, kernel):
"""
Checks that the bandwidth parameter is valid for the given kernel
@ -43,8 +43,9 @@ class KDEBase:
assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
if isinstance(bandwidth, float):
assert 0 < bandwidth < 1, \
"the bandwidth for KDEy should be in (0,1), since this method models the unit simplex"
assert kernel!='gaussian' or (0 < bandwidth < 1), \
("the bandwidth for a Gaussian kernel in KDEy should be in (0,1), "
"since this method models the unit simplex")
return bandwidth
@classmethod
@ -166,7 +167,7 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1, kernel='gaussian',
random_state=None):
super().__init__(classifier, fit_classifier, val_split)
self.bandwidth = KDEBase._check_bandwidth(bandwidth)
self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel)
self.kernel = self._check_kernel(kernel)
self.random_state=random_state
@ -246,7 +247,7 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase):
super().__init__(classifier, fit_classifier, val_split)
self.divergence = divergence
self.bandwidth = KDEBase._check_bandwidth(bandwidth)
self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel='gaussian')
self.random_state=random_state
self.montecarlo_trials = montecarlo_trials
@ -333,7 +334,7 @@ class KDEyCS(AggregativeSoftQuantifier):
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1):
super().__init__(classifier, fit_classifier, val_split)
self.bandwidth = KDEBase._check_bandwidth(bandwidth)
self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel='gaussian')
def gram_matrix_mix_sum(self, X, Y=None):
# this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
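
The relaxed check keeps the (0,1) restriction only for Gaussian kernels, which operate directly on the bounded simplex; Aitchison-kernel bandwidths live in unbounded CLR space and may legitimately exceed 1. Usage, as changed by this commit:

from quapy.method._kdey import KDEBase

KDEBase._check_bandwidth(0.1, kernel='gaussian')   # ok, as before
KDEBase._check_bandwidth(2.0, kernel='aitchison')  # ok after this change
KDEBase._check_bandwidth(2.0, kernel='gaussian')   # raises AssertionError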

quapy/method/confidence.py
View File

@ -1,4 +1,5 @@
import numpy as np
from joblib import Parallel, delayed
from sklearn.base import BaseEstimator
from sklearn.metrics import confusion_matrix
@ -13,6 +14,7 @@ from abc import ABC, abstractmethod
from scipy.special import softmax, factorial
import copy
from functools import lru_cache
from tqdm import tqdm
"""
This module provides implementation of different types of confidence regions, and the implementation of Bootstrap
@ -399,7 +401,8 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
n_test_samples=500,
confidence_level=0.95,
region='intervals',
random_state=None):
random_state=None,
verbose=False):
assert isinstance(quantifier, AggregativeQuantifier), \
f'base quantifier does not seem to be an instance of {AggregativeQuantifier.__name__}'
@ -416,6 +419,7 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
self.confidence_level = confidence_level
self.region = region
self.random_state = random_state
self.verbose = verbose
def aggregation_fit(self, classif_predictions, labels):
data = LabelledCollection(classif_predictions, labels, classes=self.classes_)
@ -441,6 +445,24 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
prev_mean, self.confidence = self.aggregate_conf(classif_predictions)
return prev_mean
def aggregate_conf_sequential__(self, classif_predictions: np.ndarray, confidence_level=None):
if confidence_level is None:
confidence_level = self.confidence_level
n_samples = classif_predictions.shape[0]
prevs = []
with qp.util.temp_seed(self.random_state):
for quantifier in self.quantifiers:
for i in tqdm(range(self.n_test_samples), desc='resampling', total=self.n_test_samples, disable=not self.verbose):
sample_i = resample(classif_predictions, n_samples=n_samples)
prev_i = quantifier.aggregate(sample_i)
prevs.append(prev_i)
conf = WithConfidenceABC.construct_region(prevs, confidence_level, method=self.region)
prev_estim = conf.point_estimate()
return prev_estim, conf
def aggregate_conf(self, classif_predictions: np.ndarray, confidence_level=None):
if confidence_level is None:
confidence_level = self.confidence_level
@ -449,10 +471,15 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
prevs = []
with qp.util.temp_seed(self.random_state):
for quantifier in self.quantifiers:
for i in range(self.n_test_samples):
sample_i = resample(classif_predictions, n_samples=n_samples)
prev_i = quantifier.aggregate(sample_i)
prevs.append(prev_i)
results = Parallel(n_jobs=-1)(
delayed(bootstrap_once)(i, classif_predictions, quantifier, n_samples)
for i in range(self.n_test_samples)
)
prevs.extend(results)
# for i in tqdm(range(self.n_test_samples), desc='resampling', total=self.n_test_samples, disable=not self.verbose):
# sample_i = resample(classif_predictions, n_samples=n_samples)
# prev_i = quantifier.aggregate(sample_i)
# prevs.append(prev_i)
conf = WithConfidenceABC.construct_region(prevs, confidence_level, method=self.region)
prev_estim = conf.point_estimate()
@ -477,6 +504,13 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
return self.quantifier._classifier_method()
def bootstrap_once(i, classif_predictions, quantifier, n_samples):
# draw a bootstrap resample (with replacement) and aggregate it; note that this uses
# the global numpy RNG, so results across joblib workers are not governed by temp_seed
idx = np.random.randint(0, len(classif_predictions), n_samples)
sample = classif_predictions[idx]
prev = quantifier.aggregate(sample)
return prev
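
Since bootstrap_once draws indices from the global numpy RNG inside each worker, results need not be reproducible under temp_seed when joblib spawns fresh processes. A minimal sketch of a deterministic alternative, passing an explicit per-task seed (bootstrap_once_seeded is a hypothetical helper, not part of this commit):

import numpy as np
from joblib import Parallel, delayed

def bootstrap_once_seeded(seed, classif_predictions, quantifier, n_samples):
    # a dedicated Generator per task makes results independent of worker scheduling
    rng = np.random.default_rng(seed)
    idx = rng.integers(0, len(classif_predictions), n_samples)
    return quantifier.aggregate(classif_predictions[idx])

# usage inside aggregate_conf would then look like:
# seeds = np.random.SeedSequence(self.random_state).generate_state(self.n_test_samples)
# prevs = Parallel(n_jobs=-1)(
#     delayed(bootstrap_once_seeded)(s, classif_predictions, quantifier, n_samples)
#     for s in seeds
# )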
class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
"""
`Bayesian quantification <https://arxiv.org/abs/2302.09159>`_ method (by Albert Ziegler and Paweł Czyż),

quapy/model_selection.py
View File

@ -410,7 +410,7 @@ def group_params(param_grid: dict):
"""
classifier_params, quantifier_params = {}, {}
for key, values in param_grid.items():
if key.startswith('classifier__') or key == 'val_split':
if 'classifier__' in key or key == 'val_split':
classifier_params[key] = values
else:
quantifier_params[key] = values
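
The substring test also catches nested keys such as 'quantifier__classifier__C' (used, e.g., in the kdey_hyper grid of the experiments above), which the startswith check missed. For illustration:

grid = {'bandwidth': [0.1], 'classifier__C': [1, 10], 'quantifier__classifier__C': [1, 10]}

# old behaviour: startswith routes only the unprefixed key to the classifier
old_keys = [k for k in grid if k.startswith('classifier__')]  # ['classifier__C']
# new behaviour: the substring test also routes the nested key
new_keys = [k for k in grid if 'classifier__' in k]           # ['classifier__C', 'quantifier__classifier__C']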