adding exploration in CLR
This commit is contained in:
parent 881e1033f1
commit 23608f2038
@@ -0,0 +1,3 @@
+- Add other methods that natively provide uncertainty quantification?
+- Explore the neighbourhood in the CLR space instead of in the simplex!
+-
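The second note proposes exploring the neighbourhood of a prevalence vector in centred log-ratio (CLR) coordinates rather than directly on the simplex. A minimal sketch of that idea, assuming the textbook CLR definition (quapy's CLRtransformation may handle zeros and normalization differently):

# Illustrative sketch, not part of the commit: textbook centred log-ratio (CLR)
# transform and its inverse (a softmax-style closure back onto the simplex).
import numpy as np

def clr(p, eps=1e-12):
    # map a prevalence vector from the simplex to unconstrained CLR coordinates
    logp = np.log(np.clip(p, eps, None))
    return logp - logp.mean()

def clr_inverse(z):
    # closure operation: exponentiate and renormalize; always lands on the simplex
    e = np.exp(z - z.max())
    return e / e.sum()

p = np.array([0.7, 0.2, 0.1])
assert np.allclose(clr_inverse(clr(p)), p)  # the round trip recovers the prevalence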
@@ -1,7 +1,7 @@
from sklearn.base import BaseEstimator
import numpy as np
from quapy.method._kdey import KDEBase
-from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC
+from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC, CLRtransformation
from quapy.method.aggregative import AggregativeSoftQuantifier
from tqdm import tqdm
import quapy.functional as F
@@ -40,6 +40,8 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
                 mcmc_seed: int = 0,
                 confidence_level: float = 0.95,
                 region: str = 'intervals',
+                explore_CLR=False,
+                step_size=0.05,
                 verbose: bool = False):

        if num_warmup <= 0:
@@ -48,13 +50,15 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
            raise ValueError(f'parameter {num_samples=} must be a positive integer')

        super().__init__(classifier, fit_classifier, val_split)
-       self.bandwidth = KDEBase._check_bandwidth(bandwidth)
+       self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel)
        self.kernel = self._check_kernel(kernel)
        self.num_warmup = num_warmup
        self.num_samples = num_samples
        self.mcmc_seed = mcmc_seed
        self.confidence_level = confidence_level
        self.region = region
+       self.explore_CLR = explore_CLR
+       self.step_size = step_size
        self.verbose = verbose

    def aggregation_fit(self, classif_predictions, labels):
@@ -105,10 +109,19 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
        def log_prior(prev):
            return 0

-       def sample_neighbour(prev, step_size=0.05):
+       def sample_neighbour(prev, step_size):
            # random-walk Metropolis-Hastings
-           dir_noise = rng.normal(scale=step_size, size=len(prev))
-           neighbour = F.normalize_prevalence(prev + dir_noise, method='mapsimplex')
+           d = len(prev)
+           if not self.explore_CLR:
+               dir_noise = rng.normal(scale=step_size/np.sqrt(d), size=d)
+               neighbour = F.normalize_prevalence(prev + dir_noise, method='mapsimplex')
+           else:
+               clr = CLRtransformation()
+               clr_point = clr(prev)
+               dir_noise = rng.normal(scale=step_size, size=d)
+               clr_neighbour = clr_point + dir_noise
+               neighbour = clr.inverse(clr_neighbour)
+               assert in_simplex(neighbour), 'wrong CLR transformation'
            return neighbour

        n_classes = X_probs.shape[1]
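These closures (log_likelihood, log_prior, sample_neighbour) drive a random-walk Metropolis-Hastings sampler. For orientation, a stripped-down sketch of such a loop, assuming a symmetric proposal (no Jacobian correction) and using generic stand-ins for the closures defined in the method:

# Illustrative sketch, not part of the commit: a generic random-walk
# Metropolis-Hastings loop around a 'sample_neighbour'-style proposal.
import numpy as np

def random_walk_mh(log_target, sample_neighbour, init, n_iter, step_size, rng):
    current = np.asarray(init)
    current_logp = log_target(current)
    samples = []
    for _ in range(n_iter):
        candidate = sample_neighbour(current, step_size)
        candidate_logp = log_target(candidate)
        # accept with probability min(1, exp(candidate_logp - current_logp))
        if np.log(rng.uniform()) < candidate_logp - current_logp:
            current, current_logp = candidate, candidate_logp
        samples.append(current)
    return np.asarray(samples)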
@@ -116,9 +129,9 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
        current_likelihood = log_likelihood(current_prev) + log_prior(current_prev)

        # Metropolis-Hastings with adaptive rate
-       step_size = 0.05
+       step_size = self.step_size
        target_acceptance = 0.3
-       adapt_rate = 0.01
+       adapt_rate = 0.05
        acceptance_history = []

        samples = []
@@ -142,7 +155,13 @@ class BayesianKDEy(AggregativeSoftQuantifier, KDEBase, WithConfidenceABC):
            if i < self.num_warmup and i%10==0 and len(acceptance_history)>=100:
                recent_accept_rate = np.mean(acceptance_history[-100:])
                step_size *= np.exp(adapt_rate * (recent_accept_rate - target_acceptance))
+               # step_size = float(np.clip(step_size, min_step, max_step))
+               print(f'acceptance-rate={recent_accept_rate*100:.3f}%, step-size={step_size:.5f}')

        # remove "warmup" initial iterations
        samples = np.asarray(samples[self.num_warmup:])
-       return samples
+       return samples
+
+
+def in_simplex(x):
+    return np.all(x >= 0) and np.isclose(x.sum(), 1)
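During warmup the step size is rescaled multiplicatively towards the 0.3 target acceptance rate. A small numeric illustration of that update rule:

# Illustrative sketch, not part of the commit: effect of the multiplicative
# step-size adaptation applied every 10 warmup iterations above.
import numpy as np

step_size, target_acceptance, adapt_rate = 0.05, 0.3, 0.05
for recent_accept_rate in (0.10, 0.30, 0.60):
    new_step = step_size * np.exp(adapt_rate * (recent_accept_rate - target_acceptance))
    print(f'acceptance={recent_accept_rate:.2f}: step {step_size:.4f} -> {new_step:.4f}')
# below the target (0.10) the step shrinks; at the target it is unchanged; above it (0.60) it grows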
@@ -9,8 +9,9 @@ from sklearn.model_selection import GridSearchCV, StratifiedKFold
from copy import deepcopy as cp
import quapy as qp
from BayesianKDEy._bayeisan_kdey import BayesianKDEy
-from quapy.method.aggregative import DistributionMatchingY as DMy
-from quapy.method.base import BinaryQuantifier
+from build.lib.quapy.data import LabelledCollection
+from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier
+from quapy.method.base import BinaryQuantifier, BaseQuantifier
from quapy.model_selection import GridSearchQ
from quapy.data import Dataset
# from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot
@@ -24,73 +25,95 @@ from tqdm import tqdm
from scipy.stats import dirichlet
from collections import defaultdict
from time import time
-from sklearn.base import clone
+from sklearn.base import clone, BaseEstimator


# def new_classifier(training):
#     print('optimizing hyperparameters of Logistic Regression')
#     mod_sel = GridSearchCV(
#         estimator=LogisticRegression(),
#         param_grid={
#             'C': np.logspace(-4, 4, 9),
#             'class_weight': ['balanced', None]
#         },
#         cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=0),
#         n_jobs=-1,
#         refit=False,
#     )
#     mod_sel.fit(*training.Xy)
#     # optim = LogisticRegression(**mod_sel.best_params_)
#     print(f'Done: hyperparameters chosen={mod_sel.best_params_}')
#     # calib = CalibratedClassifierCV(optim, cv=10, n_jobs=-1, ensemble=False).fit(*training.Xy)
#     # return calib
#     return LogisticRegression(**mod_sel.best_params_)

class KDEyCLR(KDEyML):
    def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None):
        super().__init__(
            classifier=classifier, fit_classifier=fit_classifier, val_split=val_split, bandwidth=bandwidth,
            random_state=random_state, kernel='aitchison'
        )

-def methods():
+def methods__():
    acc_hyper = {}
    hdy_hyper = {'nbins': [3,4,5,8,16,32]}
-   kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}
+   kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2], 'classifier__C':[1]}
    wrap_hyper = lambda dic: {f'quantifier__{k}':v for k,v in dic.items()}
    # yield 'BootstrapACC', AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), wrap_hyper(acc_hyper)
    # yield 'BootstrapHDy', AggregativeBootstrap(DMy(LR(), divergence='HD'), n_test_samples=1000, random_state=0), wrap_hyper(hdy_hyper)
-   # yield 'BootstrapKDEy', AggregativeBootstrap(KDEyML(LR()), n_test_samples=1000, random_state=0), wrap_hyper(kdey_hyper)
+   yield 'BootstrapKDEy', AggregativeBootstrap(KDEyML(LR()), n_test_samples=1000, random_state=0), wrap_hyper(kdey_hyper)
    # yield 'BayesianACC', BayesianCC(LR(), mcmc_seed=0), acc_hyper
-   yield 'BayesianHDy', PQ(LR(), stan_seed=0), hdy_hyper
+   # yield 'BayesianHDy', PQ(LR(), stan_seed=0), hdy_hyper
    # yield 'BayesianKDEy', BayesianKDEy(LR(), mcmc_seed=0), kdey_hyper
-def experiment(dataset: Dataset, method: WithConfidenceABC, grid: dict):
+def methods():
    """
    Yields tuples (name, quantifier, hyperparams, bayesian/bootstrap_constructor), where:
    - name: a str with the name of the method (e.g., 'BayesianKDEy')
    - quantifier: the base model (e.g., KDEyML())
    - hyperparams: a dictionary of hyperparameter grids for the quantifier (e.g., {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]})
    - bayesian/bootstrap_constructor: a function that instantiates the bayesian or bootstrap method with the
      quantifier equipped with the optimized hyperparameters
    """
    acc_hyper = {}
    hdy_hyper = {'nbins': [3,4,5,8,16,32]}
    kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}
    kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]}

    yield 'BootstrapACC', ACC(LR()), acc_hyper, lambda hyper: AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0),
    yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0)

    yield 'BootstrapHDy', DMy(LR()), hdy_hyper, lambda hyper: AggregativeBootstrap(DMy(LR(), **hyper), n_test_samples=1000, random_state=0),

    yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True),
    yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper),
    yield 'BayesianKDEy*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, **hyper),


def model_selection(train: LabelledCollection, point_quantifier: AggregativeQuantifier, grid: dict):
    with qp.util.temp_seed(0):
        print(f'performing model selection for {point_quantifier.__class__.__name__} with grid {grid}')
        # model selection
        train, test = dataset.train_test
        train_prevalence = train.prevalence()
        if len(grid)>0:
            train, val = train.split_stratified(train_prop=0.6, random_state=0)
            mod_sel = GridSearchQ(
-               model=method,
+               model=point_quantifier,
                param_grid=grid,
                protocol=qp.protocol.UPP(val, repeats=250, random_state=0),
-               refit=True,
+               refit=False,
                n_jobs=-1,
                verbose=True
            ).fit(*train.Xy)
            optim_quantifier = mod_sel.best_model()
            best_params = mod_sel.best_params_
            best_score = mod_sel.best_score_
            tr_time = mod_sel.refit_time_
        else:
            t_init = time()
            method.fit(*train.Xy)
            tr_time = time() - t_init
            best_params, best_score = {}, -1
            optim_quantifier = method
            best_params = {}

        return best_params
def experiment(dataset: Dataset, point_quantifier: AggregativeQuantifier, method_name:str, grid: dict, withconf_constructor, hyper_choice_path: Path):
    with qp.util.temp_seed(0):

        training, test = dataset.train_test

        # model selection
        best_hyperparams = qp.util.pickled_resource(
            hyper_choice_path, model_selection, training, cp(point_quantifier), grid
        )

        t_init = time()
        withconf_quantifier = withconf_constructor(best_hyperparams).fit(*training.Xy)
        tr_time = time() - t_init

        # test
        train_prevalence = training.prevalence()
        results = defaultdict(list)
-       test_generator = UPP(test, repeats=500, random_state=0)
+       test_generator = UPP(test, repeats=100, random_state=0)
        for i, (sample_X, true_prevalence) in tqdm(enumerate(test_generator()), total=test_generator.total(), desc=f'{method_name} predictions'):
            t_init = time()
-           point_estimate, region = optim_quantifier.predict_conf(sample_X)
+           point_estimate, region = withconf_quantifier.predict_conf(sample_X)
            ttime = time()-t_init
            results['true-prevs'].append(true_prevalence)
            results['point-estim'].append(point_estimate)
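The new methods() generator decouples the point quantifier used for model selection from the confidence-aware wrapper built afterwards. A sketch of how its tuples are meant to be consumed, mirroring experiment() and model_selection() above (the dataset choice is only an example):

# Illustrative sketch, not part of the commit: consuming the tuples yielded by methods().
import quapy as qp

qp.environ['SAMPLE_SIZE'] = 1000
dataset = qp.datasets.fetch_UCIMulticlassDataset('digits')  # example dataset
training, test = dataset.train_test

for method_name, point_quantifier, grid, withconf_constructor in methods():
    best_hyper = model_selection(training, point_quantifier, grid)        # e.g., {'bandwidth': 0.05}
    withconf_quantifier = withconf_constructor(best_hyper).fit(*training.Xy)
    point_estimate, region = withconf_quantifier.predict_conf(test.X)
    print(method_name, point_estimate)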
@@ -103,9 +126,8 @@ def experiment(dataset: Dataset, method: WithConfidenceABC, grid: dict):
            results['samples'].append(region.samples)

        report = {
-           'optim_hyper': best_params,
-           'optim_score': best_score,
-           'refit_time': tr_time,
+           'optim_hyper': best_hyperparams,
+           'train_time': tr_time,
            'train-prev': train_prevalence,
            'results': {k:np.asarray(v) for k,v in results.items()}
        }
@@ -134,26 +156,30 @@ if __name__ == '__main__':

    result_dir = Path('./results')

-   for setup in [binary, multiclass]:
+   for setup in [binary, multiclass]: # [binary, multiclass]:
        qp.environ['SAMPLE_SIZE'] = setup['sample_size']
        for data_name in setup['datasets']:
            print(f'dataset={data_name}')
-           if data_name=='breast-cancer' or data_name.startswith("cmc") or data_name.startswith("ctg"):
-               print(f'skipping dataset: {data_name}')
-               continue
+           # if data_name=='breast-cancer' or data_name.startswith("cmc") or data_name.startswith("ctg"):
+           #     print(f'skipping dataset: {data_name}')
+           #     continue
            data = setup['fetch_fn'](data_name)
            is_binary = data.n_classes==2
            result_subdir = result_dir / ('binary' if is_binary else 'multiclass')
-           for method_name, method, hyper_params in methods():
+           hyper_subdir = result_dir / 'hyperparams' / ('binary' if is_binary else 'multiclass')
+           for method_name, method, hyper_params, withconf_constructor in methods():
                if isinstance(method, BinaryQuantifier) and not is_binary:
                    continue
                result_path = experiment_path(result_subdir, data_name, method_name)
-               report = qp.util.pickled_resource(result_path, experiment, data, method, hyper_params)
+               hyper_path = experiment_path(hyper_subdir, data_name, method.__class__.__name__)
+               report = qp.util.pickled_resource(
+                   result_path, experiment, data, method, method_name, hyper_params, withconf_constructor, hyper_path
+               )
                print(f'dataset={data_name}, '
                      f'method={method_name}: '
                      f'mae={report["results"]["ae"].mean():.3f}, '
-                     f'coverage={report["results"]["coverage"].mean():.3f}, '
-                     f'amplitude={report["results"]["amplitude"].mean():.3f}, ')
+                     f'coverage={report["results"]["coverage"].mean():.5f}, '
+                     f'amplitude={report["results"]["amplitude"].mean():.5f}, ')
@@ -1,31 +1,112 @@
import pickle
from collections import defaultdict

+from joblib import Parallel, delayed
+from tqdm import tqdm
import pandas as pd
from glob import glob
from pathlib import Path
+import quapy as qp
+from quapy.method.confidence import ConfidenceEllipseSimplex, ConfidenceEllipseCLR

+pd.set_option('display.max_columns', None)
+pd.set_option('display.width', 2000)
+pd.set_option('display.max_rows', None)
+pd.set_option("display.expand_frame_repr", False)
+pd.set_option("display.precision", 4)
+pd.set_option("display.float_format", "{:.4f}".format)


+def compute_coverage_amplitude(region_constructor):
+    all_samples = results['samples']
+    all_true_prevs = results['true-prevs']
+
+    def process_one(samples, true_prevs):
+        ellipse = region_constructor(samples)
+        return ellipse.coverage(true_prevs), ellipse.montecarlo_proportion()
+
+    out = Parallel(n_jobs=3)(
+        delayed(process_one)(samples, true_prevs)
+        for samples, true_prevs in tqdm(
+            zip(all_samples, all_true_prevs),
+            total=len(all_samples),
+            desc='constructing ellipses'
+        )
+    )
+
+    # unzip results
+    coverage, amplitude = zip(*out)
+    return list(coverage), list(amplitude)
+
+
+def update_pickle(report, pickle_path, updated_dict:dict):
+    for k,v in updated_dict.items():
+        report[k]=v
+    pickle.dump(report, open(pickle_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)


for setup in ['binary', 'multiclass']:
    path = f'./results/{setup}/*.pkl'
    table = defaultdict(list)
-   for file in glob(path):
+   for file in tqdm(glob(path), desc='processing results', total=len(glob(path))):
        file = Path(file)
        dataset, method = file.name.replace('.pkl', '').split('__')
        report = pickle.load(open(file, 'rb'))
        results = report['results']
        n_samples = len(results['ae'])
-       table['method'].extend([method] * n_samples)
+       table['method'].extend([method.replace('Bayesian','Ba').replace('Bootstrap', 'Bo')] * n_samples)
        table['dataset'].extend([dataset] * n_samples)
        table['ae'].extend(results['ae'])
-       table['coverage'].extend(results['coverage'])
-       table['amplitude'].extend(results['amplitude'])
+       table['c-CI'].extend(results['coverage'])
+       table['a-CI'].extend(results['amplitude'])

+       if 'coverage-CE' not in report:
+           covCE, ampCE = compute_coverage_amplitude(ConfidenceEllipseSimplex)
+           covCLR, ampCLR = compute_coverage_amplitude(ConfidenceEllipseCLR)
+
+           update_fields = {
+               'coverage-CE': covCE,
+               'amplitude-CE': ampCE,
+               'coverage-CLR': covCLR,
+               'amplitude-CLR': ampCLR
+           }
+
+           update_pickle(report, file, update_fields)
+
+       table['c-CE'].extend(report['coverage-CE'])
+       table['a-CE'].extend(report['amplitude-CE'])
+
+       table['c-CLR'].extend(report['coverage-CLR'])
+       table['a-CLR'].extend(report['amplitude-CLR'])


    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 1000)
    pd.set_option('display.max_rows', None)
    df = pd.DataFrame(table)
    pv = pd.pivot_table(df, index='dataset', columns='method', values=['ae', 'coverage', 'amplitude'])
    print(f'{setup=}')
    print(pv)
    print()

+   n_classes = {}
+   tr_size = {}
+   for dataset in df['dataset'].unique():
+       fetch_fn = {
+           'binary': qp.datasets.fetch_UCIBinaryDataset,
+           'multiclass': qp.datasets.fetch_UCIMulticlassDataset
+       }[setup]
+       data = fetch_fn(dataset)
+       n_classes[dataset] = data.n_classes
+       tr_size[dataset] = len(data.training)
+
+   # remove datasets with more than max_classes classes
+   max_classes = 30
+   for data_name, n in n_classes.items():
+       if n > max_classes:
+           df = df[df["dataset"] != data_name]
+
+   for region in ['CI', 'CE', 'CLR']:
+       pv = pd.pivot_table(
+           df, index='dataset', columns='method', values=['ae', f'c-{region}', f'a-{region}'], margins=True
+       )
+       pv['n_classes'] = pv.index.map(n_classes).astype('Int64')
+       pv['tr_size'] = pv.index.map(tr_size).astype('Int64')
+       pv = pv.drop(columns=[col for col in pv.columns if col[-1] == "All"])
+       print(f'{setup=}')
+       print(pv)
+       print('-'*80)
@@ -0,0 +1,95 @@
import os
import warnings
from os.path import join
from pathlib import Path

from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from copy import deepcopy as cp
import quapy as qp
from BayesianKDEy._bayeisan_kdey import BayesianKDEy
from BayesianKDEy.full_experiments import experiment, experiment_path, KDEyCLR
from build.lib.quapy.data import LabelledCollection
from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier
from quapy.method.base import BinaryQuantifier, BaseQuantifier
from quapy.model_selection import GridSearchQ
from quapy.data import Dataset
# from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot
from quapy.method.confidence import ConfidenceIntervals, BayesianCC, PQ, WithConfidenceABC, AggregativeBootstrap
from quapy.functional import strprev
from quapy.method.aggregative import KDEyML, ACC
from quapy.protocol import UPP
import quapy.functional as F
import numpy as np
from tqdm import tqdm
from scipy.stats import dirichlet
from collections import defaultdict
from time import time
from sklearn.base import clone, BaseEstimator


def method():
    """
    Returns a tuple (name, quantifier, hyperparams, bayesian/bootstrap_constructor), where:
    - name: a str with the name of the method (e.g., 'BayesianKDEy')
    - quantifier: the base model (e.g., KDEyML())
    - hyperparams: a dictionary of hyperparameter grids for the quantifier (e.g., {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]})
    - bayesian/bootstrap_constructor: a function that instantiates the bayesian or bootstrap method with the
      quantifier equipped with the optimized hyperparameters
    """
    acc_hyper = {}
    hdy_hyper = {'nbins': [3,4,5,8,16,32]}
    kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}
    kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]}

    wrap_hyper = lambda dic: {f'quantifier__{k}':v for k,v in dic.items()}

    # yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: AggregativeBootstrap(KDEyML(LR(), **hyper), n_test_samples=1000, random_state=0, verbose=True),
    # yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper),
    return 'BayKDE*CLR', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0,
                                                                                   explore_CLR=True,
                                                                                   step_size=.15,
                                                                                   # num_warmup = 5000,
                                                                                   # num_samples = 10_000,
                                                                                   # region='ellipse',
                                                                                   **hyper),


if __name__ == '__main__':

    binary = {
        'datasets': qp.datasets.UCI_BINARY_DATASETS,
        'fetch_fn': qp.datasets.fetch_UCIBinaryDataset,
        'sample_size': 500
    }

    multiclass = {
        'datasets': qp.datasets.UCI_MULTICLASS_DATASETS,
        'fetch_fn': qp.datasets.fetch_UCIMulticlassDataset,
        'sample_size': 1000
    }

    result_dir = Path('./results')

    setup = multiclass
    qp.environ['SAMPLE_SIZE'] = setup['sample_size']
    data_name = 'digits'
    print(f'dataset={data_name}')
    data = setup['fetch_fn'](data_name)
    is_binary = data.n_classes==2
    hyper_subdir = result_dir / 'hyperparams' / ('binary' if is_binary else 'multiclass')
    method_name, method, hyper_params, withconf_constructor = method()
    hyper_path = experiment_path(hyper_subdir, data_name, method.__class__.__name__)
    report = experiment(data, method, method_name, hyper_params, withconf_constructor, hyper_path)

    print(f'dataset={data_name}, '
          f'method={method_name}: '
          f'mae={report["results"]["ae"].mean():.3f}, '
          f'coverage={report["results"]["coverage"].mean():.5f}, '
          f'amplitude={report["results"]["amplitude"].mean():.5f}, ')
@@ -583,8 +583,8 @@ def solve_adjustment(
    """
    Function that tries to solve for :math:`p` the equation :math:`q = M p`, where :math:`q` is the vector of
    `unadjusted counts` (as estimated, e.g., via classify and count) with :math:`q_i` an estimate of
-   :math:`P(\hat{Y}=y_i)`, and where :math:`M` is the matrix of `class-conditional rates` with :math:`M_{ij}` an
-   estimate of :math:`P(\hat{Y}=y_i|Y=y_j)`.
+   :math:`P(\\hat{Y}=y_i)`, and where :math:`M` is the matrix of `class-conditional rates` with :math:`M_{ij}` an
+   estimate of :math:`P(\\hat{Y}=y_i|Y=y_j)`.

    :param class_conditional_rates: array of shape `(n_classes, n_classes,)` with entry `(i,j)` being the estimate
    of :math:`P(\hat{Y}=y_i|Y=y_j)`, that is, the probability that an instance that belongs to class :math:`y_j`
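For reference, a toy illustration of the q = Mp relation described in the docstring (this is not solve_adjustment's actual solver, which offers several, more robust strategies):

# Illustrative sketch, not part of the commit: the q = M p relation in the noiseless case.
import numpy as np

M = np.array([[0.9, 0.2],   # M[i, j] = P(hat{Y}=y_i | Y=y_j)
              [0.1, 0.8]])
p_true = np.array([0.3, 0.7])
q = M @ p_true              # unadjusted (classify-and-count style) estimate
assert np.allclose(np.linalg.solve(M, q), p_true)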
@@ -33,7 +33,7 @@ class KDEBase:

    @classmethod
-   def _check_bandwidth(cls, bandwidth):
+   def _check_bandwidth(cls, bandwidth, kernel):
        """
        Checks that the bandwidth parameter is correct

@@ -43,8 +43,9 @@ class KDEBase:
        assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
            f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
        if isinstance(bandwidth, float):
-           assert 0 < bandwidth < 1, \
-               "the bandwidth for KDEy should be in (0,1), since this method models the unit simplex"
+           assert kernel!='gaussian' or (0 < bandwidth < 1), \
+               ("the bandwidth for a Gaussian kernel in KDEy should be in (0,1), "
+                "since this method models the unit simplex")
        return bandwidth

    @classmethod
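The relaxed check only constrains float bandwidths to (0,1) for the Gaussian kernel; other kernels, such as the 'aitchison' one used by KDEyCLR, may take larger values (cf. the grid with bandwidths up to 5.0 above). For instance:

# Illustrative calls, not part of the commit, showing what the new check accepts.
from quapy.method._kdey import KDEBase

KDEBase._check_bandwidth(0.1, kernel='gaussian')    # ok: in (0,1)
KDEBase._check_bandwidth(2.0, kernel='aitchison')   # ok under the relaxed check
# KDEBase._check_bandwidth(2.0, kernel='gaussian')  # would fail the assertion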
@@ -166,7 +167,7 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
    def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1, kernel='gaussian',
                 random_state=None):
        super().__init__(classifier, fit_classifier, val_split)
-       self.bandwidth = KDEBase._check_bandwidth(bandwidth)
+       self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel)
        self.kernel = self._check_kernel(kernel)
        self.random_state=random_state

@@ -246,7 +247,7 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase):

        super().__init__(classifier, fit_classifier, val_split)
        self.divergence = divergence
-       self.bandwidth = KDEBase._check_bandwidth(bandwidth)
+       self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel='gaussian')
        self.random_state=random_state
        self.montecarlo_trials = montecarlo_trials

@@ -333,7 +334,7 @@ class KDEyCS(AggregativeSoftQuantifier):

    def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1):
        super().__init__(classifier, fit_classifier, val_split)
-       self.bandwidth = KDEBase._check_bandwidth(bandwidth)
+       self.bandwidth = KDEBase._check_bandwidth(bandwidth, kernel='gaussian')

    def gram_matrix_mix_sum(self, X, Y=None):
        # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
@@ -1,4 +1,5 @@
import numpy as np
+from joblib import Parallel, delayed
from sklearn.base import BaseEstimator
from sklearn.metrics import confusion_matrix

@@ -13,6 +14,7 @@ from abc import ABC, abstractmethod
from scipy.special import softmax, factorial
import copy
from functools import lru_cache
+from tqdm import tqdm

"""
This module provides implementation of different types of confidence regions, and the implementation of Bootstrap
@@ -399,7 +401,8 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
                 n_test_samples=500,
                 confidence_level=0.95,
                 region='intervals',
-                random_state=None):
+                random_state=None,
+                verbose=False):

        assert isinstance(quantifier, AggregativeQuantifier), \
            f'base quantifier does not seem to be an instance of {AggregativeQuantifier.__name__}'
@@ -416,6 +419,7 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
        self.confidence_level = confidence_level
        self.region = region
        self.random_state = random_state
+       self.verbose = verbose

    def aggregation_fit(self, classif_predictions, labels):
        data = LabelledCollection(classif_predictions, labels, classes=self.classes_)
@@ -441,6 +445,24 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
        prev_mean, self.confidence = self.aggregate_conf(classif_predictions)
        return prev_mean

+   def aggregate_conf_sequential__(self, classif_predictions: np.ndarray, confidence_level=None):
+       if confidence_level is None:
+           confidence_level = self.confidence_level
+
+       n_samples = classif_predictions.shape[0]
+       prevs = []
+       with qp.util.temp_seed(self.random_state):
+           for quantifier in self.quantifiers:
+               for i in tqdm(range(self.n_test_samples), desc='resampling', total=self.n_test_samples, disable=not self.verbose):
+                   sample_i = resample(classif_predictions, n_samples=n_samples)
+                   prev_i = quantifier.aggregate(sample_i)
+                   prevs.append(prev_i)
+
+       conf = WithConfidenceABC.construct_region(prevs, confidence_level, method=self.region)
+       prev_estim = conf.point_estimate()
+
+       return prev_estim, conf
+
    def aggregate_conf(self, classif_predictions: np.ndarray, confidence_level=None):
        if confidence_level is None:
            confidence_level = self.confidence_level
@@ -449,10 +471,15 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
        prevs = []
        with qp.util.temp_seed(self.random_state):
            for quantifier in self.quantifiers:
-               for i in range(self.n_test_samples):
-                   sample_i = resample(classif_predictions, n_samples=n_samples)
-                   prev_i = quantifier.aggregate(sample_i)
-                   prevs.append(prev_i)
+               results = Parallel(n_jobs=-1)(
+                   delayed(bootstrap_once)(i, classif_predictions, quantifier, n_samples)
+                   for i in range(self.n_test_samples)
+               )
+               prevs.extend(results)
+               # for i in tqdm(range(self.n_test_samples), desc='resampling', total=self.n_test_samples, disable=not self.verbose):
+               #     sample_i = resample(classif_predictions, n_samples=n_samples)
+               #     prev_i = quantifier.aggregate(sample_i)
+               #     prevs.append(prev_i)

        conf = WithConfidenceABC.construct_region(prevs, confidence_level, method=self.region)
        prev_estim = conf.point_estimate()
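Each bootstrap replicate is now delegated to a joblib worker via bootstrap_once (defined in the next hunk). A self-contained sketch of the same pattern with a stand-in aggregation function:

# Illustrative sketch, not part of the commit: the joblib resampling pattern used above.
import numpy as np
from joblib import Parallel, delayed

def bootstrap_once_sketch(classif_predictions, aggregate_fn, n_samples):
    idx = np.random.randint(0, len(classif_predictions), n_samples)
    return aggregate_fn(classif_predictions[idx])

posteriors = np.random.rand(1000, 3)
posteriors /= posteriors.sum(axis=1, keepdims=True)    # fake posterior probabilities
aggregate_fn = lambda sample: sample.mean(axis=0)      # stand-in for quantifier.aggregate
prevs = Parallel(n_jobs=-1)(
    delayed(bootstrap_once_sketch)(posteriors, aggregate_fn, n_samples=len(posteriors))
    for _ in range(100)
)
prevs = np.asarray(prevs)   # 100 bootstrap prevalence vectors, one per resample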
@@ -477,6 +504,13 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier):
        return self.quantifier._classifier_method()


+def bootstrap_once(i, classif_predictions, quantifier, n_samples):
+    idx = np.random.randint(0, len(classif_predictions), n_samples)
+    sample = classif_predictions[idx]
+    prev = quantifier.aggregate(sample)
+    return prev
+
+
class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
    """
    `Bayesian quantification <https://arxiv.org/abs/2302.09159>`_ method (by Albert Ziegler and Paweł Czyż),
@@ -410,7 +410,7 @@ def group_params(param_grid: dict):
    """
    classifier_params, quantifier_params = {}, {}
    for key, values in param_grid.items():
-       if key.startswith('classifier__') or key == 'val_split':
+       if 'classifier__' in key or key == 'val_split':
            classifier_params[key] = values
        else:
            quantifier_params[key] = values
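Replacing the prefix test with a substring test presumably lets prefixed keys such as 'quantifier__classifier__C' (the form produced by wrap_hyper in the experiment scripts) also be routed to the classifier group. A small illustration of the difference:

# Illustrative sketch, not part of the commit: which keys each test flags as
# classifier parameters (the parameter names are hypothetical examples).
param_grid = {
    'classifier__C': [0.1, 1, 10],              # caught by both tests
    'quantifier__classifier__C': [0.1, 1, 10],  # caught only by the 'in' test
    'bandwidth': [0.01, 0.1],                   # quantifier parameter
}
for key in param_grid:
    print(key, 'startswith:', key.startswith('classifier__'), ' in:', 'classifier__' in key)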