187 lines
8.4 KiB
Python
187 lines
8.4 KiB
Python
import os
|
|
import warnings
|
|
from os.path import join
|
|
from pathlib import Path
|
|
|
|
from sklearn.calibration import CalibratedClassifierCV
|
|
from sklearn.linear_model import LogisticRegression as LR
|
|
from sklearn.model_selection import GridSearchCV, StratifiedKFold
|
|
from copy import deepcopy as cp
|
|
import quapy as qp
|
|
from BayesianKDEy._bayeisan_kdey import BayesianKDEy
|
|
from build.lib.quapy.data import LabelledCollection
|
|
from quapy.method.aggregative import DistributionMatchingY as DMy, AggregativeQuantifier
|
|
from quapy.method.base import BinaryQuantifier, BaseQuantifier
|
|
from quapy.model_selection import GridSearchQ
|
|
from quapy.data import Dataset
|
|
# from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot
|
|
from quapy.method.confidence import ConfidenceIntervals, BayesianCC, PQ, WithConfidenceABC, AggregativeBootstrap
|
|
from quapy.functional import strprev
|
|
from quapy.method.aggregative import KDEyML, ACC
|
|
from quapy.protocol import UPP
|
|
import quapy.functional as F
|
|
import numpy as np
|
|
from tqdm import tqdm
|
|
from scipy.stats import dirichlet
|
|
from collections import defaultdict
|
|
from time import time
|
|
from sklearn.base import clone, BaseEstimator
|
|
|
|
|
|
class KDEyCLR(KDEyML):
    """KDEyML variant whose kernel is always ``'aitchison'``.

    The constructor takes the parameters listed in its signature and forwards
    them unchanged to ``KDEyML``; only the ``kernel`` argument is fixed here.
    """

    def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=1., random_state=None):
        # Collect the pass-through arguments first so that the only thing this
        # subclass adds (the fixed kernel) stands out in the call below.
        forwarded = dict(
            classifier=classifier,
            fit_classifier=fit_classifier,
            val_split=val_split,
            bandwidth=bandwidth,
            random_state=random_state,
        )
        super().__init__(kernel='aitchison', **forwarded)
|
|
|
|
def methods__():
    """Yield ``(name, method, hyperparameter grid)`` triples.

    Only the ``'BootstrapKDEy'`` configuration is currently enabled; the other
    configurations are kept below as commented-out alternatives. Grids passed
    through ``wrap_hyper`` have every key prefixed with ``quantifier__``.
    """
    def wrap_hyper(grid):
        # Prefix each hyperparameter name with 'quantifier__'.
        return {f'quantifier__{name}': values for name, values in grid.items()}

    # acc_hyper and hdy_hyper are only referenced by the disabled entries below.
    acc_hyper = {}
    hdy_hyper = {'nbins': [3, 4, 5, 8, 16, 32]}
    kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2], 'classifier__C': [1]}

    # yield 'BootstrapACC', AggregativeBootstrap(ACC(LR()), n_test_samples=1000, random_state=0), wrap_hyper(acc_hyper)
    # yield 'BootstrapHDy', AggregativeBootstrap(DMy(LR(), divergence='HD'), n_test_samples=1000, random_state=0), wrap_hyper(hdy_hyper)
    bootstrap_kdey = AggregativeBootstrap(KDEyML(LR()), n_test_samples=1000, random_state=0)
    yield 'BootstrapKDEy', bootstrap_kdey, wrap_hyper(kdey_hyper)
    # yield 'BayesianACC', BayesianCC(LR(), mcmc_seed=0), acc_hyper
    # yield 'BayesianHDy', PQ(LR(), stan_seed=0), hdy_hyper
    # yield 'BayesianKDEy', BayesianKDEy(LR(), mcmc_seed=0), kdey_hyper
|
|
|
|
|
|
def methods():
    """
    Generate the experimental configurations, one 4-tuple per method:
    (name, quantifier, hyperparams, constructor), where:
    - name: is a str with the name of the method (e.g., 'BayesianKDEy')
    - quantifier: is the base model (e.g., KDEyML()) on which model selection is run
    - hyperparams: is the grid (a dict) explored for the quantifier
        (e.g., {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}); may be empty
    - constructor: is a one-argument function that receives the dictionary of
        selected hyperparameters and instantiates the Bayesian or bootstrap method
    """
    def bootstrap(quantifier, **extra):
        # Settings shared by every bootstrap-based method.
        return AggregativeBootstrap(quantifier, n_test_samples=1000, random_state=0, **extra)

    # hyperparameter grids
    acc_hyper = {}
    hdy_hyper = {'nbins': [3, 4, 5, 8, 16, 32]}
    kdey_hyper = {'bandwidth': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}
    kdey_hyper_clr = {'bandwidth': [0.05, 0.1, 0.5, 1., 2., 5.]}

    # ACC-based methods (empty grid, so the constructors do not use `hyper`)
    yield 'BootstrapACC', ACC(LR()), acc_hyper, lambda hyper: bootstrap(ACC(LR()))
    yield 'BayesianACC', ACC(LR()), acc_hyper, lambda hyper: BayesianCC(LR(), mcmc_seed=0)

    # HDy-based methods
    yield 'BootstrapHDy', DMy(LR()), hdy_hyper, lambda hyper: bootstrap(DMy(LR(), **hyper))

    # KDEy-based methods
    yield 'BootstrapKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: bootstrap(KDEyML(LR(), **hyper), verbose=True)
    yield 'BayesianKDEy', KDEyML(LR()), kdey_hyper, lambda hyper: BayesianKDEy(mcmc_seed=0, **hyper)
    yield 'BayesianKDEy*', KDEyCLR(LR()), kdey_hyper_clr, lambda hyper: BayesianKDEy(kernel='aitchison', mcmc_seed=0, **hyper)
|
|
|
|
|
|
def model_selection(train: LabelledCollection, point_quantifier: AggregativeQuantifier, grid: dict):
    """Select hyperparameters for ``point_quantifier`` via ``GridSearchQ``.

    The whole procedure runs inside ``qp.util.temp_seed(0)``. When ``grid`` is
    empty no search is performed. Otherwise ``train`` is split with
    ``split_stratified(train_prop=0.6, random_state=0)``: the first part is
    used to fit the candidates and the second part feeds a ``UPP`` protocol
    (250 repeats) on which they are evaluated.

    :param train: the labelled training collection
    :param point_quantifier: the quantifier whose hyperparameters are explored
    :param grid: dictionary mapping hyperparameter names to candidate values
    :return: ``best_params_`` of the grid search, or ``{}`` if ``grid`` is empty
    """
    with qp.util.temp_seed(0):
        print(f'performing model selection for {point_quantifier.__class__.__name__} with grid {grid}')

        # nothing to explore
        if len(grid) == 0:
            return {}

        fit_part, val_part = train.split_stratified(train_prop=0.6, random_state=0)
        validation_protocol = qp.protocol.UPP(val_part, repeats=250, random_state=0)
        search = GridSearchQ(
            model=point_quantifier,
            param_grid=grid,
            protocol=validation_protocol,
            refit=False,
            n_jobs=-1,
            verbose=True
        ).fit(*fit_part.Xy)

        return search.best_params_
|
|
|
|
|
|
def experiment(dataset: Dataset, point_quantifier: AggregativeQuantifier, method_name:str, grid: dict, withconf_constructor, hyper_choice_path: Path):
    """Run one (dataset, method) experiment and return its report.

    Steps, all executed inside ``qp.util.temp_seed(0)``:
    1. obtain the hyperparameters through ``qp.util.pickled_resource``, which is
       given ``hyper_choice_path`` and ``model_selection`` (called on the
       training set with a deep copy of ``point_quantifier`` and ``grid``);
    2. build the method with ``withconf_constructor(best_hyperparams)`` and fit
       it on the training set, timing the fit;
    3. evaluate it on the samples generated by ``UPP(test, repeats=100, random_state=0)``.

    :param dataset: dataset providing ``train_test``
    :param point_quantifier: base quantifier used for model selection
    :param method_name: name shown in the progress bar
    :param grid: hyperparameter grid handed to ``model_selection``
    :param withconf_constructor: function receiving the chosen hyperparameters
        and returning the (unfitted) method exposing ``predict_conf``
    :param hyper_choice_path: path handed to ``qp.util.pickled_resource`` for
        the hyperparameter choice
    :return: dict with keys 'optim_hyper', 'train_time', 'train-prev' and
        'results' (per-sample lists converted with ``np.asarray``)
    """
    with qp.util.temp_seed(0):

        training, test = dataset.train_test

        # model selection
        best_hyperparams = qp.util.pickled_resource(
            hyper_choice_path, model_selection, training, cp(point_quantifier), grid
        )

        # training (timed)
        fit_start = time()
        withconf_quantifier = withconf_constructor(best_hyperparams).fit(*training.Xy)
        tr_time = time() - fit_start

        # test
        train_prevalence = training.prevalence()
        results = defaultdict(list)
        test_generator = UPP(test, repeats=100, random_state=0)
        progress = tqdm(
            enumerate(test_generator()),
            total=test_generator.total(),
            desc=f'{method_name} predictions'
        )
        for _, (sample_X, true_prevalence) in progress:
            predict_start = time()
            point_estimate, region = withconf_quantifier.predict_conf(sample_X)
            ttime = time() - predict_start

            # The values are evaluated top to bottom, so the calls happen in
            # the same order as the keys are listed.
            sample_record = {
                'true-prevs': true_prevalence,
                'point-estim': point_estimate,
                'shift': qp.error.ae(true_prevalence, train_prevalence),
                'ae': qp.error.ae(prevs_true=true_prevalence, prevs_hat=point_estimate),
                'rae': qp.error.rae(prevs_true=true_prevalence, prevs_hat=point_estimate),
                'coverage': region.coverage(true_prevalence),
                'amplitude': region.montecarlo_proportion(n_trials=50_000),
                'test-time': ttime,
                'samples': region.samples,
            }
            for key, value in sample_record.items():
                results[key].append(value)

        report = {
            'optim_hyper': best_hyperparams,
            'train_time': tr_time,
            'train-prev': train_prevalence,
            'results': {key: np.asarray(values) for key, values in results.items()}
        }

    return report
|
|
|
|
|
|
def experiment_path(dir: "Path | str", dataset_name: str, method_name: str) -> Path:
    """Return the pickle path for a (dataset, method) pair, creating ``dir``.

    As a side effect, ``dir`` (and any missing parent directory) is created if
    it does not exist yet.

    :param dir: directory in which the file lives; a ``Path`` or a plain string
    :param dataset_name: name of the dataset
    :param method_name: name of the method
    :return: ``dir / '<dataset_name>__<method_name>.pkl'`` as a ``Path``
    """
    # Coerce to Path so that a plain string works too; previously a str made
    # the directory and then failed with TypeError on the `/` operator.
    base = Path(dir)
    os.makedirs(base, exist_ok=True)
    return base / f'{dataset_name}__{method_name}.pkl'
|
|
|
|
|
|
if __name__ == '__main__':

    # Each setup gives the dataset names, the function that fetches a dataset
    # by name, and the value assigned to qp.environ['SAMPLE_SIZE'].
    binary = {
        'datasets': qp.datasets.UCI_BINARY_DATASETS,
        'fetch_fn': qp.datasets.fetch_UCIBinaryDataset,
        'sample_size': 500
    }

    multiclass = {
        'datasets': qp.datasets.UCI_MULTICLASS_DATASETS,
        'fetch_fn': qp.datasets.fetch_UCIMulticlassDataset,
        'sample_size': 1000
    }

    result_dir = Path('./results')

    for setup in [binary, multiclass]:
        qp.environ['SAMPLE_SIZE'] = setup['sample_size']
        for data_name in setup['datasets']:
            print(f'dataset={data_name}')
            # if data_name=='breast-cancer' or data_name.startswith("cmc") or data_name.startswith("ctg"):
            #     print(f'skipping dataset: {data_name}')
            #     continue
            data = setup['fetch_fn'](data_name)
            is_binary = data.n_classes==2

            # results and hyperparameters are stored per setting
            setting = 'binary' if is_binary else 'multiclass'
            result_subdir = result_dir / setting
            hyper_subdir = result_dir / 'hyperparams' / setting

            for method_name, method, hyper_params, withconf_constructor in methods():
                # binary-only quantifiers are skipped on non-binary datasets
                if isinstance(method, BinaryQuantifier) and not is_binary:
                    continue

                result_path = experiment_path(result_subdir, data_name, method_name)
                # the hyperparameter file is keyed by the base quantifier's class name
                hyper_path = experiment_path(hyper_subdir, data_name, method.__class__.__name__)

                report = qp.util.pickled_resource(
                    result_path, experiment, data, method, method_name, hyper_params, withconf_constructor, hyper_path
                )

                print(f'dataset={data_name}, '
                      f'method={method_name}: '
                      f'mae={report["results"]["ae"].mean():.3f}, '
                      f'coverage={report["results"]["coverage"].mean():.5f}, '
                      f'amplitude={report["results"]["amplitude"].mean():.5f}, ')
|
|
|
|
|
|
|
|
|