# QuaPy/BayesianKDEy/generate_results.py
#
# 113 lines
# 3.8 KiB
# Python

import pickle
from collections import defaultdict
from joblib import Parallel, delayed
from tqdm import tqdm
import pandas as pd
from glob import glob
from pathlib import Path
import quapy as qp
from quapy.method.confidence import ConfidenceEllipseSimplex, ConfidenceEllipseCLR
# Pandas display configuration so the printed pivot tables are shown in full
# (no column/row truncation, no line wrapping) with 4-decimal floats.
_DISPLAY_OPTIONS = (
    ('display.max_columns', None),
    ('display.width', 2000),
    ('display.max_rows', None),
    ("display.expand_frame_repr", False),
    ("display.precision", 4),
    ("display.float_format", "{:.4f}".format),
)
for _option_name, _option_value in _DISPLAY_OPTIONS:
    pd.set_option(_option_name, _option_value)
def compute_coverage_amplitude(region_constructor, report_results=None, n_jobs=3):
    """Build one confidence region per experiment and score it.

    For every pair ``(samples, true_prevs)`` taken from ``'samples'`` and
    ``'true-prevs'``, a region is created with ``region_constructor(samples)``
    and queried for ``coverage(true_prevs)`` and ``montecarlo_proportion()``.

    :param region_constructor: callable receiving the samples of one
        experiment and returning an object exposing ``coverage`` and
        ``montecarlo_proportion`` (e.g. ``ConfidenceEllipseSimplex``).
    :param report_results: mapping holding the ``'samples'`` and
        ``'true-prevs'`` sequences. When ``None`` (default), the module-level
        ``results`` variable bound by the processing loop is used, which is
        the historical behavior of this function.
    :param n_jobs: number of parallel joblib workers (default 3).
    :return: a pair of lists ``(coverage, amplitude)``, one entry per
        experiment; both are empty when there are no experiments.
    """
    if report_results is None:
        # Backward-compatible fallback: read the module-level ``results``.
        report_results = results
    all_samples = report_results['samples']
    all_true_prevs = report_results['true-prevs']

    def process_one(samples, true_prevs):
        region = region_constructor(samples)
        return region.coverage(true_prevs), region.montecarlo_proportion()

    out = Parallel(n_jobs=n_jobs)(
        delayed(process_one)(samples, true_prevs)
        for samples, true_prevs in tqdm(
            zip(all_samples, all_true_prevs),
            total=len(all_samples),
            desc='constructing ellipses'
        )
    )

    # zip(*[]) yields nothing, so the two-way unpacking below would raise
    # ValueError when there are no experiments.
    if not out:
        return [], []

    # unzip results
    coverage, amplitude = zip(*out)
    return list(coverage), list(amplitude)
def update_pickle(report, pickle_path, updated_dict: dict):
    """Merge ``updated_dict`` into ``report`` and re-serialize it to disk.

    :param report: mutable mapping to update in place; it is modified before
        being written, so the caller sees the new entries as well.
    :param pickle_path: destination file; it is overwritten.
    :param updated_dict: key/value pairs to add to (or replace in) ``report``.
    """
    for key, value in updated_dict.items():
        report[key] = value
    # Use a context manager so the handle is flushed and closed even if
    # pickling fails midway (the original left the file object unclosed).
    with open(pickle_path, 'wb') as fout:
        pickle.dump(report, fout, protocol=pickle.HIGHEST_PROTOCOL)
# Aggregate the per-(dataset, method) result pickles of each setup into one
# table and print, per confidence-region type, a dataset x method summary.
for setup in ['binary', 'multiclass']:
    path = f'./results/{setup}/*.pkl'
    # Glob once so the progress-bar total always matches the iterated files.
    result_files = glob(path)
    if not result_files:
        # An empty table has no 'dataset' column and would fail further down.
        print(f'no result files found in {path}; skipping {setup=}')
        continue

    table = defaultdict(list)
    for file in tqdm(result_files, desc='processing results', total=len(result_files)):
        file = Path(file)
        # File names are expected to look like '<dataset>__<method>.pkl'.
        dataset, method = file.stem.split('__')
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(file, 'rb') as fin:
            report = pickle.load(fin)
        # Must stay a module-level name: compute_coverage_amplitude() reads it.
        results = report['results']
        n_samples = len(results['ae'])
        table['method'].extend([method.replace('Bayesian', 'Ba').replace('Bootstrap', 'Bo')] * n_samples)
        table['dataset'].extend([dataset] * n_samples)
        table['ae'].extend(results['ae'])
        table['c-CI'].extend(results['coverage'])
        table['a-CI'].extend(results['amplitude'])

        # The ellipse-based scores are computed once and cached back into the
        # pickle, so later runs can skip the costly region construction.
        if 'coverage-CE' not in report:
            covCE, ampCE = compute_coverage_amplitude(ConfidenceEllipseSimplex)
            covCLR, ampCLR = compute_coverage_amplitude(ConfidenceEllipseCLR)

            update_fields = {
                'coverage-CE': covCE,
                'amplitude-CE': ampCE,
                'coverage-CLR': covCLR,
                'amplitude-CLR': ampCLR
            }

            update_pickle(report, file, update_fields)

        table['c-CE'].extend(report['coverage-CE'])
        table['a-CE'].extend(report['amplitude-CE'])
        table['c-CLR'].extend(report['coverage-CLR'])
        table['a-CLR'].extend(report['amplitude-CLR'])

    df = pd.DataFrame(table)

    # The dataset loader depends only on the setup, so select it once.
    fetch_fn = {
        'binary': qp.datasets.fetch_UCIBinaryDataset,
        'multiclass': qp.datasets.fetch_UCIMulticlassDataset
    }[setup]

    n_classes = {}
    tr_size = {}
    for dataset in df['dataset'].unique():
        data = fetch_fn(dataset)
        n_classes[dataset] = data.n_classes
        tr_size[dataset] = len(data.training)

    # remove datasets with more than max_classes classes
    max_classes = 30
    too_many_classes = [data_name for data_name, n in n_classes.items() if n > max_classes]
    df = df[~df["dataset"].isin(too_many_classes)]

    for region in ['CI', 'CE', 'CLR']:
        pv = pd.pivot_table(
            df, index='dataset', columns='method', values=['ae', f'c-{region}', f'a-{region}'], margins=True
        )
        pv['n_classes'] = pv.index.map(n_classes).astype('Int64')
        pv['tr_size'] = pv.index.map(tr_size).astype('Int64')
        # Drop the per-value "All" margin columns added by margins=True.
        pv = pv.drop(columns=[col for col in pv.columns if col[-1] == "All"])
        print(f'{setup=}')
        print(pv)
        print('-'*80)