generating plots for overview paper
This commit is contained in:
parent
5bbaf42a0e
commit
996bfd82f4
|
@ -2,7 +2,8 @@ import os
|
||||||
from os.path import join
|
from os.path import join
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from scripts.data import load_vector_documents
|
from LeQua2024.scripts.data import load_vector_documents
|
||||||
|
from LeQua2024.scripts.constants import SAMPLE_SIZE
|
||||||
from quapy.data.base import LabelledCollection
|
from quapy.data.base import LabelledCollection
|
||||||
import sys
|
import sys
|
||||||
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
|
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
|
||||||
|
@ -16,57 +17,22 @@ import matplotlib.pyplot as plt
|
||||||
# import seaborn as sns
|
# import seaborn as sns
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import glob
|
import glob
|
||||||
from scripts.constants import SAMPLE_SIZE
|
from commons import *
|
||||||
|
|
||||||
|
|
||||||
# os.chdir('/home/moreo/QuaPy/LeQua2024')
|
for TASK in [1,2,4]:
|
||||||
# print(os.getcwd())
|
|
||||||
|
|
||||||
TASK=2
|
|
||||||
qp.environ['SAMPLE_SIZE']=SAMPLE_SIZE[f'T{TASK}']
|
qp.environ['SAMPLE_SIZE']=SAMPLE_SIZE[f'T{TASK}']
|
||||||
|
|
||||||
true_prevs_path = f'../TruePrevalences/T{TASK}.test_prevalences/T{TASK}/public/test_prevalences.txt'
|
true_prevs_path = f'../TruePrevalences/T{TASK}.test_prevalences/T{TASK}/public/test_prevalences.txt'
|
||||||
folder = F'../Results_CODALAB_2024/extracted/TASK_{TASK}'
|
folder = F'../Results_CODALAB_2024/extracted/TASK_{TASK}'
|
||||||
|
|
||||||
def load_result_file(path):
|
|
||||||
df = pd.read_csv(path, index_col=0)
|
|
||||||
id = df.index.to_numpy()
|
|
||||||
prevs = df.values
|
|
||||||
return id, prevs
|
|
||||||
|
|
||||||
|
|
||||||
method_files = glob.glob(f"{folder}/*.csv")
|
method_files = glob.glob(f"{folder}/*.csv")
|
||||||
|
|
||||||
|
desired_order = desired_order_dict[TASK]
|
||||||
method_names_nice={
|
|
||||||
'DistMatching-y': 'DM',
|
|
||||||
'TeamGMNet': 'UniOviedo(Team1)',
|
|
||||||
'tobiaslotz': 'Lamarr'
|
|
||||||
}
|
|
||||||
|
|
||||||
exclude_methods=[
|
|
||||||
'TeamCUFE',
|
|
||||||
'hustav',
|
|
||||||
'PCC',
|
|
||||||
'CC'
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
# desired_order=[
|
|
||||||
# 'Lamarr',
|
|
||||||
# 'SLD',
|
|
||||||
# 'DM',
|
|
||||||
# 'KDEy',
|
|
||||||
# 'UniOviedo(Team1)'
|
|
||||||
# ]
|
|
||||||
# desired_order=[
|
|
||||||
# 'PCC', 'Lamarr'
|
|
||||||
# ]
|
|
||||||
|
|
||||||
# load the true values (sentiment prevalence, domain prevalence)
|
# load the true values (sentiment prevalence, domain prevalence)
|
||||||
true_id, true_prevs = load_result_file(true_prevs_path)
|
true_id, true_prevs = load_result_file(true_prevs_path)
|
||||||
|
|
||||||
|
|
||||||
# define the loss for evaluation
|
# define the loss for evaluation
|
||||||
error_name = 'RAE'
|
error_name = 'RAE'
|
||||||
error_log = False
|
error_log = False
|
||||||
|
@ -96,26 +62,52 @@ from quapy.plot import error_by_drift, binary_diagonal
|
||||||
method_names, estim_prevs = [], []
|
method_names, estim_prevs = [], []
|
||||||
for method_file in method_files:
|
for method_file in method_files:
|
||||||
method_name = Path(method_file).name.replace('.csv', '')
|
method_name = Path(method_file).name.replace('.csv', '')
|
||||||
if method_name in exclude_methods:
|
# if method_name in exclude_methods:
|
||||||
continue
|
# continue
|
||||||
id, method_prevs = load_result_file(join(folder, method_name+'.csv'))
|
id, method_prevs = load_result_file(join(folder, method_name+'.csv'))
|
||||||
assert (true_id == id).all(), f'unmatched files for {method_file}'
|
assert (true_id == id).all(), f'unmatched files for {method_file}'
|
||||||
method_name = method_names_nice.get(method_name, method_name)
|
method_name = method_names_nice.get(method_name, method_name)
|
||||||
|
if method_name not in desired_order:
|
||||||
|
print(f'method {method_name} unknown')
|
||||||
|
raise ValueError()
|
||||||
method_names.append(method_name)
|
method_names.append(method_name)
|
||||||
estim_prevs.append(method_prevs)
|
estim_prevs.append(method_prevs)
|
||||||
|
|
||||||
|
plt.rcParams['figure.figsize'] = [14, 6]
|
||||||
|
plt.rcParams['figure.dpi'] = 200
|
||||||
|
plt.rcParams['font.size'] = 15
|
||||||
|
|
||||||
true_prevs = [true_prevs]*len(method_names)
|
true_prevs = [true_prevs]*len(method_names)
|
||||||
savepath = f'./t{TASK}_diagonal.png'
|
savepath = f'./t{TASK}_diagonal.png'
|
||||||
if TASK==1:
|
if TASK in [1,4]:
|
||||||
binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=None, show_std=True, legend=True,
|
binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=None, show_std=True, legend=True,
|
||||||
train_prev=train.prevalence(), savepath=savepath, method_order=None)
|
train_prev=train.prevalence(), savepath=savepath, method_order=desired_order)
|
||||||
|
|
||||||
|
|
||||||
|
box_to_ancor={
|
||||||
|
1: (0.88,0.1),
|
||||||
|
2: (0.9,0.15),
|
||||||
|
4: (0.9, 0.15),
|
||||||
|
}
|
||||||
|
|
||||||
tr_prevs =[train.prevalence()]*len(method_names)
|
tr_prevs =[train.prevalence()]*len(method_names)
|
||||||
savepath = f'./t{TASK}_{error_name}_pps.png'
|
savepath = f'./t{TASK}_{error_name}_pps.png'
|
||||||
|
binary=TASK in [1,4]
|
||||||
|
if binary:
|
||||||
|
print(f'{TASK=} has positive prevalence = {train.prevalence()[1]}')
|
||||||
error_by_drift(method_names,
|
error_by_drift(method_names,
|
||||||
true_prevs,
|
true_prevs,
|
||||||
estim_prevs,
|
estim_prevs,
|
||||||
tr_prevs, title=None,
|
tr_prevs,
|
||||||
error_name='rae', show_std=True, n_bins=1000,
|
title=None,
|
||||||
show_density=True, vlines=[tr_prevs[0][1]], savepath=savepath)
|
y_error_name='rae',
|
||||||
|
x_error_name='bias_binary' if binary else 'ae',
|
||||||
|
x_axis_title=f'PPS between training set and test sample (in terms of bias)' if binary else None,
|
||||||
|
show_std=False,
|
||||||
|
n_bins=25,
|
||||||
|
logscale=True if binary else False,
|
||||||
|
show_density=True,
|
||||||
|
method_order=desired_order,
|
||||||
|
vlines=list(train.prevalence()) if binary else None,
|
||||||
|
bbox_to_anchor=box_to_ancor[TASK],
|
||||||
|
savepath=savepath)
|
||||||
|
|
|
@ -44,6 +44,28 @@ def acce(y_true, y_pred):
|
||||||
"""
|
"""
|
||||||
return 1. - (y_true == y_pred).mean()
|
return 1. - (y_true == y_pred).mean()
|
||||||
|
|
||||||
|
def bias_binary(prevs, prevs_hat):
|
||||||
|
"""
|
||||||
|
Computes the (positive) bias in a binary problem. The bias is simply the difference between the
|
||||||
|
predicted positive value and the true positive value, so that a positive such value indicates the
|
||||||
|
prediction has positive bias (i.e., it tends to overestimate) the true value, and negative otherwise.
|
||||||
|
:math:`bias(p,\\hat{p})=\\hat{p}_1-p_1`,
|
||||||
|
:param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values
|
||||||
|
:param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted
|
||||||
|
prevalence values
|
||||||
|
:return: binary bias
|
||||||
|
"""
|
||||||
|
assert prevs.shape[-1] == 2 and prevs.shape[-1] == 2, f'bias_binary can only be applied to binary problems'
|
||||||
|
return prevs_hat[...,1]-prevs[...,1]
|
||||||
|
|
||||||
|
def mean_bias_binary(prevs, prevs_hat):
|
||||||
|
"""
|
||||||
|
Computes the mean of the (positive) bias in a binary problem.
|
||||||
|
:param prevs: array-like of shape `(n_classes,)` with the true prevalence values
|
||||||
|
:param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values
|
||||||
|
:return: mean binary bias
|
||||||
|
"""
|
||||||
|
return np.mean(bias_binary(prevs, prevs_hat))
|
||||||
|
|
||||||
def mae(prevs, prevs_hat):
|
def mae(prevs, prevs_hat):
|
||||||
"""Computes the mean absolute error (see :meth:`quapy.error.ae`) across the sample pairs.
|
"""Computes the mean absolute error (see :meth:`quapy.error.ae`) across the sample pairs.
|
||||||
|
@ -308,8 +330,8 @@ def __check_eps(eps=None):
|
||||||
|
|
||||||
|
|
||||||
CLASSIFICATION_ERROR = {f1e, acce}
|
CLASSIFICATION_ERROR = {f1e, acce}
|
||||||
QUANTIFICATION_ERROR = {mae, mnae, mrae, mnrae, mse, mkld, mnkld}
|
QUANTIFICATION_ERROR = {mae, mnae, mrae, mnrae, mse, mkld, mnkld, mean_bias_binary}
|
||||||
QUANTIFICATION_ERROR_SINGLE = {ae, nae, rae, nrae, se, kld, nkld}
|
QUANTIFICATION_ERROR_SINGLE = {ae, nae, rae, nrae, se, kld, nkld, bias_binary}
|
||||||
QUANTIFICATION_ERROR_SMOOTH = {kld, nkld, rae, nrae, mkld, mnkld, mrae}
|
QUANTIFICATION_ERROR_SMOOTH = {kld, nkld, rae, nrae, mkld, mnkld, mrae}
|
||||||
CLASSIFICATION_ERROR_NAMES = {func.__name__ for func in CLASSIFICATION_ERROR}
|
CLASSIFICATION_ERROR_NAMES = {func.__name__ for func in CLASSIFICATION_ERROR}
|
||||||
QUANTIFICATION_ERROR_NAMES = {func.__name__ for func in QUANTIFICATION_ERROR}
|
QUANTIFICATION_ERROR_NAMES = {func.__name__ for func in QUANTIFICATION_ERROR}
|
||||||
|
|
154
quapy/plot.py
154
quapy/plot.py
|
@ -9,6 +9,7 @@ import math
|
||||||
|
|
||||||
import quapy as qp
|
import quapy as qp
|
||||||
|
|
||||||
|
|
||||||
plt.rcParams['figure.figsize'] = [10, 6]
|
plt.rcParams['figure.figsize'] = [10, 6]
|
||||||
plt.rcParams['figure.dpi'] = 200
|
plt.rcParams['figure.dpi'] = 200
|
||||||
plt.rcParams['font.size'] = 18
|
plt.rcParams['font.size'] = 18
|
||||||
|
@ -212,13 +213,19 @@ def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=N
|
||||||
|
|
||||||
|
|
||||||
def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
||||||
n_bins=20, error_name='ae', show_std=False,
|
n_bins=20,
|
||||||
|
y_error_name='ae',
|
||||||
|
x_error_name='ae',
|
||||||
|
show_std=False,
|
||||||
show_density=True,
|
show_density=True,
|
||||||
show_legend=True,
|
show_legend=True,
|
||||||
|
y_axis_title=None,
|
||||||
|
x_axis_title=None,
|
||||||
logscale=False,
|
logscale=False,
|
||||||
title=f'Quantification error as a function of distribution shift',
|
title=None,
|
||||||
vlines=None,
|
vlines=None,
|
||||||
method_order=None,
|
method_order=None,
|
||||||
|
bbox_to_anchor=(1,1),
|
||||||
savepath=None):
|
savepath=None):
|
||||||
"""
|
"""
|
||||||
Plots the error (along the x-axis, as measured in terms of `error_name`) as a function of the train-test shift
|
Plots the error (along the x-axis, as measured in terms of `error_name`) as a function of the train-test shift
|
||||||
|
@ -234,7 +241,10 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
||||||
for each experiment
|
for each experiment
|
||||||
:param tr_prevs: training prevalence of each experiment
|
:param tr_prevs: training prevalence of each experiment
|
||||||
:param n_bins: number of bins in which the y-axis is to be divided (default is 20)
|
:param n_bins: number of bins in which the y-axis is to be divided (default is 20)
|
||||||
:param error_name: a string representing the name of an error function (as defined in `quapy.error`, default is "ae")
|
:param y_error_name: a string representing the name of an error function (as defined in `quapy.error`,
|
||||||
|
default is "ae") to be used along the y-axis
|
||||||
|
:param x_error_name: a string representing the name of an error function (as defined in `quapy.error`,
|
||||||
|
default is "ae") to be used along the x-axis
|
||||||
:param show_std: whether or not to show standard deviations as color bands (default is False)
|
:param show_std: whether or not to show standard deviations as color bands (default is False)
|
||||||
:param show_density: whether or not to display the distribution of experiments for each bin (default is True)
|
:param show_density: whether or not to display the distribution of experiments for each bin (default is True)
|
||||||
:param show_density: whether or not to display the legend of the chart (default is True)
|
:param show_density: whether or not to display the legend of the chart (default is True)
|
||||||
|
@ -250,31 +260,40 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
||||||
fig, ax = plt.subplots()
|
fig, ax = plt.subplots()
|
||||||
ax.grid()
|
ax.grid()
|
||||||
|
|
||||||
x_error = qp.error.ae
|
if isinstance(x_error_name, str):
|
||||||
y_error = getattr(qp.error, error_name)
|
x_error = qp.error.from_name(x_error_name)
|
||||||
|
if isinstance(y_error_name, str):
|
||||||
|
y_error = qp.error.from_name(y_error_name)
|
||||||
|
|
||||||
# get all data as a dictionary {'m':{'x':ndarray, 'y':ndarray}} where 'm' is a method name (in the same
|
# get all data as a dictionary {'m':{'x':ndarray, 'y':ndarray}} where 'm' is a method name (in the same
|
||||||
# order as in method_order (if specified), and where 'x' are the train-test shifts (computed as according to
|
# order as in method_order (if specified), and where 'x' are the train-test shifts (computed as according to
|
||||||
# x_error function) and 'y' is the estim-test shift (computed as according to y_error)
|
# x_error function) and 'y' is the estim-test shift (computed as according to y_error)
|
||||||
data = _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order)
|
data = _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order)
|
||||||
|
|
||||||
|
min_x = np.min([np.min(m_data['x']) for m_data in data.values()])
|
||||||
|
max_x = np.max([np.max(m_data['x']) for m_data in data.values()])
|
||||||
|
min_y = np.min([np.min(m_data['y']) for m_data in data.values()])
|
||||||
|
max_y = np.max([np.max(m_data['y']) for m_data in data.values()])
|
||||||
|
print(f'[{min_x}, {max_x}]<-')
|
||||||
|
|
||||||
if method_order is None:
|
if method_order is None:
|
||||||
method_order = method_names
|
method_order = method_names
|
||||||
|
|
||||||
_set_colors(ax, n_methods=len(method_order))
|
_set_colors(ax, n_methods=len(method_order))
|
||||||
|
|
||||||
bins = np.linspace(0, 1, n_bins+1)
|
bins = np.linspace(min_x-1E-8, max_x+1E-8, n_bins+1)
|
||||||
binwidth = 1 / n_bins
|
print('bins', bins)
|
||||||
min_x, max_x, min_y, max_y = None, None, None, None
|
binwidth = (max_x-min_x) / n_bins
|
||||||
|
|
||||||
npoints = np.zeros(len(bins), dtype=float)
|
npoints = np.zeros(len(bins), dtype=float)
|
||||||
for method in method_order:
|
for method in method_order:
|
||||||
tr_test_drifts = data[method]['x']
|
tr_test_drifts = data[method]['x']
|
||||||
method_drifts = data[method]['y']
|
method_drifts = data[method]['y']
|
||||||
if logscale:
|
if logscale:
|
||||||
ax.set_yscale("log")
|
ax.set_yscale("log")
|
||||||
ax.yaxis.set_major_formatter(ScalarFormatter())
|
# ax.yaxis.set_major_formatter(ScalarFormatter())
|
||||||
ax.yaxis.get_major_formatter().set_scientific(False)
|
# ax.yaxis.get_major_formatter().set_scientific(False)
|
||||||
ax.minorticks_off()
|
# ax.minorticks_off()
|
||||||
|
|
||||||
inds = np.digitize(tr_test_drifts, bins, right=True)
|
inds = np.digitize(tr_test_drifts, bins, right=True)
|
||||||
|
|
||||||
|
@ -282,7 +301,7 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
||||||
for p,ind in enumerate(range(len(bins))):
|
for p,ind in enumerate(range(len(bins))):
|
||||||
selected = inds==ind
|
selected = inds==ind
|
||||||
if selected.sum() > 0:
|
if selected.sum() > 0:
|
||||||
xs.append(ind*binwidth-binwidth/2)
|
xs.append(min_x + (ind*binwidth-binwidth/2))
|
||||||
ys.append(np.mean(method_drifts[selected]))
|
ys.append(np.mean(method_drifts[selected]))
|
||||||
ystds.append(np.std(method_drifts[selected]))
|
ystds.append(np.std(method_drifts[selected]))
|
||||||
npoints[p] += len(method_drifts[selected])
|
npoints[p] += len(method_drifts[selected])
|
||||||
|
@ -291,13 +310,6 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
||||||
ys = np.asarray(ys)
|
ys = np.asarray(ys)
|
||||||
ystds = np.asarray(ystds)
|
ystds = np.asarray(ystds)
|
||||||
|
|
||||||
min_x_method, max_x_method, min_y_method, max_y_method = xs.min(), xs.max(), ys.min(), ys.max()
|
|
||||||
min_x = min_x_method if min_x is None or min_x_method < min_x else min_x
|
|
||||||
max_x = max_x_method if max_x is None or max_x_method > max_x else max_x
|
|
||||||
max_y = max_y_method if max_y is None or max_y_method > max_y else max_y
|
|
||||||
min_y = min_y_method if min_y is None or min_y_method < min_y else min_y
|
|
||||||
max_y = max_y_method if max_y is None or max_y_method > max_y else max_y
|
|
||||||
|
|
||||||
ax.errorbar(xs, ys, fmt='-', marker='o', color='w', markersize=8, linewidth=4, zorder=1)
|
ax.errorbar(xs, ys, fmt='-', marker='o', color='w', markersize=8, linewidth=4, zorder=1)
|
||||||
ax.errorbar(xs, ys, fmt='-', marker='o', label=method, markersize=6, linewidth=2, zorder=2)
|
ax.errorbar(xs, ys, fmt='-', marker='o', label=method, markersize=6, linewidth=2, zorder=2)
|
||||||
|
|
||||||
|
@ -307,31 +319,40 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
||||||
if show_density:
|
if show_density:
|
||||||
ax2 = ax.twinx()
|
ax2 = ax.twinx()
|
||||||
densities = npoints/np.sum(npoints)
|
densities = npoints/np.sum(npoints)
|
||||||
ax2.bar([ind * binwidth-binwidth/2 for ind in range(len(bins))],
|
ax2.bar([min_x + (ind * binwidth-binwidth/2) for ind in range(len(bins))],
|
||||||
densities, alpha=0.15, color='g', width=binwidth, label='density')
|
densities, alpha=0.15, color='g', width=binwidth, label='density')
|
||||||
ax2.set_ylim(0,max(densities))
|
ax2.set_ylim(0,max(densities))
|
||||||
ax2.spines['right'].set_color('g')
|
ax2.spines['right'].set_color('g')
|
||||||
ax2.tick_params(axis='y', colors='g')
|
ax2.tick_params(axis='y', colors='g')
|
||||||
|
|
||||||
ax.set(xlabel=f'Distribution shift between training set and test sample',
|
y_axis_err_name = y_error_name.upper()
|
||||||
ylabel=f'{error_name.upper()} (true distribution, predicted distribution)',
|
if logscale:
|
||||||
title=title)
|
y_axis_err_name = f'log({y_axis_err_name})'
|
||||||
|
if y_axis_title is None:
|
||||||
|
y_axis_title=f'{y_axis_err_name} (true distribution, predicted distribution)'
|
||||||
|
if x_axis_title is None:
|
||||||
|
x_axis_title = f'PPS between training set and test sample (in terms of {x_error_name.upper()})'
|
||||||
|
ax.set(xlabel=x_axis_title, ylabel=y_axis_title, title=title)
|
||||||
box = ax.get_position()
|
box = ax.get_position()
|
||||||
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
|
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
|
||||||
if vlines:
|
if vlines:
|
||||||
for vline in vlines:
|
for vline in vlines:
|
||||||
ax.axvline(vline, 0, 1, linestyle='--', color='k')
|
ax.axvline(vline, 0, 1, linestyle='--', color='k')
|
||||||
|
|
||||||
ax.set_xlim(min_x, max_x)
|
margin = (max_x-min_x)*0.02
|
||||||
|
ax.set_xlim(min_x-margin, max_x+margin)
|
||||||
if logscale:
|
if logscale:
|
||||||
#nice scale for the logaritmic axis
|
#nice scale for the logaritmic axis
|
||||||
ax.set_ylim(0,10 ** math.ceil(math.log10(max_y)))
|
ax.set_ylim(0,10 ** math.ceil(math.log10(max_y)))
|
||||||
|
|
||||||
|
|
||||||
if show_legend:
|
if show_legend:
|
||||||
|
# fig.legend(loc='lower center',
|
||||||
|
# bbox_to_anchor=(1, 0.5),
|
||||||
|
# ncol=(len(method_names)+1)//2)
|
||||||
fig.legend(loc='lower center',
|
fig.legend(loc='lower center',
|
||||||
bbox_to_anchor=(1, 0.5),
|
bbox_to_anchor=bbox_to_anchor,
|
||||||
ncol=(len(method_names)+1)//2)
|
ncol=1)
|
||||||
|
|
||||||
_save_or_show(savepath)
|
_save_or_show(savepath)
|
||||||
|
|
||||||
|
@ -547,6 +568,7 @@ def _save_or_show(savepath):
|
||||||
plt.savefig(savepath, bbox_inches='tight')
|
plt.savefig(savepath, bbox_inches='tight')
|
||||||
else:
|
else:
|
||||||
plt.show()
|
plt.show()
|
||||||
|
plt.close()
|
||||||
|
|
||||||
|
|
||||||
def _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order):
|
def _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order):
|
||||||
|
@ -568,3 +590,83 @@ def _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error
|
||||||
method_order.append(method)
|
method_order.append(method)
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
def bin_signal(x, y, n_bins, binning_type='isometric'):
|
||||||
|
"""
|
||||||
|
Bins the input data `x` and computes statistical summaries of `y` within each bin.
|
||||||
|
|
||||||
|
:param x: The independent variable to be binned. Must be a 1D array of numerical values.
|
||||||
|
:type x: array-like
|
||||||
|
:param y: The dependent variable corresponding to `x`. Must be the same length as `x`.
|
||||||
|
:type y: array-like
|
||||||
|
:param n_bins: The number of bins to create.
|
||||||
|
:type n_bins: int
|
||||||
|
:param binning_type: The method to use for binning:
|
||||||
|
- `'isometric'`: Creates bins with equal width (isometric binning).
|
||||||
|
- `'isotonic'`: Creates bins containing an equal number of instances (isotonic binning).
|
||||||
|
Defaults to `'isometric'`.
|
||||||
|
:type binning_type: str
|
||||||
|
|
||||||
|
:return: A tuple containing:
|
||||||
|
- `bin_means` (numpy.ndarray): The mean of `y` values in each bin (`np.nan` for empty bins).
|
||||||
|
- `bin_stds` (numpy.ndarray): The standard deviation (sample std) of `y` values in each bin (`np.nan` for empty bins).
|
||||||
|
- `bin_centers` (numpy.ndarray): The center points of each bin.
|
||||||
|
- `bin_lengths` (numpy.ndarray): The length (width) of each bin.
|
||||||
|
- `bin_counts` (numpy.ndarray): The number of elements in each bin.
|
||||||
|
:rtype: tuple (numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray)
|
||||||
|
|
||||||
|
:raises ValueError: If `binning_type` is not one of `'isometric'` or `'isotonic'`.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
- For isometric binning, bins are equally spaced along the range of `x`.
|
||||||
|
- For isotonic binning, bins are constructed to contain approximately equal numbers of elements, based on sorted `x`.
|
||||||
|
- If a bin is empty (no elements fall within its range), its mean and standard deviation will be `np.nan`.
|
||||||
|
|
||||||
|
:example:
|
||||||
|
|
||||||
|
>>> import numpy as np
|
||||||
|
>>> x = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
|
||||||
|
>>> y = np.array([10, 20, 15, 25, 30, 35, 45, 50, 40])
|
||||||
|
>>> n_bins = 3
|
||||||
|
>>> bin_signal(x, y, n_bins, binning_type='isometric')
|
||||||
|
(array([15., 30., 45.]),
|
||||||
|
array([5., 5., 5.]),
|
||||||
|
array([2.33333333, 5. , 7.66666667]),
|
||||||
|
array([2.66666667, 2.66666667, 2.66666667]),
|
||||||
|
array([3, 3, 3]))
|
||||||
|
"""
|
||||||
|
x = np.asarray(x)
|
||||||
|
y = np.asarray(y)
|
||||||
|
|
||||||
|
if binning_type == 'isometric':
|
||||||
|
# all bins are equally-sized
|
||||||
|
bin_edges = np.linspace(x.min(), x.max(), n_bins + 1)
|
||||||
|
elif binning_type == 'isotonic':
|
||||||
|
# all bins contain the same number of instances
|
||||||
|
sorted_x = np.sort(x)
|
||||||
|
bin_edges = np.interp(np.linspace(0, len(x), n_bins + 1), np.arange(len(x)), sorted_x)
|
||||||
|
else:
|
||||||
|
raise ValueError("valid binning types include 'isometric' and 'isotonic'")
|
||||||
|
|
||||||
|
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
|
||||||
|
bin_lengths = bin_edges[1:] - bin_edges[:-1]
|
||||||
|
|
||||||
|
bin_means = []
|
||||||
|
bin_stds = []
|
||||||
|
bin_counts = []
|
||||||
|
|
||||||
|
for start, end in zip(bin_edges[:-1], bin_edges[1:]):
|
||||||
|
mask = (x >= start) & (x < end) if end != bin_edges[-1] else (x >= start) & (x <= end)
|
||||||
|
count = mask.sum()
|
||||||
|
if count > 0:
|
||||||
|
y_in_bin = y[mask]
|
||||||
|
bin_means.append(np.mean(y_in_bin))
|
||||||
|
bin_stds.append(np.std(y_in_bin, ddof=1))
|
||||||
|
else:
|
||||||
|
bin_means.append(np.nan)
|
||||||
|
bin_stds.append(np.nan)
|
||||||
|
bin_counts.append(count)
|
||||||
|
|
||||||
|
return np.array(bin_means), np.array(bin_stds), np.array(bin_centers), np.array(bin_lengths), np.array(bin_counts)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue