first commit
This commit is contained in:
commit
3470a130b9
|
@ -0,0 +1,180 @@
|
||||||
|
import pickle
|
||||||
|
from collections import defaultdict
|
||||||
|
from crypt import methods
|
||||||
|
import warnings
|
||||||
|
import quapy as qp
|
||||||
|
import numpy as np
|
||||||
|
from numpy.ma.core import shape
|
||||||
|
from quapy.method.aggregative import ExpectationMaximizationQuantifier, DistributionMatchingY, KDEyML
|
||||||
|
from quapy.protocol import UPP
|
||||||
|
from sklearn.calibration import CalibratedClassifierCV
|
||||||
|
from sklearn.exceptions import ConvergenceWarning
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
import quapy.functional as F
|
||||||
|
|
||||||
|
warnings.filterwarnings('ignore', category=ConvergenceWarning)
|
||||||
|
|
||||||
|
|
||||||
|
def calibration_error(y, posteriors, nbins=10, isometric=True):
    """Return the binned calibration error of a matrix of posterior probabilities.

    For two-column posteriors the error is computed once, on the second
    column (index 1), against ``y`` as given. For any other number of
    columns, the error is computed one-vs-rest for every class and the
    per-class errors are averaged.

    :param y: array of true labels; compared against column indices in the
        multiclass branch
    :param posteriors: 2D array, one row per instance and one column per class
    :param nbins: number of bins forwarded to ``_calibration_binary_error``
    :param isometric: only ``True`` is supported
    :return: the (mean) calibration error
    :raises NotImplementedError: if ``isometric`` is false
    """
    if not isometric:
        raise NotImplementedError('only isometric=True is supported at the moment')

    n_columns = posteriors.shape[1]

    # binary case: a single evaluation on the posterior of class 1 suffices
    if n_columns == 2:
        return _calibration_binary_error(y, posteriors[:, 1], nbins=nbins, isometric=isometric)

    # otherwise: one-vs-rest error for every class, then average
    per_class_errors = [
        _calibration_binary_error(
            (y == k).astype(int), posteriors[:, k], nbins=nbins, isometric=isometric
        )
        for k in range(n_columns)
    ]
    return np.mean(per_class_errors)
|
||||||
|
|
||||||
|
|
||||||
|
def _calibration_binary_error(y, class_posteriors, nbins=10, isometric=True):
|
||||||
|
if not isometric:
|
||||||
|
raise NotImplementedError('only isometric=True is supperted at the moment')
|
||||||
|
|
||||||
|
bins = np.linspace(0., 1., nbins+1)
|
||||||
|
bin_indices = np.digitize(class_posteriors, bins) - 1
|
||||||
|
unique_bins = np.unique(bin_indices)
|
||||||
|
err = 0.
|
||||||
|
for bin_idx in unique_bins:
|
||||||
|
sel = (bin_indices==bin_idx)
|
||||||
|
sel_count = sum(sel)
|
||||||
|
y_bin = y[sel]
|
||||||
|
post_bin = class_posteriors[sel]
|
||||||
|
expected_positives = np.mean(post_bin)
|
||||||
|
true_positives = np.mean(y_bin)
|
||||||
|
err += sel_count * (expected_positives-true_positives)**2
|
||||||
|
err /= len(y)
|
||||||
|
return err
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Experiment-wide constants
NBINS = 10          # number of bins used when measuring the calibration error
EPSILON = 1e-8      # additive smoothing applied to the KDE-based densities
SAMPLE_SIZE = 100   # registered in the quapy environment below
qp.environ['SAMPLE_SIZE'] = SAMPLE_SIZE

# Alternative (binary) dataset, kept for reference:
# dataset = 'spambase'  # qp.datasets.UCI_BINARY_DATASETS[6]
# data = qp.datasets.fetch_UCIBinaryDataset(dataset)
dataset = qp.datasets.UCI_MULTICLASS_DATASETS[5]
print('loading', dataset)
data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
train, test = data.train_test

print(f'test prevalence (orig): {F.strprev(test.prevalence())}')

# Accumulators filled by the evaluation loop:
# one drift value per test sample, and one list of errors per method name
drift = []
results = defaultdict(list)
|
||||||
|
|
||||||
|
# Fit every model under a fixed seed so that runs are repeatable.
with qp.util.temp_seed(0):
    # uncalibrated logistic regression
    lr = LogisticRegression()
    lr.fit(*train.Xy)

    # logistic regression wrapped in sklearn's cross-validated calibrator
    lr_kfcv = CalibratedClassifierCV(LogisticRegression())
    lr_kfcv.fit(*train.Xy)

    # EMQ on top of logistic regression
    emq = ExpectationMaximizationQuantifier(LogisticRegression())
    emq.fit(train)
    #
    # dm = DistributionMatchingY(nbins=nbins)
    # dm.fit(train)
    #
    kdey = KDEyML(LogisticRegression(), bandwidth=0.01)
    # kdey.fit(train)

    # Select the KDEy bandwidth on a held-out split. The search must be
    # fitted on `devel` only: fitting on `train` would let the model see the
    # very instances (`val`) it is being validated on.
    # NOTE(review): per the quapy docs, refit=True retrains the best
    # configuration on training+validation, i.e. on all of `train` -- confirm
    # for the installed quapy version.
    devel, val = train.split_stratified(0.6, random_state=0)
    bandwidths = np.linspace(0.001, 0.2, 40)
    kdey = qp.model_selection.GridSearchQ(
        kdey, param_grid={'bandwidth': bandwidths}, protocol=UPP(val),
        refit=True, n_jobs=-1, verbose=False
    ).fit(devel)
    print('best params', kdey.best_params_)
    kdey = kdey.best_model_
|
||||||
|
|
||||||
|
|
||||||
|
# kdes = []
|
||||||
|
# for bandwidth in np.linspace(0.01, 0.2, 5):
|
||||||
|
# kdei = KDEyML(bandwidth=bandwidth)
|
||||||
|
# kdei.fit(train)
|
||||||
|
# kdes.append(kdei)
|
||||||
|
|
||||||
|
|
||||||
|
# Evaluate the calibration error of every method on 100 samples drawn from
# the test set by quapy's UPP protocol (fixed seed, labelled samples).
prot = qp.protocol.UPP(test, repeats=100, random_state=0, return_type='labelled_collection')
for test_i in prot():
    print(f'test prevalence (shifted): {F.strprev(test_i.prevalence())}')

    # drift of this sample: absolute error between its prevalence and the training prevalence
    drift_i = qp.error.ae(test_i.prevalence(), train.prevalence())
    print(f'drift={drift_i:.4f}')
    drift.append(drift_i)

    # true labels
    y = test_i.y

    # Models that expose posteriors directly:
    #   lr      -> uncalibrated LR
    #   lr-kfcv -> calibrated LR (assuming iid)
    #   emq     -> posteriors as returned by EMQ
    for key, label, model in (
        ('lr', 'LR', lr),
        ('lr-kfcv', 'kFCV-LR', lr_kfcv),
        ('emq', 'EMQ-LR', emq),
    ):
        posteriors = model.predict_proba(test_i.X)
        err = calibration_error(y, posteriors, nbins=NBINS)
        results[key].append(err)
        print(f'{label} {err=:.5f}')

    # estim_prev = dm.quantify(test_i.X)
    # # estim_prev = np.expand_dims(estim_prev, axis=-1)
    # hist_neg, hist_pos = dm.validation_distribution
    # # because the histograms were computed wrt the posterior of the first class (the negative one!), we invert the order
    # # which is equivalent to computing the histogram wrt the positive class
    # hist_neg = hist_neg.flatten()[::-1]
    # hist_pos = hist_pos.flatten()[::-1]
    # hist_neg = hist_neg * estim_prev[0] + eps
    # hist_pos = hist_pos * estim_prev[1] + eps
    # corrected_posteriors_bins = hist_pos / (hist_neg + hist_pos)
    # # corrected_posteriors_bins = 1 - corrected_posteriors_bins # because the histograms were computed wrt the posterior of the first class (the negative one!)
    # corrected_posteriors_bins = np.concatenate(([0.], corrected_posteriors_bins, [1.]))
    # x_coords = np.concatenate(([0.], (np.linspace(0., 1., nbins+1)[:-1]+0.5/nbins), [1.]))  # this assumes binning=isometric
    # uncalibrated_posteriors_pos = dm.classifier.predict_proba(test_i.X)[:,1]
    # posteriors = np.interp(uncalibrated_posteriors_pos, x_coords, corrected_posteriors_bins)
    # posteriors = np.asarray([1-posteriors, posteriors]).T
    # err = calibration_binary_error(y, posteriors, nbins=nbins)
    # results['dm'].append(err)
    # print(f'DM-LR {err=:.5f}')

    # KDEy: rebuild posteriors as (per-class density * estimated prevalence),
    # normalised over classes. The densities must be evaluated on the
    # posteriors of KDEy's own classifier; the previous code passed
    # `posteriors`, which at this point still held the EMQ output.
    estim_prev = kdey.quantify(test_i.X)
    class_densities = kdey.mix_densities
    uncalibrated_posteriors = kdey.classifier.predict_proba(test_i.X)
    test_densities = np.asarray([
        kdey.pdf(kde_i, uncalibrated_posteriors) * prior_i
        for kde_i, prior_i in zip(class_densities, estim_prev)
    ]).T
    test_densities += EPSILON  # avoid all-zero rows before normalising
    posteriors = test_densities / test_densities.sum(axis=1, keepdims=True)
    err = calibration_error(y, posteriors, nbins=NBINS)
    results['kde'].append(err)
    print(f'KDEy-LR {err=:.5f}')

    print()
|
||||||
|
|
||||||
|
# Persist (drift, results) for the plotting script; the defaultdict is
# converted to a plain dict before pickling.
with open('./results.pkl', 'wb') as pkl_file:
    pickle.dump((drift, dict(results)), pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

# Mean calibration error of each method across all test samples
for method, errors in results.items():
    print(f'{method=} got {np.mean(errors):.5f}')

# Matrix of errors: one row per method, one column per test sample
all_methods = list(results.keys())
all_results = np.asarray([results[name] for name in all_methods])

print()
from scipy.stats import rankdata

# Rank the methods within each test sample (column-wise), then report the
# average rank of each method
ranks = np.apply_along_axis(rankdata, axis=0, arr=all_results, method='ordinal')
for method, mean_rank in zip(all_methods, ranks.mean(axis=1)):
    print(f'{method=} got rank {mean_rank:.5f}')
|
|
@ -0,0 +1,55 @@
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import numpy as np
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
# Load the (drift, results) tuple stored in ./results.pkl.
# NOTE: pickle.load executes arbitrary code; only open a file you produced yourself.
with open('./results.pkl', 'rb') as pkl_file:
    drift, results = pickle.load(pkl_file)

# x axis: one drift value per test sample; results: method name -> array of errors
x_axis = np.asarray(drift)
results = {name: np.asarray(errs) for name, errs in results.items()}
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# Create the bins (equal width over the observed drift range) and assign each x to a bin
num_bins = 5
bins = np.linspace(x_axis.min(), x_axis.max(), num_bins + 1)
# np.digitize is 1-based, so subtract 1 to index from 0. It also places
# x == bins[-1] (the maximum drift) in an extra bin past the end; clip so that
# this sample lands in the last bin instead of being dropped from the plot.
bin_labels = np.clip(np.digitize(x_axis, bins) - 1, 0, num_bins - 1)
|
||||||
|
|
||||||
|
# Create the figure
plt.figure(figsize=(10, 6))
bin_positions = np.arange(num_bins)  # base x position of each bin
offset = 0.2  # horizontal shift between methods

# Go through the methods and draw one boxplot per bin for each of them
for i, (method, y_values) in enumerate(results.items()):
    # Group the y values by bin
    binned_data = [y_values[bin_labels == bin_idx] for bin_idx in range(num_bins)]

    colour = plt.cm.Set1(i / len(results))

    # Draw the boxplots shifted along the x axis so that methods sit side by side.
    # The per-bin arrays are passed as a list: bins hold different numbers of
    # samples, and padding them into a rectangular table introduces NaNs,
    # which boxplot does not discard (the affected boxes come out empty).
    positions = bin_positions + i * offset
    plt.boxplot(
        binned_data,
        positions=positions,
        widths=0.15,
        patch_artist=True,
        showfliers=False,
        boxprops=dict(facecolor=colour, alpha=0.7),
        medianprops=dict(color="black"),
    )

    # Add the method to the legend through an empty line of the same colour
    plt.plot([], [], label=method, color=colour)
|
||||||
|
|
||||||
|
# Configure the plot: one tick label per bin, then title, axis labels and legend
tick_labels = [f"Bin {k + 1}" for k in range(num_bins)]
plt.xticks(ticks=bin_positions, labels=tick_labels)
plt.title("Boxplot by Bins for Different Methods")
plt.xlabel("Bins")
plt.ylabel("Values")
plt.legend(title="Methods")

# Tighten the layout and display the figure
plt.tight_layout()
plt.show()
|
Loading…
Reference in New Issue