first commit
This commit is contained in:
commit
3470a130b9
|
@ -0,0 +1,180 @@
|
|||
import pickle
|
||||
from collections import defaultdict
|
||||
from crypt import methods
|
||||
import warnings
|
||||
import quapy as qp
|
||||
import numpy as np
|
||||
from numpy.ma.core import shape
|
||||
from quapy.method.aggregative import ExpectationMaximizationQuantifier, DistributionMatchingY, KDEyML
|
||||
from quapy.protocol import UPP
|
||||
from sklearn.calibration import CalibratedClassifierCV
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
import quapy.functional as F
|
||||
|
||||
warnings.filterwarnings('ignore', category=ConvergenceWarning)
|
||||
|
||||
|
||||
def calibration_error(y, posteriors, nbins=10, isometric=True):
|
||||
if not isometric:
|
||||
raise NotImplementedError('only isometric=True is supported at the moment')
|
||||
|
||||
nclasses = posteriors.shape[1]
|
||||
if nclasses==2:
|
||||
mean_error = _calibration_binary_error(y, posteriors[:, 1], nbins=nbins, isometric=isometric)
|
||||
else:
|
||||
errors = []
|
||||
for class_i in range(nclasses):
|
||||
binary_label = (y==class_i).astype(int)
|
||||
class_err = _calibration_binary_error(binary_label, posteriors[:, class_i], nbins=nbins, isometric=isometric)
|
||||
errors.append(class_err)
|
||||
mean_error = np.mean(errors)
|
||||
return mean_error
|
||||
|
||||
|
||||
def _calibration_binary_error(y, class_posteriors, nbins=10, isometric=True):
|
||||
if not isometric:
|
||||
raise NotImplementedError('only isometric=True is supperted at the moment')
|
||||
|
||||
bins = np.linspace(0., 1., nbins+1)
|
||||
bin_indices = np.digitize(class_posteriors, bins) - 1
|
||||
unique_bins = np.unique(bin_indices)
|
||||
err = 0.
|
||||
for bin_idx in unique_bins:
|
||||
sel = (bin_indices==bin_idx)
|
||||
sel_count = sum(sel)
|
||||
y_bin = y[sel]
|
||||
post_bin = class_posteriors[sel]
|
||||
expected_positives = np.mean(post_bin)
|
||||
true_positives = np.mean(y_bin)
|
||||
err += sel_count * (expected_positives-true_positives)**2
|
||||
err /= len(y)
|
||||
return err
|
||||
|
||||
|
||||
|
||||
# dataset = 'spambase' # qp.datasets.UCI_BINARY_DATASETS[6]
|
||||
# data = qp.datasets.fetch_UCIBinaryDataset(dataset)
|
||||
dataset = qp.datasets.UCI_MULTICLASS_DATASETS[5]
|
||||
print('loading', dataset)
|
||||
data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
|
||||
train, test = data.train_test
|
||||
NBINS = 10
|
||||
EPSILON = 1e-8
|
||||
SAMPLE_SIZE = 100
|
||||
qp.environ['SAMPLE_SIZE']=SAMPLE_SIZE
|
||||
|
||||
print(f'test prevalence (orig): {F.strprev(test.prevalence())}')
|
||||
|
||||
drift = []
|
||||
results = defaultdict(lambda :[])
|
||||
|
||||
with qp.util.temp_seed(0):
|
||||
lr = LogisticRegression()
|
||||
lr.fit(*train.Xy)
|
||||
|
||||
lr_kfcv = CalibratedClassifierCV(LogisticRegression())
|
||||
lr_kfcv.fit(*train.Xy)
|
||||
|
||||
emq = ExpectationMaximizationQuantifier(LogisticRegression())
|
||||
emq.fit(train)
|
||||
#
|
||||
# dm = DistributionMatchingY(nbins=nbins)
|
||||
# dm.fit(train)
|
||||
#
|
||||
kdey = KDEyML(LogisticRegression(), bandwidth=0.01)
|
||||
# kdey.fit(train)
|
||||
devel, val = train.split_stratified(0.6, random_state=0)
|
||||
bandwidths = np.linspace(0.001, 0.2, 40)
|
||||
kdey = qp.model_selection.GridSearchQ(
|
||||
kdey, param_grid={'bandwidth': bandwidths}, protocol=UPP(val),
|
||||
refit=True, n_jobs=-1, verbose=False
|
||||
).fit(train)
|
||||
print('best params', kdey.best_params_)
|
||||
kdey = kdey.best_model_
|
||||
|
||||
|
||||
# kdes = []
|
||||
# for bandwidth in np.linspace(0.01, 0.2, 5):
|
||||
# kdei = KDEyML(bandwidth=bandwidth)
|
||||
# kdei.fit(train)
|
||||
# kdes.append(kdei)
|
||||
|
||||
|
||||
prot = qp.protocol.UPP(test, repeats=100, random_state=0, return_type='labelled_collection')
|
||||
for test_i in prot():
|
||||
print(f'test prevalence (shifted): {F.strprev(test_i.prevalence())}')
|
||||
|
||||
drift_i = qp.error.ae(test_i.prevalence(), train.prevalence())
|
||||
print(f'drift={drift_i:.4f}')
|
||||
drift.append(drift_i)
|
||||
|
||||
#true labels
|
||||
y = test_i.y
|
||||
|
||||
# uncalibrated LR
|
||||
posteriors = lr.predict_proba(test_i.X)
|
||||
err = calibration_error(y, posteriors, nbins=NBINS)
|
||||
results['lr'].append(err)
|
||||
print(f'LR {err=:.5f}')
|
||||
|
||||
# calibrated LR (assuming iid)
|
||||
posteriors = lr_kfcv.predict_proba(test_i.X)
|
||||
err = calibration_error(y, posteriors, nbins=NBINS)
|
||||
results['lr-kfcv'].append(err)
|
||||
print(f'kFCV-LR {err=:.5f}')
|
||||
|
||||
posteriors = emq.predict_proba(test_i.X)
|
||||
err = calibration_error(y, posteriors, nbins=NBINS)
|
||||
results['emq'].append(err)
|
||||
print(f'EMQ-LR {err=:.5f}')
|
||||
|
||||
# estim_prev = dm.quantify(test_i.X)
|
||||
# # estim_prev = np.expand_dims(estim_prev, axis=-1)
|
||||
# hist_neg, hist_pos = dm.validation_distribution
|
||||
# # because the histograms were computed wrt the posterior of the first class (the negative one!), we invert the order
|
||||
# # which is equivalent to computing the histogram wrt the positive class
|
||||
# hist_neg = hist_neg.flatten()[::-1]
|
||||
# hist_pos = hist_pos.flatten()[::-1]
|
||||
# hist_neg = hist_neg * estim_prev[0] + eps
|
||||
# hist_pos = hist_pos * estim_prev[1] + eps
|
||||
# corrected_posteriors_bins = hist_pos / (hist_neg + hist_pos)
|
||||
# # corrected_posteriors_bins = 1 - corrected_posteriors_bins # because the histograms were computed wrt the posterior of the first class (the negative one!)
|
||||
# corrected_posteriors_bins = np.concatenate(([0.], corrected_posteriors_bins, [1.]))
|
||||
# x_coords = np.concatenate(([0.], (np.linspace(0., 1., nbins+1)[:-1]+0.5/nbins), [1.])) # this assumes binning=isometric
|
||||
# uncalibrated_posteriors_pos = dm.classifier.predict_proba(test_i.X)[:,1]
|
||||
# posteriors = np.interp(uncalibrated_posteriors_pos, x_coords, corrected_posteriors_bins)
|
||||
# posteriors = np.asarray([1-posteriors, posteriors]).T
|
||||
# err = calibration_binary_error(y, posteriors, nbins=nbins)
|
||||
# results['dm'].append(err)
|
||||
# print(f'DM-LR {err=:.5f}')
|
||||
|
||||
estim_prev = kdey.quantify(test_i.X)
|
||||
class_densities = kdey.mix_densities
|
||||
uncalibrated_posteriors = kdey.classifier.predict_proba(test_i.X)
|
||||
test_densities = np.asarray([kdey.pdf(kde_i, posteriors)*prior_i for kde_i, prior_i in zip(class_densities, estim_prev)]).T
|
||||
test_densities += EPSILON
|
||||
posteriors = test_densities / test_densities.sum(axis=1, keepdims=True)
|
||||
err = calibration_error(y, posteriors, nbins=NBINS)
|
||||
results[f'kde'].append(err)
|
||||
print(f'KDEy-LR {err=:.5f}')
|
||||
|
||||
print()
|
||||
|
||||
with open('./results.pkl', 'wb') as foo:
|
||||
pickle.dump((drift,dict(results)), foo, pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
for method, errors in results.items():
|
||||
print(f'{method=} got {np.mean(errors):.5f}')
|
||||
|
||||
all_methods = list(results.keys())
|
||||
all_results = []
|
||||
for method in all_methods:
|
||||
all_results.append(results[method])
|
||||
all_results = np.asarray(all_results)
|
||||
|
||||
print()
|
||||
from scipy.stats import rankdata
|
||||
ranks = np.apply_along_axis(rankdata, axis=0, arr=all_results, method='ordinal')
|
||||
for method, ranks_i in zip(all_methods, ranks):
|
||||
print(f'{method=} got rank {np.mean(ranks_i):.5f}')
|
|
@ -0,0 +1,55 @@
|
|||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import pickle
|
||||
|
||||
with open('./results.pkl', 'rb') as fin:
|
||||
drift,results = pickle.load(fin)
|
||||
|
||||
x_axis = np.asarray(drift)
|
||||
results = {k:np.asarray(v) for k, v in results.items()}
|
||||
import pandas as pd
|
||||
|
||||
# Crear los bins y asignar cada x a un bin
|
||||
num_bins = 5
|
||||
bins = np.linspace(x_axis.min(), x_axis.max(), num_bins + 1)
|
||||
bin_labels = np.digitize(x_axis, bins) - 1 # Asignar cada x al bin correspondiente (restamos 1 para indexar desde 0)
|
||||
|
||||
# Crear la figura
|
||||
plt.figure(figsize=(10, 6))
|
||||
bin_positions = np.arange(num_bins) # Posiciones centrales de los bins
|
||||
offset = 0.2 # Desplazamiento entre métodos
|
||||
|
||||
# Recorrer los métodos y construir boxplots para cada bin
|
||||
for i, (method, y_values) in enumerate(results.items()):
|
||||
binned_data = [[] for _ in range(num_bins)]
|
||||
|
||||
# Agrupar valores de y por los bins
|
||||
for bin_idx in range(num_bins):
|
||||
binned_data[bin_idx] = y_values[bin_labels == bin_idx]
|
||||
|
||||
# Crear un DataFrame para boxplot
|
||||
binned_df = pd.DataFrame({f"Bin {j + 1}": pd.Series(data) for j, data in enumerate(binned_data)})
|
||||
|
||||
# Dibujar los boxplots con un desplazamiento en el eje x
|
||||
positions = bin_positions + i * offset # Desplazar las posiciones
|
||||
box = plt.boxplot(
|
||||
binned_df.values,
|
||||
positions=positions,
|
||||
widths=0.15,
|
||||
patch_artist=True,
|
||||
showfliers=False,
|
||||
boxprops=dict(facecolor=plt.cm.Set1(i / len(results)), alpha=0.7),
|
||||
medianprops=dict(color="black"),
|
||||
)
|
||||
|
||||
# Añadir el método a la leyenda con un marcador
|
||||
plt.plot([], [], label=method, color=plt.cm.Set1(i / len(results)))
|
||||
|
||||
# Configurar la gráfica
|
||||
plt.xticks(ticks=bin_positions, labels=[f"Bin {i + 1}" for i in range(num_bins)])
|
||||
plt.xlabel("Bins")
|
||||
plt.ylabel("Values")
|
||||
plt.legend(title="Methods")
|
||||
plt.title("Boxplot by Bins for Different Methods")
|
||||
plt.tight_layout()
|
||||
plt.show()
|
Loading…
Reference in New Issue