diff --git a/BayesianKDEy/bayesian_kdey.py b/BayesianKDEy/bayesian_kdey.py
new file mode 100644
index 0000000..228f547
--- /dev/null
+++ b/BayesianKDEy/bayesian_kdey.py
@@ -0,0 +1,101 @@
+from sklearn.linear_model import LogisticRegression
+import quapy as qp
+from BayesianKDEy.plot_simplex import plot_prev_points, plot_prev_points_matplot
+from quapy.functional import strprev
+from quapy.method.aggregative import KDEyML
+from quapy.protocol import UPP
+import quapy.functional as F
+import numpy as np
+from tqdm import tqdm
+
+
+def bayesian(kdes, data, prob_classifier, init=None, MAX_ITER=100_000, warmup=3_000):
+    """
+    Bayes:
+        P(prev|data) = P(data|prev) P(prev) / P(data)
+    i.e.,
+        posterior = likelihood * prior / evidence
+    We assume the likelihood to be the KDE mixture
+        prev @ [kde_i_likelihood(data) for i = 1..n]
+    and the prior to be uniform over the simplex.
+    """
+    def pdf(kde, X):
+        # todo: remove exp, since we are then taking the log every time? (not sure)
+        return np.exp(kde.score_samples(X))
+
+    X = prob_classifier.predict_proba(data)
+    test_densities = [pdf(kde_i, X) for kde_i in kdes]
+
+    def log_likelihood(prev, epsilon=1e-8):
+        # equivalent to: sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
+        test_likelihoods = prev @ test_densities
+        test_loglikelihood = np.log(test_likelihoods + epsilon)
+        return np.sum(test_loglikelihood)
+
+    def log_prior(prev):
+        # todo: adapt to arbitrary prior knowledge (e.g., something around the training prevalence)
+        # the exact value is not 1, but a uniform prior contributes only a constant,
+        # which cancels out in the Metropolis-Hastings acceptance ratio
+        return 1
+
+    def sample_neighbour(prev):
+        # take a small step from prev towards a uniformly drawn point of the simplex
+        # (note: this proposal is not symmetric; a full Metropolis-Hastings treatment
+        # would include the Hastings correction term)
+        rand_prev = F.uniform_prevalence_sampling(n_classes=len(prev), size=1)
+        rand_direction = rand_prev - prev
+        neighbour = F.normalize_prevalence(prev + rand_direction * 0.05)
+        return neighbour
+
+    n_classes = len(prob_classifier.classes_)
+    current_prev = F.uniform_prevalence(n_classes) if init is None else init
+    current_likelihood = log_likelihood(current_prev) + log_prior(current_prev)
+
+    # Metropolis-Hastings
+    samples = []
+    for _ in tqdm(range(MAX_ITER), total=MAX_ITER):
+        proposed_prev = sample_neighbour(current_prev)
+
+        # log of the acceptance ratio
+        proposed_likelihood = log_likelihood(proposed_prev) + log_prior(proposed_prev)
+        acceptance = proposed_likelihood - current_likelihood
+
+        # decide acceptance
+        if np.log(np.random.rand()) < acceptance:
+            # accept
+            current_prev = proposed_prev
+            current_likelihood = proposed_likelihood
+
+        samples.append(current_prev)
+
+    # discard the "warmup" initial iterations
+    samples = np.asarray(samples[warmup:])
+    return samples
+
+
+if __name__ == '__main__':
+    qp.environ["SAMPLE_SIZE"] = 500
+    cls = LogisticRegression()
+    kdey = KDEyML(cls)
+
+    train, test = qp.datasets.fetch_UCIMulticlassDataset('waveform-v1', standardize=True).train_test
+
+    with qp.util.temp_seed(0):
+        print('fitting KDEy')
+        kdey.fit(*train.Xy)
+
+        shifted = test.sampling(500, *[0.1, 0.1, 0.8])
+        prev_hat = kdey.predict(shifted.X)
+        mae = qp.error.mae(shifted.prevalence(), prev_hat)
+        print(f'true_prev={strprev(shifted.prevalence())}, prev_hat={strprev(prev_hat)}, {mae=:.4f}')
+
+        kdes = kdey.mix_densities
+        h = kdey.classifier
+        samples = bayesian(kdes, shifted.X, h, init=prev_hat, MAX_ITER=100_000, warmup=0)
+
+        print(f'mean posterior {strprev(samples.mean(axis=0))}')
+
+        plot_prev_points(samples, true_prev=shifted.prevalence())
+
+        # report = qp.evaluation.evaluation_report(kdey, protocol=UPP(test), verbose=True)
+        # print(report.mean(numeric_only=True))
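For reference, the sampler above can be exercised end-to-end without quapy or fitted KDEs. The following minimal sketch runs the same Metropolis-Hastings scheme (uniform simplex proposal, small step, log-space acceptance test) on a toy one-dimensional Gaussian mixture whose class-conditional densities stand in for the KDEs; every name and constant here (gauss_pdf, the means, the 0.05 step size) is an illustrative assumption, not part of bayesian_kdey.py.

import numpy as np

rng = np.random.default_rng(0)

# toy class-conditional densities: three 1-D Gaussians standing in for the KDEs (assumption)
def gauss_pdf(x, mu, sigma=0.1):
    return np.exp(-0.5 * ((x - mu) / sigma) ** 2) / (sigma * np.sqrt(2 * np.pi))

true_prev = np.array([0.1, 0.1, 0.8])
mus = np.array([0.2, 0.5, 0.8])

# draw 500 "test scores" from the mixture defined by true_prev
labels = rng.choice(3, size=500, p=true_prev)
scores = rng.normal(mus[labels], 0.1)
densities = np.stack([gauss_pdf(scores, mu) for mu in mus])  # shape (3, n)

def log_likelihood(prev):
    # log p(data|prev) under the mixture model, as in bayesian()
    return np.log(prev @ densities + 1e-8).sum()

def propose(prev):
    # small step towards a uniform draw from the simplex, then renormalize
    step = rng.dirichlet(np.ones(3)) - prev
    cand = np.clip(prev + 0.05 * step, 1e-12, None)
    return cand / cand.sum()

prev = np.full(3, 1 / 3)
ll = log_likelihood(prev)
samples = []
for _ in range(20_000):
    cand = propose(prev)
    cand_ll = log_likelihood(cand)
    if np.log(rng.random()) < cand_ll - ll:  # log-space acceptance test
        prev, ll = cand, cand_ll
    samples.append(prev)

print(np.mean(samples[2_000:], axis=0))  # should land close to true_prev
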
diff --git a/BayesianKDEy/plot_simplex.py b/BayesianKDEy/plot_simplex.py
new file mode 100644
index 0000000..ca94ae1
--- /dev/null
+++ b/BayesianKDEy/plot_simplex.py
@@ -0,0 +1,92 @@
+import numpy as np
+import matplotlib.pyplot as plt
+from scipy.stats import gaussian_kde
+
+
+def plot_prev_points(prevs, true_prev):
+
+    def cartesian(p):
+        dim = p.shape[-1]
+        p = p.reshape(-1, dim)
+        x = p[:, 1] + p[:, 2] * 0.5
+        y = p[:, 2] * np.sqrt(3) / 2
+        return x, y
+
+    # simplex vertices
+    v1 = np.array([0, 0])
+    v2 = np.array([1, 0])
+    v3 = np.array([0.5, np.sqrt(3)/2])
+
+    # transform (a, b, c) -> Cartesian coordinates
+    x, y = cartesian(prevs)
+
+    # plot
+    fig, ax = plt.subplots(figsize=(6, 6))
+    ax.scatter(x, y, s=50, alpha=0.05, edgecolors='none')
+    ax.scatter(*cartesian(true_prev), s=5, alpha=1)
+
+    # edges
+    triangle = np.array([v1, v2, v3, v1])
+    ax.plot(triangle[:, 0], triangle[:, 1], color='black')
+
+    # vertex labels
+    ax.text(-0.05, -0.05, "y=0", ha='right', va='top')
+    ax.text(1.05, -0.05, "y=1", ha='left', va='top')
+    ax.text(0.5, np.sqrt(3)/2 + 0.05, "y=2", ha='center', va='bottom')
+
+    ax.set_aspect('equal')
+    ax.axis('off')
+    plt.show()
+
+
+def plot_prev_points_matplot(points):
+
+    # project onto 2D
+    v1 = np.array([0, 0])
+    v2 = np.array([1, 0])
+    v3 = np.array([0.5, np.sqrt(3) / 2])
+    x = points[:, 1] + points[:, 2] * 0.5
+    y = points[:, 2] * np.sqrt(3) / 2
+
+    # kde
+    xy = np.vstack([x, y])
+    kde = gaussian_kde(xy)
+    xmin, xmax = 0, 1
+    ymin, ymax = 0, np.sqrt(3) / 2
+
+    # grid
+    xx, yy = np.mgrid[xmin:xmax:200j, ymin:ymax:200j]
+    positions = np.vstack([xx.ravel(), yy.ravel()])
+    zz = np.reshape(kde(positions).T, xx.shape)
+
+    # mask points outside the simplex
+    def in_triangle(x, y):
+        return (y >= 0) & (y <= np.sqrt(3) * np.minimum(x, 1 - x))
+
+    mask = in_triangle(xx, yy)
+    zz_masked = np.ma.array(zz, mask=~mask)
+
+    # plot
+    fig, ax = plt.subplots(figsize=(6, 6))
+    ax.imshow(
+        np.rot90(zz_masked),
+        cmap=plt.cm.viridis,
+        extent=[xmin, xmax, ymin, ymax],
+        alpha=0.8,
+    )
+
+    # triangle edges
+    triangle = np.array([v1, v2, v3, v1])
+    ax.plot(triangle[:, 0], triangle[:, 1], color='black', lw=2)
+
+    # points (optional)
+    ax.scatter(x, y, s=5, c='white', alpha=0.3)
+
+    # vertex labels
+    ax.text(-0.05, -0.05, "A (1,0,0)", ha='right', va='top')
+    ax.text(1.05, -0.05, "B (0,1,0)", ha='left', va='top')
+    ax.text(0.5, np.sqrt(3) / 2 + 0.05, "C (0,0,1)", ha='center', va='bottom')
+
+    ax.set_aspect('equal')
+    ax.axis('off')
+    plt.show()
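As a quick sanity check of the barycentric-to-Cartesian projection shared by both plotting functions, the standalone snippet below (to_cartesian is an illustrative name, not part of the diff) verifies that the three one-hot prevalence vectors land exactly on the triangle vertices used above.

import numpy as np

def to_cartesian(p):
    # (p0, p1, p2) -> (x, y): same projection as in plot_simplex.py
    p = np.asarray(p).reshape(-1, 3)
    x = p[:, 1] + p[:, 2] * 0.5
    y = p[:, 2] * np.sqrt(3) / 2
    return x, y

# one-hot prevalences must map to the vertices (0,0), (1,0), (0.5, sqrt(3)/2)
x, y = to_cartesian(np.eye(3))
print(x)  # [0.  1.  0.5]
print(y)  # [0.  0.  0.866...]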