Add example for Bayesian quantification.

2024-03-15 16:52:19 +01:00 · 2024-03-15 16:52:19 +01:00 · 020530e14f
parent 25baae643b
commit 020530e14f
1 changed files with 189 additions and 0 deletions
--- a/examples/bayesian_quantification.py
+++ b/examples/bayesian_quantification.py
@ -0,0 +1,189 @@
 """
 This example shows how to use Bayesian quantification (https://arxiv.org/abs/2302.09159),
 which is suitable for low-data situations and when the uncertainty of the prevalence estimate is of interest.
 For this, we will need to install extra dependencies:
 ```
 $ pip install quapy[bayesian]
 ```
 Running the script via:
 ```
 $ python examples/bayesian_quantification.py
 ```
 will produce a plot `bayesian_quantification.pdf`.
 Due to the small sample size and the fact that classes 2 and 3 are hard to distinguish,
 it is hard to estimate the proportions accurately, which is visible in the posterior samples:
 they show large uncertainty.
 """
 from dataclasses import dataclass
 import numpy as np
 import matplotlib.pyplot as plt
 from sklearn.ensemble import RandomForestClassifier
 from quapy.method.aggregative import BayesianCC, ACC, PACC
 from quapy.data import LabelledCollection
 # Path the figure assembled in `main` is saved to.
 FIGURE_PATH = "bayesian_quantification.pdf"
@dataclass
 class SimulatedData:
    """Container for a simulated classification data set, split into a training and a test part."""
    # Number of distinct classes; labels are expected to be 0, ..., n_classes - 1.
    n_classes: int
    # Training features, one row per sample (the plotting code reads columns 0 and 1).
    X_train: np.ndarray
    # Training labels, aligned with the rows of `X_train`.
    Y_train: np.ndarray
    # Test features, one row per sample.
    X_test: np.ndarray
    # Test labels, aligned with the rows of `X_test`.
    Y_test: np.ndarray
 def simulate_data(rng) -> SimulatedData:
    """Draws a three-class data set from two-dimensional Gaussians.

    All classes share an identity covariance matrix and differ only in their means.
    The training set is balanced (400 points per class), while the test set is small
    and imbalanced (40, 25 and 15 points).

    Args:
        rng: random generator providing `multivariate_normal`,
            e.g. the result of `np.random.default_rng`.

    Returns:
        The simulated training and test features and labels.
    """
    class_means = [np.zeros(2), np.array([1, 1.5]), np.array([1.5, 1])]
    shared_cov = np.eye(2)
    train_sizes = [400, 400, 400]
    test_sizes = [40, 25, 15]

    def draw_features(sizes):
        # One Gaussian blob per class, stacked in class order.
        blobs = []
        for mean, size in zip(class_means, sizes):
            blobs.append(rng.multivariate_normal(mean, shared_cov, size=size))
        return np.concatenate(blobs)

    def make_labels(sizes):
        # Label `k` repeated `sizes[k]` times, matching the stacking order above.
        return np.repeat(np.arange(len(sizes)), sizes)

    # The training features are drawn before the test features, so that
    # the random stream is consumed in a fixed order.
    train_features = draw_features(train_sizes)
    test_features = draw_features(test_sizes)

    return SimulatedData(
        n_classes=len(class_means),
        X_train=train_features,
        X_test=test_features,
        Y_train=make_labels(train_sizes),
        Y_test=make_labels(test_sizes),
    )
 def plot_simulated_data(axs, data: SimulatedData) -> None:
    """Draws the simulated data set on three panels.

    The first panel shows the training set and the second the test set, both coloured
    by class. The third panel shows the test set in a single colour, i.e., without labels.
    All panels share the same axis limits.

    Args:
        axs: a list of three `plt.Axes` objects, on which the samples will be plotted.
        data: the simulated data set.
    """
    margin = 0.3

    def padded_range(column):
        # Range of one feature over both training and test points, widened by the margin.
        values = (data.X_train[:, column], data.X_test[:, column])
        lowest = min(v.min() for v in values)
        highest = max(v.max() for v in values)
        return lowest - margin, highest + margin

    x_range = padded_range(0)
    y_range = padded_range(1)

    for panel in axs:
        panel.set_xlabel("$X_1$")
        panel.set_ylabel("$X_2$")
        panel.set_aspect("equal")
        panel.set_xlim(*x_range)
        panel.set_ylim(*y_range)

    # Panels in which the points are coloured according to their class.
    labelled_panels = (
        (axs[0], "Training set", data.X_train, data.Y_train),
        (axs[1], "Test set\n(with labels)", data.X_test, data.Y_test),
    )
    for panel, title, features, labels in labelled_panels:
        panel.set_title(title)
        for cls in range(data.n_classes):
            members = features[labels == cls]
            panel.scatter(members[:, 0], members[:, 1], c=f"C{cls}", s=3, rasterized=True)

    # Panel showing what a quantifier sees: test points without labels.
    unlabelled_panel = axs[2]
    unlabelled_panel.set_title("Test set\n(as observed)")
    unlabelled_panel.scatter(data.X_test[:, 0], data.X_test[:, 1], c="C5", s=3, rasterized=True)
 def get_random_forest() -> RandomForestClassifier:
    """Creates a fresh random forest (10 trees, fixed random state) used as the base classifier."""
    forest_params = {"n_estimators": 10, "random_state": 5}
    return RandomForestClassifier(**forest_params)
 def train_and_plot_bayesian_quantification(ax: plt.Axes, training: LabelledCollection, test: np.ndarray, n_classes: int) -> None:
    """Fits `BayesianCC` and plots its prevalence estimate together with individual samples.

    Args:
        ax: axes on which the estimate and the samples will be plotted.
        training: labelled training collection used to fit the quantifier.
        test: test features for which the prevalence is estimated.
        n_classes: number of classes, i.e., number of points on the horizontal axis.
    """
    bayesian_cc = BayesianCC(classifier=get_random_forest())
    bayesian_cc.fit(training)
    class_positions = np.arange(n_classes)

    # Mean prediction, drawn as a thick dotted line.
    point_estimate = bayesian_cc.quantify(test)
    ax.plot(class_positions, point_estimate, c="salmon", linewidth=2, linestyle=":", label="Bayesian")

    # Individual samples: every fifth one is drawn as a thin, nearly transparent line.
    all_samples = bayesian_cc.get_prevalence_samples()
    thinned_samples = all_samples[::5, :]
    for single_sample in thinned_samples:
        ax.plot(class_positions, single_sample, c="salmon", alpha=0.1, linewidth=0.3, rasterized=True)
 def _get_estimate(estimator_class, training: LabelledCollection, test: np.ndarray) -> np.ndarray:
    """Fits a quantifier of the given class and returns its prevalence estimate.

    Args:
        estimator_class: quantifier class (e.g., `ACC` or `PACC`), which is instantiated
            with the random forest from `get_random_forest` as its only argument.
        training: labelled training collection used to fit the quantifier.
        test: test features for which the prevalence is estimated.

    Returns:
        Whatever `quantify` returns for `test`; the callers plot it as one value per class.
    """
    estimator = estimator_class(get_random_forest())
    estimator.fit(training)
    return estimator.quantify(test)
 def train_and_plot_acc(ax: plt.Axes, training: LabelledCollection, test: np.ndarray, n_classes: int) -> None:
    """Fits `ACC` and plots its prevalence estimate as a dark blue dotted line.

    Args:
        ax: axes on which the estimate will be plotted.
        training: labelled training collection used to fit the quantifier.
        test: test features for which the prevalence is estimated.
        n_classes: number of classes, i.e., number of points on the horizontal axis.
    """
    class_positions = np.arange(n_classes)
    acc_prevalence = _get_estimate(ACC, training, test)
    ax.plot(
        class_positions,
        acc_prevalence,
        c="darkblue",
        linewidth=2,
        linestyle=":",
        label="ACC",
    )
 def train_and_plot_pacc(ax: plt.Axes, training: LabelledCollection, test: np.ndarray, n_classes: int) -> None:
    """Fits `PACC` and plots its prevalence estimate as a lime green dotted line.

    Args:
        ax: axes on which the estimate will be plotted.
        training: labelled training collection used to fit the quantifier.
        test: test features for which the prevalence is estimated.
        n_classes: number of classes, i.e., number of points on the horizontal axis.
    """
    class_positions = np.arange(n_classes)
    pacc_prevalence = _get_estimate(PACC, training, test)
    ax.plot(
        class_positions,
        pacc_prevalence,
        c="limegreen",
        linewidth=2,
        linestyle=":",
        label="PACC",
    )
 def plot_true_proportions(ax: plt.Axes, test_labels: np.ndarray, n_classes: int) -> None:
    """Plots the true class proportions of the test set and sets up the axes.

    Args:
        ax: axes on which the proportions will be plotted.
        test_labels: integer class labels of the test set.
        n_classes: number of classes, i.e., number of points on the horizontal axis.
    """
    class_positions = np.arange(n_classes)

    # Empirical prevalence: per-class counts normalised to sum to one.
    per_class = np.bincount(test_labels, minlength=n_classes)
    true_prevalence = per_class / per_class.sum()
    ax.plot(class_positions, true_prevalence, c="black", linewidth=2, label="True")

    ax.set_xlabel("Class")
    ax.set_ylabel("Prevalence")

    # Classes are displayed one-based, although the labels are zero-based.
    displayed_names = class_positions + 1
    ax.set_xticks(class_positions, displayed_names)
    ax.set_yticks([0, 0.25, 0.5, 0.75, 1.0])

    # Small padding so that the outermost points are not cut off.
    horizontal_limits = (-0.1, n_classes - 0.9)
    vertical_limits = (-0.01, 1.01)
    ax.set_xlim(*horizontal_limits)
    ax.set_ylim(*vertical_limits)
 def main() -> None:
    """Simulates the data, runs all quantifiers and saves the figure to `FIGURE_PATH`."""
    # --- Simulate data ---
    data = simulate_data(np.random.default_rng(42))

    # --- Plot simulated data ---
    fig, panels = plt.subplots(1, 4, figsize=(13, 3), dpi=300)
    for panel in panels:
        panel.spines[['top', 'right']].set_visible(False)
    plot_simulated_data(panels[:3], data)

    # --- Plot quantification results ---
    result_panel = panels[3]
    plot_true_proportions(result_panel, test_labels=data.Y_test, n_classes=data.n_classes)

    training = LabelledCollection(data.X_train, data.Y_train)
    # The order of the calls determines the order of the legend entries.
    estimate_plotters = (
        train_and_plot_acc,
        train_and_plot_pacc,
        train_and_plot_bayesian_quantification,
    )
    for plot_estimate in estimate_plotters:
        plot_estimate(ax=result_panel, training=training, test=data.X_test, n_classes=data.n_classes)

    # The legend is placed outside of the last panel, to the right of it.
    result_panel.legend(bbox_to_anchor=(1.05, 1), loc='upper left', frameon=False)

    fig.tight_layout()
    fig.savefig(FIGURE_PATH)
 # Run the example only when this file is executed as a script.
 if __name__ == '__main__':
    main()