From 3268e9fada0af5ff2206d0df142de84b754737f2 Mon Sep 17 00:00:00 2001 From: pglez82 Date: Fri, 14 Nov 2025 18:35:40 +0100 Subject: [PATCH 1/6] PQ (precise quantifier) --- quapy/method/__init__.py | 2 + quapy/method/_bayesian.py | 52 +++++++++++++++++++ quapy/method/confidence.py | 104 ++++++++++++++++++++++++++++++++++++- quapy/method/stan/pq.stan | 38 ++++++++++++++ 4 files changed, 194 insertions(+), 2 deletions(-) create mode 100644 quapy/method/stan/pq.stan diff --git a/quapy/method/__init__.py b/quapy/method/__init__.py index 4c2ec1c..ab7a59b 100644 --- a/quapy/method/__init__.py +++ b/quapy/method/__init__.py @@ -29,6 +29,7 @@ AGGREGATIVE_METHODS = { aggregative.KDEyHD, # aggregative.OneVsAllAggregative, confidence.BayesianCC, + confidence.PQ, } BINARY_METHODS = { @@ -40,6 +41,7 @@ BINARY_METHODS = { aggregative.MAX, aggregative.MS, aggregative.MS2, + confidence.PQ, } MULTICLASS_METHODS = { diff --git a/quapy/method/_bayesian.py b/quapy/method/_bayesian.py index c783f10..c5feaf3 100644 --- a/quapy/method/_bayesian.py +++ b/quapy/method/_bayesian.py @@ -8,6 +8,7 @@ try: import jax.numpy as jnp import numpyro import numpyro.distributions as dist + import stan DEPENDENCIES_INSTALLED = True except ImportError: @@ -15,6 +16,7 @@ except ImportError: jnp = None numpyro = None dist = None + stan = None DEPENDENCIES_INSTALLED = False @@ -77,3 +79,53 @@ def sample_posterior( rng_key = jax.random.PRNGKey(seed) mcmc.run(rng_key, n_c_unlabeled=n_c_unlabeled, n_y_and_c_labeled=n_y_and_c_labeled) return mcmc.get_samples() + + + +def pq_stan(stan_code, n_bins, pos_hist, neg_hist, test_hist, number_of_samples, num_warmup, stan_seed): + """ + Perform Bayesian prevalence estimation using a Stan model for probabilistic quantification. + + This function builds and samples from a Stan model that implements a bin-based Bayesian + quantifier. It uses the class-conditional histograms of the classifier + outputs for positive and negative examples, along with the test histogram, to estimate + the posterior distribution of prevalence in the test set. + + Parameters + ---------- + stan_code : str + The Stan model code as a string. + n_bins : int + Number of bins used to build the histograms for positive and negative examples. + pos_hist : array-like of shape (n_bins,) + Histogram counts of the classifier outputs for the positive class. + neg_hist : array-like of shape (n_bins,) + Histogram counts of the classifier outputs for the negative class. + test_hist : array-like of shape (n_bins,) + Histogram counts of the classifier outputs for the test set, binned using the same bins. + number_of_samples : int + Number of post-warmup samples to draw from the Stan posterior. + num_warmup : int + Number of warmup iterations for the sampler. + stan_seed : int + Random seed for Stan model compilation and sampling, ensuring reproducibility. + + Returns + ------- + prev_samples : numpy.ndarray + An array of posterior samples of the prevalence (`prev`) in the test set. + Each element corresponds to one draw from the posterior distribution. + """ + + stan_data = { + 'n_bucket': n_bins, + 'train_neg': neg_hist.tolist(), + 'train_pos': pos_hist.tolist(), + 'test': test_hist.tolist(), + 'posterior': 1 + } + + stan_model = stan.build(stan_code, data=stan_data, random_seed=stan_seed) + fit = stan_model.sample(num_chains=1, num_samples=number_of_samples,num_warmup=num_warmup) + + return fit['prev'] diff --git a/quapy/method/confidence.py b/quapy/method/confidence.py index 07b2b1e..dd9e05f 100644 --- a/quapy/method/confidence.py +++ b/quapy/method/confidence.py @@ -5,9 +5,8 @@ from sklearn.metrics import confusion_matrix import quapy as qp import quapy.functional as F from quapy.method import _bayesian -from quapy.method.aggregative import AggregativeCrispQuantifier from quapy.data import LabelledCollection -from quapy.method.aggregative import AggregativeQuantifier +from quapy.method.aggregative import AggregativeQuantifier, AggregativeCrispQuantifier, AggregativeSoftQuantifier, BinaryAggregativeQuantifier from scipy.stats import chi2 from sklearn.utils import resample from abc import ABC, abstractmethod @@ -571,3 +570,104 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC): samples = self.get_prevalence_samples() # available after calling "aggregate" function region = WithConfidenceABC.construct_region(samples, confidence_level=self.confidence_level, method=self.region) return point_estimate, region + + +class PQ(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): + """ + `Precise Quantifier: Bayesian distribution matching quantifier , + which is a variant of :class:`HDy` that calculates the posterior probability distribution + over the prevalence vectors, rather than providing a point estimate. + + This method relies on extra dependencies, which have to be installed via: + `$ pip install quapy[bayes]` + + :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be + the one indicated in `qp.environ['DEFAULT_CLS']` + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a tuple `(X,y)` defining the specific set of data to use for validation. Set to + None when the method does not require any validation data, in order to avoid that some portion of + the training data be wasted. + :param num_warmup: number of warmup iterations for the STAN sampler (default 500) + :param num_samples: number of samples to draw from the posterior (default 1000) + :param stan_seed: random seed for the STAN sampler (default 0) + :param region: string, set to `intervals` for constructing confidence intervals (default), or to + `ellipse` for constructing an ellipse in the probability simplex, or to `ellipse-clr` for + constructing an ellipse in the Centered-Log Ratio (CLR) unconstrained space. + """ + def __init__(self, + classifier: BaseEstimator=None, + fit_classifier=True, + val_split: int = 5, + n_bins: int = 4, + fixed_bins: bool = False, + num_warmup: int = 500, + num_samples: int = 1_000, + stan_seed: int = 0, + region: str = 'intervals'): + + if num_warmup <= 0: + raise ValueError(f'parameter {num_warmup=} must be a positive integer') + if num_samples <= 0: + raise ValueError(f'parameter {num_samples=} must be a positive integer') + + if _bayesian.DEPENDENCIES_INSTALLED is False: + raise ImportError("Auxiliary dependencies are required. " + "Run `$ pip install quapy[bayes]` to install them.") + + super().__init__(classifier, fit_classifier, val_split) + self.n_bins = n_bins + self.fixed_bins = fixed_bins + self.num_warmup = num_warmup + self.num_samples = num_samples + self.region = region + self.stan_seed = stan_seed + with open('quapy/method/stan/pq.stan', 'r') as f: + self.stan_code = str(f.read()) + + def aggregation_fit(self, classif_predictions, labels): + y_pred = classif_predictions[:, self.pos_label] + + # Compute bin limits + if self.fixed_bins: + # Uniform bins in [0,1] + self.bin_limits = np.linspace(0, 1, self.n_bins + 1) + else: + # Quantile bins + self.bin_limits = np.quantile(y_pred, np.linspace(0, 1, self.n_bins + 1)) + + # Assign each prediction to a bin + bin_indices = np.digitize(y_pred, self.bin_limits[1:-1], right=True) + + # Positive and negative masks + pos_mask = (labels == self.pos_label) + neg_mask = ~pos_mask + + # Count positives and negatives per bin + self.pos_hist = np.bincount(bin_indices[pos_mask], minlength=self.n_bins) + self.neg_hist = np.bincount(bin_indices[neg_mask], minlength=self.n_bins) + + + def aggregate(self, classif_predictions): + Px_test = classif_predictions[:, self.pos_label] + test_hist, _ = np.histogram(Px_test, bins=self.bin_limits) + self.prev_distribution = _bayesian.pq_stan(self.stan_code, + self.n_bins, + self.pos_hist, + self.neg_hist, + test_hist, + self.num_samples, + self.num_warmup, + self.stan_seed) + return F.as_binary_prevalence(self.prev_distribution.mean()) + + + def predict_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC): + classif_predictions = self.classify(instances) + point_estimate = self.aggregate(classif_predictions) + samples = self.prev_distribution + region = WithConfidenceABC.construct_region(samples, confidence_level=confidence_level, method=self.region) + return point_estimate, region + diff --git a/quapy/method/stan/pq.stan b/quapy/method/stan/pq.stan new file mode 100644 index 0000000..b759ddf --- /dev/null +++ b/quapy/method/stan/pq.stan @@ -0,0 +1,38 @@ +data { + int n_bucket; + array[n_bucket] int train_pos; + array[n_bucket] int train_neg; + array[n_bucket] int test; + int posterior; +} + +transformed data{ + row_vector[n_bucket] train_pos_rv; + row_vector[n_bucket] train_neg_rv; + row_vector[n_bucket] test_rv; + real n_test; + + train_pos_rv = to_row_vector( train_pos ); + train_neg_rv = to_row_vector( train_neg ); + test_rv = to_row_vector( test ); + n_test = sum( test ); +} + +parameters { + simplex[n_bucket] p_neg; + simplex[n_bucket] p_pos; + real prev_prior; +} + +model { + if( posterior ) { + target += train_neg_rv * log( p_neg ); + target += train_pos_rv * log( p_pos ); + target += test_rv * log( p_neg * ( 1 - prev_prior) + p_pos * prev_prior ); + } +} + +generated quantities { + real prev; + prev = sum( binomial_rng(test, 1 / ( 1 + (p_neg./p_pos) *(1-prev_prior)/prev_prior ) ) ) / n_test; +} \ No newline at end of file From e4c07e18353e6d147bb30b17c9c79edb21608a5e Mon Sep 17 00:00:00 2001 From: pglez82 Date: Sat, 15 Nov 2025 16:51:48 +0100 Subject: [PATCH 2/6] changing the way the file is loaded --- quapy/method/_bayesian.py | 4 ++++ quapy/method/confidence.py | 3 +-- setup.py | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/quapy/method/_bayesian.py b/quapy/method/_bayesian.py index c5feaf3..da65eed 100644 --- a/quapy/method/_bayesian.py +++ b/quapy/method/_bayesian.py @@ -2,6 +2,7 @@ Utility functions for `Bayesian quantification `_ methods. """ import numpy as np +import importlib.resources try: import jax @@ -82,6 +83,9 @@ def sample_posterior( +def load_stan_file(): + return importlib.resources.files('quapy.method').joinpath('stan/pq.stan').read_text(encoding='utf-8') + def pq_stan(stan_code, n_bins, pos_hist, neg_hist, test_hist, number_of_samples, num_warmup, stan_seed): """ Perform Bayesian prevalence estimation using a Stan model for probabilistic quantification. diff --git a/quapy/method/confidence.py b/quapy/method/confidence.py index dd9e05f..ab649c2 100644 --- a/quapy/method/confidence.py +++ b/quapy/method/confidence.py @@ -624,8 +624,7 @@ class PQ(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): self.num_samples = num_samples self.region = region self.stan_seed = stan_seed - with open('quapy/method/stan/pq.stan', 'r') as f: - self.stan_code = str(f.read()) + self.stan_code = _bayesian.load_stan_file() def aggregation_fit(self, classif_predictions, labels): y_pred = classif_predictions[:, self.pos_label] diff --git a/setup.py b/setup.py index ba5f205..c058f9d 100644 --- a/setup.py +++ b/setup.py @@ -124,7 +124,7 @@ setup( # Similar to `install_requires` above, these must be valid existing # projects. extras_require={ # Optional - 'bayes': ['jax', 'jaxlib', 'numpyro'], + 'bayes': ['jax', 'jaxlib', 'numpyro', 'pystan'], 'neural': ['torch'], 'tests': ['certifi'], 'docs' : ['sphinx-rtd-theme', 'myst-parser'], From c6492a0f205518130f81c5befd962a17ee4eb9b6 Mon Sep 17 00:00:00 2001 From: pglez82 Date: Sat, 15 Nov 2025 17:04:44 +0100 Subject: [PATCH 3/6] fixing import --- quapy/method/non_aggregative.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quapy/method/non_aggregative.py b/quapy/method/non_aggregative.py index ae894fd..6f204e4 100644 --- a/quapy/method/non_aggregative.py +++ b/quapy/method/non_aggregative.py @@ -6,7 +6,7 @@ from sklearn.feature_extraction.text import CountVectorizer from sklearn.utils import resample from sklearn.preprocessing import normalize -from method.confidence import WithConfidenceABC, ConfidenceRegionABC +from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC from quapy.functional import get_divergence from quapy.method.base import BaseQuantifier, BinaryQuantifier import quapy.functional as F From 46e7246f3a4f0a8256af5f6b9b61db3f49a08e77 Mon Sep 17 00:00:00 2001 From: pglez82 Date: Sat, 15 Nov 2025 17:04:55 +0100 Subject: [PATCH 4/6] adding stan file to setup --- setup.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/setup.py b/setup.py index c058f9d..2464122 100644 --- a/setup.py +++ b/setup.py @@ -111,6 +111,12 @@ setup( # packages=find_packages(include=['quapy', 'quapy.*']), # Required + package_data={ + # For the 'quapy.method' package, include all files + # in the 'stan' subdirectory that end with .stan + 'quapy.method': ['stan/*.stan'] + }, + python_requires='>=3.8, <4', install_requires=['scikit-learn', 'pandas', 'tqdm', 'matplotlib', 'joblib', 'xlrd', 'abstention', 'ucimlrepo', 'certifi'], From d9cf6cc11d4c0f30867578f374e3695a1c8f0882 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Sat, 15 Nov 2025 17:56:37 +0100 Subject: [PATCH 5/6] index in labelled collection from versions restored --- quapy/data/base.py | 2 +- quapy/method/confidence.py | 24 ++++++++++-------------- quapy/method/stan/pq.stan | 3 ++- setup.py | 2 +- 4 files changed, 14 insertions(+), 17 deletions(-) diff --git a/quapy/data/base.py b/quapy/data/base.py index 6db182a..9bdf135 100644 --- a/quapy/data/base.py +++ b/quapy/data/base.py @@ -44,7 +44,7 @@ class LabelledCollection: @property def index(self): - if self._index is None: + if not hasattr(self, '_index') or self._index is None: self._index = {class_: np.arange(len(self))[self.labels == class_] for class_ in self.classes_} return self._index diff --git a/quapy/method/confidence.py b/quapy/method/confidence.py index dd9e05f..7423545 100644 --- a/quapy/method/confidence.py +++ b/quapy/method/confidence.py @@ -13,6 +13,7 @@ from abc import ABC, abstractmethod from scipy.special import softmax, factorial import copy from functools import lru_cache +from pathlib import Path """ This module provides implementation of different types of confidence regions, and the implementation of Bootstrap @@ -613,7 +614,7 @@ class PQ(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): if num_samples <= 0: raise ValueError(f'parameter {num_samples=} must be a positive integer') - if _bayesian.DEPENDENCIES_INSTALLED is False: + if not _bayesian.DEPENDENCIES_INSTALLED: raise ImportError("Auxiliary dependencies are required. " "Run `$ pip install quapy[bayes]` to install them.") @@ -624,7 +625,9 @@ class PQ(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): self.num_samples = num_samples self.region = region self.stan_seed = stan_seed - with open('quapy/method/stan/pq.stan', 'r') as f: + # with open('quapy/method/stan/pq.stan', 'r') as f: + stan_path = Path(__file__).resolve().parent / "stan" / "pq.stan" + with stan_path.open("r") as f: self.stan_code = str(f.read()) def aggregation_fit(self, classif_predictions, labels): @@ -649,24 +652,17 @@ class PQ(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): self.pos_hist = np.bincount(bin_indices[pos_mask], minlength=self.n_bins) self.neg_hist = np.bincount(bin_indices[neg_mask], minlength=self.n_bins) - def aggregate(self, classif_predictions): Px_test = classif_predictions[:, self.pos_label] test_hist, _ = np.histogram(Px_test, bins=self.bin_limits) - self.prev_distribution = _bayesian.pq_stan(self.stan_code, - self.n_bins, - self.pos_hist, - self.neg_hist, - test_hist, - self.num_samples, - self.num_warmup, - self.stan_seed) + self.prev_distribution = _bayesian.pq_stan( + self.stan_code, self.n_bins, self.pos_hist, self.neg_hist, test_hist, + self.num_samples, self.num_warmup, self.stan_seed + ) return F.as_binary_prevalence(self.prev_distribution.mean()) - def predict_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC): - classif_predictions = self.classify(instances) - point_estimate = self.aggregate(classif_predictions) + point_estimate = self.predict(instances) samples = self.prev_distribution region = WithConfidenceABC.construct_region(samples, confidence_level=confidence_level, method=self.region) return point_estimate, region diff --git a/quapy/method/stan/pq.stan b/quapy/method/stan/pq.stan index b759ddf..a5bc52a 100644 --- a/quapy/method/stan/pq.stan +++ b/quapy/method/stan/pq.stan @@ -35,4 +35,5 @@ model { generated quantities { real prev; prev = sum( binomial_rng(test, 1 / ( 1 + (p_neg./p_pos) *(1-prev_prior)/prev_prior ) ) ) / n_test; -} \ No newline at end of file +} + diff --git a/setup.py b/setup.py index ba5f205..c058f9d 100644 --- a/setup.py +++ b/setup.py @@ -124,7 +124,7 @@ setup( # Similar to `install_requires` above, these must be valid existing # projects. extras_require={ # Optional - 'bayes': ['jax', 'jaxlib', 'numpyro'], + 'bayes': ['jax', 'jaxlib', 'numpyro', 'pystan'], 'neural': ['torch'], 'tests': ['certifi'], 'docs' : ['sphinx-rtd-theme', 'myst-parser'], From 047cb9e533ed24cb05544a83df1feaf1372b1cfb Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Sat, 15 Nov 2025 18:54:04 +0100 Subject: [PATCH 6/6] merged --- quapy/method/confidence.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/quapy/method/confidence.py b/quapy/method/confidence.py index 3adcf38..fd8c84d 100644 --- a/quapy/method/confidence.py +++ b/quapy/method/confidence.py @@ -565,10 +565,12 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC): return np.asarray(samples.mean(axis=0), dtype=float) def predict_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC): + if confidence_level is None: + confidence_level = self.confidence_level classif_predictions = self.classify(instances) point_estimate = self.aggregate(classif_predictions) samples = self.get_prevalence_samples() # available after calling "aggregate" function - region = WithConfidenceABC.construct_region(samples, confidence_level=self.confidence_level, method=self.region) + region = WithConfidenceABC.construct_region(samples, confidence_level=confidence_level, method=self.region) return point_estimate, region @@ -606,6 +608,7 @@ class PQ(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): num_warmup: int = 500, num_samples: int = 1_000, stan_seed: int = 0, + confidence_level: float = 0.95, region: str = 'intervals'): if num_warmup <= 0: @@ -622,9 +625,10 @@ class PQ(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): self.fixed_bins = fixed_bins self.num_warmup = num_warmup self.num_samples = num_samples - self.region = region self.stan_seed = stan_seed self.stan_code = _bayesian.load_stan_file() + self.confidence_level = confidence_level + self.region = region def aggregation_fit(self, classif_predictions, labels): y_pred = classif_predictions[:, self.pos_label] @@ -651,16 +655,23 @@ class PQ(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): def aggregate(self, classif_predictions): Px_test = classif_predictions[:, self.pos_label] test_hist, _ = np.histogram(Px_test, bins=self.bin_limits) - self.prev_distribution = _bayesian.pq_stan( + prevs = _bayesian.pq_stan( self.stan_code, self.n_bins, self.pos_hist, self.neg_hist, test_hist, self.num_samples, self.num_warmup, self.stan_seed - ) - return F.as_binary_prevalence(self.prev_distribution.mean()) + ).flatten() + self.prev_distribution = np.vstack([1-prevs, prevs]).T + return self.prev_distribution.mean(axis=0) - def predict_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC): - classif_predictions = self.classify(instances) - point_estimate = self.aggregate(classif_predictions) + def aggregate_conf(self, predictions, confidence_level=None): + if confidence_level is None: + confidence_level = self.confidence_level + point_estimate = self.aggregate(predictions) samples = self.prev_distribution region = WithConfidenceABC.construct_region(samples, confidence_level=confidence_level, method=self.region) return point_estimate, region + def predict_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC): + predictions = self.classify(instances) + return self.aggregate_conf(predictions, confidence_level=confidence_level) + +