From 448d60ac42b26e0213a95fe6262916c99185cde9 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Wed, 6 Mar 2024 11:53:43 +0100
Subject: [PATCH 1/4] Update README.md

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index d9f697c..b08da3a 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,6 @@ for facilitating the analysis and interpretation of the experimental results.
 
 ### Last updates:
 * Version 0.1.8 is released! major changes can be consulted [here](CHANGE_LOG.txt).
-* A detailed documentation is now available [here](https://hlt-isti.github.io/QuaPy/)
 * The developer API documentation is available [here](https://hlt-isti.github.io/QuaPy/build/html/modules.html)
 
 ### Installation

From 3705264529a4d589f9a066894b22b433cd340d90 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Czy=C5=BC?=
Date: Thu, 14 Mar 2024 10:39:26 +0100
Subject: [PATCH 2/4] Fix a typo.

---
 quapy/method/aggregative.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index 27d692d..feeb5f2 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -249,7 +249,7 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
 
 class AggregativeCrispQuantifier(AggregativeQuantifier, ABC):
     """
-    Abstract class for quantification methods that base their estimations on the aggregation of crips decisions
+    Abstract class for quantification methods that base their estimations on the aggregation of crisp decisions
     as returned by a hard classifier. Aggregative crisp quantifiers thus extend Aggregative Quantifiers by
     implementing specifications about crisp predictions.
     """

From 2cc49083262f0a603fc2e6c8415ae2f7a422691b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Czy=C5=BC?=
Date: Fri, 15 Mar 2024 14:01:24 +0100
Subject: [PATCH 3/4] Sketch of the Bayesian quantification

---
 quapy/functional.py         | 26 ++++++++---
 quapy/method/_bayesian.py   | 78 ++++++++++++++++++++++++++++++++
 quapy/method/aggregative.py | 89 ++++++++++++++++++++++++++++++++++++-
 setup.py                    |  7 ++-
 4 files changed, 188 insertions(+), 12 deletions(-)
 create mode 100644 quapy/method/_bayesian.py

diff --git a/quapy/functional.py b/quapy/functional.py
index c6dc351..3a4ebfa 100644
--- a/quapy/functional.py
+++ b/quapy/functional.py
@@ -28,22 +28,34 @@ def prevalence_linspace(n_prevalences=21, repeats=1, smooth_limits_epsilon=0.01)
     return p
 
 
-def prevalence_from_labels(labels, classes):
+def counts_from_labels(labels, classes):
     """
-    Computed the prevalence values from a vector of labels.
+    Computes the count values from a vector of labels.
 
     :param labels: array-like of shape `(n_instances,)` with the label for each instance
     :param classes: the class labels. This is needed in order to correctly compute the prevalence vector even when
         some classes have no examples.
-    :return: an ndarray of shape `(len(classes))` with the class prevalence values
+    :return: an ndarray of shape `(len(classes),)` with the occurrence counts of each class
     """
     if labels.ndim != 1:
         raise ValueError(f'param labels does not seem to be a ndarray of label predictions')
     unique, counts = np.unique(labels, return_counts=True)
     by_class = defaultdict(lambda:0, dict(zip(unique, counts)))
-    prevalences = np.asarray([by_class[class_] for class_ in classes], dtype=float)
-    prevalences /= prevalences.sum()
-    return prevalences
+    counts = np.asarray([by_class[class_] for class_ in classes], dtype=int)
+    return counts
+
+
+def prevalence_from_labels(labels, classes):
+    """
+    Computes the prevalence values from a vector of labels.
+
+    :param labels: array-like of shape `(n_instances,)` with the label for each instance
+    :param classes: the class labels. This is needed in order to correctly compute the prevalence vector even when
+        some classes have no examples.
+    :return: an ndarray of shape `(len(classes))` with the class prevalence values
+    """
+    counts = np.array(counts_from_labels(labels, classes), dtype=float)
+    return counts / np.sum(counts)
 
 
 def prevalence_from_probabilities(posteriors, binarize: bool = False):

diff --git a/quapy/method/_bayesian.py b/quapy/method/_bayesian.py
new file mode 100644
index 0000000..78a2c66
--- /dev/null
+++ b/quapy/method/_bayesian.py
@@ -0,0 +1,78 @@
+"""
+Utility functions for `Bayesian quantification `_ methods.
+"""
+import numpy as np
+
+try:
+    import jax
+    import jax.numpy as jnp
+    import numpyro
+    import numpyro.distributions as dist
+
+    DEPENDENCIES_INSTALLED = True
+except ImportError:
+    jax = None
+    jnp = None
+    numpyro = None
+    dist = None
+
+    DEPENDENCIES_INSTALLED = False
+
+
+P_TEST_Y: str = "P_test(Y)"
+P_TEST_C: str = "P_test(C)"
+P_C_COND_Y: str = "P(C|Y)"
+
+
+def model(n_c_unlabeled: np.ndarray, n_y_and_c_labeled: np.ndarray) -> None:
+    """
+    Defines a probabilistic model in `NumPyro `_.
+
+    :param n_c_unlabeled: a `np.ndarray` of shape `(n_predicted_classes,)`
+        with entry `c` being the number of instances predicted as class `c`.
+    :param n_y_and_c_labeled: a `np.ndarray` of shape `(n_classes, n_predicted_classes)`
+        with entry `(y, c)` being the number of instances labeled as class `y` and predicted as class `c`.
+    """
+    n_y_labeled = n_y_and_c_labeled.sum(axis=1)
+
+    K = len(n_c_unlabeled)
+    L = len(n_y_labeled)
+
+    pi_ = numpyro.sample(P_TEST_Y, dist.Dirichlet(jnp.ones(L)))
+    p_c_cond_y = numpyro.sample(P_C_COND_Y, dist.Dirichlet(jnp.ones(K).repeat(L).reshape(L, K)))
+
+    with numpyro.plate('plate', L):
+        numpyro.sample('F_yc', dist.Multinomial(n_y_labeled, p_c_cond_y), obs=n_y_and_c_labeled)
+
+    p_c = numpyro.deterministic(P_TEST_C, jnp.einsum("yc,y->c", p_c_cond_y, pi_))
+    numpyro.sample('N_c', dist.Multinomial(jnp.sum(n_c_unlabeled), p_c), obs=n_c_unlabeled)
+
+
+def sample_posterior(
+    n_c_unlabeled: np.ndarray,
+    n_y_and_c_labeled: np.ndarray,
+    num_warmup: int,
+    num_samples: int,
+    seed: int = 0,
+) -> dict:
+    """
+    Samples from the Bayesian quantification model in NumPyro using the
+    `NUTS `_ sampler.
+
+    :param n_c_unlabeled: a `np.ndarray` of shape `(n_predicted_classes,)`
+        with entry `c` being the number of instances predicted as class `c`.
+    :param n_y_and_c_labeled: a `np.ndarray` of shape `(n_classes, n_predicted_classes)`
+        with entry `(y, c)` being the number of instances labeled as class `y` and predicted as class `c`.
+    :param num_warmup: the number of warmup steps.
+    :param num_samples: the number of samples to draw.
+    :param seed: the random seed.
+    :return: a `dict` with the samples. The keys are the names of the latent variables.
+    """
+    mcmc = numpyro.infer.MCMC(
+        numpyro.infer.NUTS(model),
+        num_warmup=num_warmup,
+        num_samples=num_samples,
+    )
+    rng_key = jax.random.PRNGKey(seed)
+    mcmc.run(rng_key, n_c_unlabeled=n_c_unlabeled, n_y_and_c_labeled=n_y_and_c_labeled)
+    return mcmc.get_samples()

diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index feeb5f2..a608053 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -11,6 +11,7 @@ from sklearn.model_selection import cross_val_predict
 
 import quapy as qp
 import quapy.functional as F
+import quapy._bayesian as _bayesian
 from quapy.functional import get_divergence
 from quapy.classification.calibration import NBVSCalibration, BCTSCalibration, TSCalibration, VSCalibration
 from quapy.classification.svmperf import SVMperf
@@ -384,7 +385,8 @@ class ACC(AggregativeCrispQuantifier):
         self.solver = solver
 
     def _check_init_parameters(self):
-        assert self.solver in ['exact', 'minimize'], "unknown solver; valid ones are 'exact', 'minimize'"
+        if self.solver not in ['exact', 'minimize']:
+            raise ValueError("unknown solver; valid ones are 'exact', 'minimize'")
 
     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
         """
         Estimates the misclassification rates.
@@ -453,6 +455,91 @@ class ACC(AggregativeCrispQuantifier):
         return F.optim_minimize(loss, n_classes=A.shape[0])
 
 
+class BayesianCC(AggregativeCrispQuantifier):
+    """
+    `Bayesian quantification `_ methods,
+    which is a variant of :class`ACC` that calculates the posterior probability distribution
+    over the prevalence vectors, rather than providing a point estimate obtained
+    by matrix inversion.
+
+    Can be used to diagnose degeneracy in the predictions visible when the confusion
+    matrix has high condition number or to quantify uncertainty around the point estimate.
+
+    This method relies on extra dependencies, which have to be installed via:
+    `$ pip install quapy[bayes]`
+
+    :param classifier: a sklearn's Estimator that generates a classifier
+    :param val_split: specifies the data used for generating classifier predictions. This specification
+        should be a float in (0, 1) indicating the proportion of stratified held-out validation set to
+        be extracted from the training set
+    :num_warmup: number of warmup iterations for the MCMC sampler
+    :num_samples: number of samples to draw from the posterior
+    :mcmc_seed: random seed for the MCMC sampler
+    """
+    def __init__(self, classifier: BaseEstimator, val_split: float = 0.75, num_warmup: int = 500, num_samples: int = 1_000, mcmc_seed: int = 0) -> None:
+        if num_warmup <= 0:
+            raise ValueError(f'num_warmup must be a positive integer, got {num_warmup}')
+        if num_samples <= 0:
+            raise ValueError(f'num_samples must be a positive integer, got {num_samples}')
+
+        if (not isinstance(val_split, float)) or val_split <= 0 or val_split >= 1:
+            raise ValueError(f'val_split must be a float in (0, 1), got {val_split}')
+
+        if _bayesian.DEPENDENCIES_INSTALLED is False:
+            raise ImportError("Auxiliary dependencies are required. Run `$ pip install quapy[bayes]` to install them.")
+
+        self.classifier = classifier
+        self.val_split = val_split
+        self.num_warmup = num_warmup
+        self.num_samples = num_samples
+        self.mcmc_seed = mcmc_seed
+
+        # Array of shape (n_classes, n_predicted_classes) where entry (y, c) is the number of instances labeled as class y and predicted as class c
+        # By default it's None and it's set during the `aggregation_fit` phase
+        self._n_and_c_labeled = None
+
+        # Dictionary with posterior samples, set when `aggregate` is provided.
+        self._samples = None
+
+    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+        """
+        Estimates the misclassification rates.
+
+        :param classif_predictions: classifier predictions with true labels
+        """
+        pred_labels, true_labels = classif_predictions.Xy
+        self._n_and_c_labeled = confusion_matrix(y_true=true_labels, y_pred=pred_labels, labels=self.classifier.classes_)
+
+    def sample_from_posterior(self, classif_predictions):
+        if self._n_and_c_labeled is None:
+            raise ValueError("aggregation_fit must be called before sample_from_posterior")
+
+        n_c_unlabeled = F.counts_from_labels(classif_predictions, self.classifier.classes_)
+
+        self._samples = _bayesian.sample_posterior(
+            n_c_unlabeled=n_c_unlabeled,
+            n_y_and_c_labeled=self._n_and_c_labeled,
+            num_warmup=self.num_warmup,
+            num_samples=self.num_samples,
+            seed=self.mcmc_seed,
+        )
+        return self._samples
+
+    def get_prevalence_samples(self):
+        if self._samples is None:
+            raise ValueError("sample_from_posterior must be called before get_prevalence_samples")
+        return self._samples[_bayesian.P_TEST_Y]
+
+    def get_conditional_probability_samples(self):
+        if self._samples is None:
+            raise ValueError("sample_from_posterior must be called before get_conditional_probability_samples")
+        return self._samples[_bayesian.P_C_COND_Y]
+
+    def aggregate(self, classif_predictions):
+        samples = self.sample_from_posterior(classif_predictions)[_bayesian.P_TEST_Y]
+        return np.asarray(samples.mean(axis=0), dtype=float)
+
+
 class PCC(AggregativeSoftQuantifier):
     """
     `Probabilistic Classify & Count `_,

diff --git a/setup.py b/setup.py
index 9ccb348..1f6c6fb 100644
--- a/setup.py
+++ b/setup.py
@@ -123,10 +123,9 @@ setup(
     #
     # Similar to `install_requires` above, these must be valid existing
     # projects.
-    # extras_require={  # Optional
-    #     'dev': ['check-manifest'],
-    #     'test': ['coverage'],
-    # },
+    extras_require={  # Optional
+        'bayes': ['jax', 'jaxlib', 'numpyro'],
+    },
 
     # If there are data files included in your packages that need to be
     # installed, specify them here.
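
Note on the quapy/functional.py refactoring above: it splits counting and normalization into two functions. As a quick illustration (a hypothetical snippet, not part of the patches; the inputs are made up), observe how a class listed in `classes` but absent from `labels` still gets an entry:

    import numpy as np
    import quapy.functional as F

    labels = np.array([0, 0, 1, 0])
    F.counts_from_labels(labels, classes=[0, 1, 2])      # -> array([3, 1, 0])
    F.prevalence_from_labels(labels, classes=[0, 1, 2])  # -> array([0.75, 0.25, 0.  ])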
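
The probabilistic model in quapy/method/_bayesian.py can also be exercised directly, which makes its inputs and outputs explicit. Below is a minimal sketch (illustrative only; the confusion counts and hyperparameters are invented, and the optional dependencies must be installed) that draws posterior samples of the test prevalence from synthetic counts:

    import numpy as np
    from quapy.method import _bayesian

    # labeled validation counts: entry (y, c) is the number of instances with
    # true class y predicted as class c (e.g., 40 of the 50 class-0 instances
    # were predicted as class 0)
    n_y_and_c_labeled = np.array([[40, 10],
                                  [ 5, 45]])

    # unlabeled test counts: 160 instances predicted as class 0, 40 as class 1
    n_c_unlabeled = np.array([160, 40])

    samples = _bayesian.sample_posterior(
        n_c_unlabeled=n_c_unlabeled,
        n_y_and_c_labeled=n_y_and_c_labeled,
        num_warmup=500,
        num_samples=1000,
    )

    # posterior samples of P_test(Y): one prevalence vector per MCMC draw
    pi_samples = samples[_bayesian.P_TEST_Y]                # shape (1000, 2)
    print(pi_samples.mean(axis=0))                          # point estimate
    print(np.percentile(pi_samples, [2.5, 97.5], axis=0))   # 95% credible interval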
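
And an end-to-end usage sketch of the new BayesianCC quantifier (again hypothetical and not part of the patch series: it assumes the optional dependencies have been installed via `pip install quapy[bayes]`; the LogisticRegression classifier and the 'imdb' reviews dataset are arbitrary choices):

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    import quapy as qp
    from quapy.method.aggregative import BayesianCC

    train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test

    # val_split=0.75: per the docstring above, the proportion of the training
    # set held out to collect the confusion counts used by the Bayesian model
    quantifier = BayesianCC(classifier=LogisticRegression(), val_split=0.75)
    quantifier.fit(train)

    # point estimate: the mean of the posterior prevalence samples
    prev_estim = quantifier.quantify(test.instances)

    # full posterior: one prevalence vector per MCMC sample, from which
    # credible intervals around the point estimate can be derived
    samples = quantifier.get_prevalence_samples()  # shape (num_samples, n_classes)
    low, high = np.percentile(samples, [2.5, 97.5], axis=0)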
From 3921b8368e2b50fbf96c595e8458b4360b9b7867 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Fri, 15 Mar 2024 16:24:45 +0100
Subject: [PATCH 4/4] merging BayesianCC implemented by Pawel Czyz

---
 quapy/method/__init__.py        |   4 +-
 quapy/method/_bayesian.py       |   1 +
 quapy/method/aggregative.py     | 246 ++++++++++++++++++--------------
 quapy/method/non_aggregative.py |   1 +
 4 files changed, 143 insertions(+), 109 deletions(-)

diff --git a/quapy/method/__init__.py b/quapy/method/__init__.py
index de57d96..51b02c2 100644
--- a/quapy/method/__init__.py
+++ b/quapy/method/__init__.py
@@ -20,11 +20,13 @@ AGGREGATIVE_METHODS = {
     aggregative.KDEyML,
     aggregative.KDEyCS,
     aggregative.KDEyHD,
+    aggregative.BayesianCC
 }
 
 NON_AGGREGATIVE_METHODS = {
-    non_aggregative.MaximumLikelihoodPrevalenceEstimation
+    non_aggregative.MaximumLikelihoodPrevalenceEstimation,
+    non_aggregative.DMx
 }
 
 META_METHODS = {

diff --git a/quapy/method/_bayesian.py b/quapy/method/_bayesian.py
index 78a2c66..c783f10 100644
--- a/quapy/method/_bayesian.py
+++ b/quapy/method/_bayesian.py
@@ -72,6 +72,7 @@ def sample_posterior(
         numpyro.infer.NUTS(model),
         num_warmup=num_warmup,
         num_samples=num_samples,
+        progress_bar=False
     )
     rng_key = jax.random.PRNGKey(seed)
     mcmc.run(rng_key, n_c_unlabeled=n_c_unlabeled, n_y_and_c_labeled=n_y_and_c_labeled)
     return mcmc.get_samples()

diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index a608053..da98358 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -11,12 +11,13 @@ from sklearn.model_selection import cross_val_predict
 
 import quapy as qp
 import quapy.functional as F
-import quapy._bayesian as _bayesian
 from quapy.functional import get_divergence
 from quapy.classification.calibration import NBVSCalibration, BCTSCalibration, TSCalibration, VSCalibration
 from quapy.classification.svmperf import SVMperf
 from quapy.data import LabelledCollection
 from quapy.method.base import BaseQuantifier, BinaryQuantifier, OneVsAllGeneric
+from quapy.method import _bayesian
+
 
 # Abstract classes
@@ -163,8 +164,8 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
         """
         Trains the aggregation function.
 
-        :param classif_predictions: a LabelledCollection containing the label predictions issued
-        by the classifier
+        :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing,
+        as instances, the predictions issued by the classifier and, as labels, the true labels
         :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
         """
         ...
@@ -336,7 +337,8 @@ class CC(AggregativeCrispQuantifier):
         """
         Nothing to do here!
 
-        :param classif_predictions: this is actually None
+        :param classif_predictions: not used
+        :param data: not used
         """
         pass
@@ -392,7 +394,9 @@ class ACC(AggregativeCrispQuantifier):
         """
         Estimates the misclassification rates.
 
-        :param classif_predictions: classifier predictions with true labels
+        :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing,
+        as instances, the label predictions issued by the classifier and, as labels, the true labels
+        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
         """
         pred_labels, true_labels = classif_predictions.Xy
         self.cc = CC(self.classifier)
@@ -455,91 +459,6 @@ class ACC(AggregativeCrispQuantifier):
         return F.optim_minimize(loss, n_classes=A.shape[0])
 
 
-class BayesianCC(AggregativeCrispQuantifier):
-    """
-    `Bayesian quantification `_ methods,
-    which is a variant of :class`ACC` that calculates the posterior probability distribution
-    over the prevalence vectors, rather than providing a point estimate obtained
-    by matrix inversion.
-
-    Can be used to diagnose degeneracy in the predictions visible when the confusion
-    matrix has high condition number or to quantify uncertainty around the point estimate.
-
-    This method relies on extra dependencies, which have to be installed via:
-    `$ pip install quapy[bayes]`
-
-    :param classifier: a sklearn's Estimator that generates a classifier
-    :param val_split: specifies the data used for generating classifier predictions. This specification
-        should be a float in (0, 1) indicating the proportion of stratified held-out validation set to
-        be extracted from the training set
-    :num_warmup: number of warmup iterations for the MCMC sampler
-    :num_samples: number of samples to draw from the posterior
-    :mcmc_seed: random seed for the MCMC sampler
-    """
-    def __init__(self, classifier: BaseEstimator, val_split: float = 0.75, num_warmup: int = 500, num_samples: int = 1_000, mcmc_seed: int = 0) -> None:
-        if num_warmup <= 0:
-            raise ValueError(f'num_warmup must be a positive integer, got {num_warmup}')
-        if num_samples <= 0:
-            raise ValueError(f'num_samples must be a positive integer, got {num_samples}')
-
-        if (not isinstance(val_split, float)) or val_split <= 0 or val_split >= 1:
-            raise ValueError(f'val_split must be a float in (0, 1), got {val_split}')
-
-        if _bayesian.DEPENDENCIES_INSTALLED is False:
-            raise ImportError("Auxiliary dependencies are required. Run `$ pip install quapy[bayes]` to install them.")
-
-        self.classifier = classifier
-        self.val_split = val_split
-        self.num_warmup = num_warmup
-        self.num_samples = num_samples
-        self.mcmc_seed = mcmc_seed
-
-        # Array of shape (n_classes, n_predicted_classes) where entry (y, c) is the number of instances labeled as class y and predicted as class c
-        # By default it's None and it's set during the `aggregation_fit` phase
-        self._n_and_c_labeled = None
-
-        # Dictionary with posterior samples, set when `aggregate` is provided.
-        self._samples = None
-
-    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
-        """
-        Estimates the misclassification rates.
-
-        :param classif_predictions: classifier predictions with true labels
-        """
-        pred_labels, true_labels = classif_predictions.Xy
-        self._n_and_c_labeled = confusion_matrix(y_true=true_labels, y_pred=pred_labels, labels=self.classifier.classes_)
-
-    def sample_from_posterior(self, classif_predictions):
-        if self._n_and_c_labeled is None:
-            raise ValueError("aggregation_fit must be called before sample_from_posterior")
-
-        n_c_unlabeled = F.counts_from_labels(classif_predictions, self.classifier.classes_)
-
-        self._samples = _bayesian.sample_posterior(
-            n_c_unlabeled=n_c_unlabeled,
-            n_y_and_c_labeled=self._n_and_c_labeled,
-            num_warmup=self.num_warmup,
-            num_samples=self.num_samples,
-            seed=self.mcmc_seed,
-        )
-        return self._samples
-
-    def get_prevalence_samples(self):
-        if self._samples is None:
-            raise ValueError("sample_from_posterior must be called before get_prevalence_samples")
-        return self._samples[_bayesian.P_TEST_Y]
-
-    def get_conditional_probability_samples(self):
-        if self._samples is None:
-            raise ValueError("sample_from_posterior must be called before get_conditional_probability_samples")
-        return self._samples[_bayesian.P_C_COND_Y]
-
-    def aggregate(self, classif_predictions):
-        samples = self.sample_from_posterior(classif_predictions)[_bayesian.P_TEST_Y]
-        return np.asarray(samples.mean(axis=0), dtype=float)
-
-
 class PCC(AggregativeSoftQuantifier):
     """
     `Probabilistic Classify & Count `_,
@@ -555,7 +474,8 @@ class PCC(AggregativeSoftQuantifier):
         """
         Nothing to do here!
 
-        :param classif_predictions: this is actually None
+        :param classif_predictions: not used
+        :param data: not used
         """
         pass
@@ -603,7 +523,9 @@ class PACC(AggregativeSoftQuantifier):
         """
         Estimates the misclassification rates
 
-        :param classif_predictions: classifier soft predictions with true labels
+        :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing,
+        as instances, the posterior probabilities issued by the classifier and, as labels, the true labels
+        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
         """
         posteriors, true_labels = classif_predictions.Xy
         self.pcc = PCC(self.classifier)
@@ -713,6 +635,14 @@ class EMQ(AggregativeSoftQuantifier):
         return posteriors
 
     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+        """
+        Trains the aggregation function of EMQ. This comes down to recalibrating the posterior probabilities
+        if requested.
+
+        :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing,
+        as instances, the posterior probabilities issued by the classifier and, as labels, the true labels
+        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
+        """
         if self.recalib is not None:
             P, y = classif_predictions.Xy
             if self.recalib == 'nbvs':
@@ -799,6 +729,99 @@ class EMQ(AggregativeSoftQuantifier):
         return qs, ps
 
 
+class BayesianCC(AggregativeCrispQuantifier):
+    """
+    `Bayesian quantification `_ method,
+    which is a variant of :class:`ACC` that calculates the posterior probability distribution
+    over the prevalence vectors, rather than providing a point estimate obtained
+    by matrix inversion.
+
+    Can be used to diagnose degeneracy in the predictions visible when the confusion
+    matrix has high condition number or to quantify uncertainty around the point estimate.
+
+    This method relies on extra dependencies, which have to be installed via:
+    `$ pip install quapy[bayes]`
+
+    :param classifier: a sklearn's Estimator that generates a classifier
+    :param val_split: a float in (0, 1) indicating the proportion of the training data to be used,
+        as a stratified held-out validation set, for generating classifier predictions.
+    :param num_warmup: number of warmup iterations for the MCMC sampler (default 500)
+    :param num_samples: number of samples to draw from the posterior (default 1000)
+    :param mcmc_seed: random seed for the MCMC sampler (default 0)
+    """
+    def __init__(self,
+                 classifier: BaseEstimator,
+                 val_split: float = 0.75,
+                 num_warmup: int = 500,
+                 num_samples: int = 1_000,
+                 mcmc_seed: int = 0):
+
+        if num_warmup <= 0:
+            raise ValueError(f'parameter {num_warmup=} must be a positive integer')
+        if num_samples <= 0:
+            raise ValueError(f'parameter {num_samples=} must be a positive integer')
+
+        if (not isinstance(val_split, float)) or val_split <= 0 or val_split >= 1:
+            raise ValueError(f'val_split must be a float in (0, 1), got {val_split}')
+
+        if _bayesian.DEPENDENCIES_INSTALLED is False:
+            raise ImportError("Auxiliary dependencies are required. Run `$ pip install quapy[bayes]` to install them.")
+
+        self.classifier = classifier
+        self.val_split = val_split
+        self.num_warmup = num_warmup
+        self.num_samples = num_samples
+        self.mcmc_seed = mcmc_seed
+
+        # Array of shape (n_classes, n_predicted_classes,) where entry (y, c) is the number of instances
+        # labeled as class y and predicted as class c.
+        # By default, this array is set to None and later defined as part of the `aggregation_fit` phase
+        self._n_and_c_labeled = None
+
+        # Dictionary with posterior samples, set when `aggregate` is called.
+        self._samples = None
+
+    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+        """
+        Estimates the misclassification rates.
+
+        :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing,
+        as instances, the label predictions issued by the classifier and, as labels, the true labels
+        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
+        """
+        pred_labels, true_labels = classif_predictions.Xy
+        self._n_and_c_labeled = confusion_matrix(y_true=true_labels, y_pred=pred_labels, labels=self.classifier.classes_)
+
+    def sample_from_posterior(self, classif_predictions):
+        if self._n_and_c_labeled is None:
+            raise ValueError("aggregation_fit must be called before sample_from_posterior")
+
+        n_c_unlabeled = F.counts_from_labels(classif_predictions, self.classifier.classes_)
+
+        self._samples = _bayesian.sample_posterior(
+            n_c_unlabeled=n_c_unlabeled,
+            n_y_and_c_labeled=self._n_and_c_labeled,
+            num_warmup=self.num_warmup,
+            num_samples=self.num_samples,
+            seed=self.mcmc_seed,
+        )
+        return self._samples
+
+    def get_prevalence_samples(self):
+        if self._samples is None:
+            raise ValueError("sample_from_posterior must be called before get_prevalence_samples")
+        return self._samples[_bayesian.P_TEST_Y]
+
+    def get_conditional_probability_samples(self):
+        if self._samples is None:
+            raise ValueError("sample_from_posterior must be called before get_conditional_probability_samples")
+        return self._samples[_bayesian.P_C_COND_Y]
+
+    def aggregate(self, classif_predictions):
+        samples = self.sample_from_posterior(classif_predictions)[_bayesian.P_TEST_Y]
+        return np.asarray(samples.mean(axis=0), dtype=float)
+
+
 class HDy(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
     """
     `Hellinger Distance y `_ (HDy).
@@ -820,14 +843,11 @@ class HDy(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
 
     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
         """
-        Trains a HDy quantifier.
+        Trains the aggregation function of HDy.
 
-        :param data: the training set
-        :param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
-        :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
-         validation (e.g., 0.3 for using 30% of the training set as validation data), or a
-         :class:`quapy.data.base.LabelledCollection` indicating the validation set itself
-        :return: self
+        :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing,
+        as instances, the posterior probabilities issued by the classifier and, as labels, the true labels
+        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
         """
         P, y = classif_predictions.Xy
         Px = P[:, self.pos_label]  # takes only the P(y=+1|x)
@@ -844,8 +864,6 @@ class HDy(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
         self.Pxy1_density = {bins: hist(self.Pxy1, bins) for bins in self.bins}
         self.Pxy0_density = {bins: hist(self.Pxy0, bins) for bins in self.bins}
 
-        return self
-
     def aggregate(self, classif_posteriors):
         # "In this work, the number of bins b used in HDx and HDy was chosen from 10 to 110 in steps of 10,
         # and the final estimated a priori probability was taken as the median of these 11 estimates."
@@ -920,6 +938,13 @@ class DyS(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
         return (left + right) / 2
 
     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+        """
+        Trains the aggregation function of DyS.
+
+        :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing,
+        as instances, the posterior probabilities issued by the classifier and, as labels, the true labels
+        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
+        """
         Px, y = classif_predictions.Xy
         Px = Px[:, self.pos_label]  # takes only the P(y=+1|x)
         self.Pxy1 = Px[y == self.pos_label]
@@ -958,6 +983,13 @@ class SMM(AggregativeSoftQuantifier, BinaryAggregativeQuantifier):
         self.val_split = val_split
 
     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+        """
+        Trains the aggregation function of SMM.
+
+        :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing,
+        as instances, the posterior probabilities issued by the classifier and, as labels, the true labels
+        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
+        """
         Px, y = classif_predictions.Xy
         Px = Px[:, self.pos_label]  # takes only the P(y=+1|x)
         self.Pxy1 = Px[y == self.pos_label]
@@ -1031,19 +1063,17 @@ class DMy(AggregativeSoftQuantifier):
 
     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
         """
-        Trains the classifier (if requested) and generates the validation distributions out of the training data.
+        Trains the aggregation function of a distribution matching method. This comes down to generating the
+        validation distributions out of the training data.
         The validation distributions have shape `(n, ch, nbins)`, with `n` the number of classes, `ch` the number of
         channels, and `nbins` the number of bins. In particular, let `V` be the validation distributions; then `di=V[i]`
         are the distributions obtained from training data labelled with class `i`; while `dij = di[j]` is the discrete
         distribution of posterior probabilities `P(Y=j|X=x)` for training data labelled with class `i`, and `dij[k]`
         is the fraction of instances with a value in the `k`-th bin.
 
-        :param data: the training set
-        :param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
-        :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
-         validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
-         indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
-         to estimate the parameters
+        :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing,
+        as instances, the posterior probabilities issued by the classifier and, as labels, the true labels
+        :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
         """
         posteriors, true_labels = classif_predictions.Xy
         n_classes = len(self.classifier.classes_)

diff --git a/quapy/method/non_aggregative.py b/quapy/method/non_aggregative.py
index 02f133b..4104a3f 100644
--- a/quapy/method/non_aggregative.py
+++ b/quapy/method/non_aggregative.py
@@ -150,6 +150,7 @@ class DMx(BaseQuantifier):
 
 class ReadMe(BaseQuantifier):
 
     def __init__(self, bootstrap_trials=100, bootstrap_range=100, bagging_trials=100, bagging_range=25, **vectorizer_kwargs):
+        raise NotImplementedError('under development ...')
         self.bootstrap_trials = bootstrap_trials
         self.bootstrap_range = bootstrap_range
         self.bagging_trials = bagging_trials
         self.bagging_range = bagging_range