This commit is contained in:
Alejandro Moreo Fernandez 2025-06-15 12:02:40 +02:00
commit 934750ea44
9 changed files with 47 additions and 32 deletions

View File

@ -9,6 +9,7 @@ from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from numpy.random import RandomState from numpy.random import RandomState
from quapy.functional import strprev from quapy.functional import strprev
from quapy.util import temp_seed from quapy.util import temp_seed
import functional as F
class LabelledCollection: class LabelledCollection:
@ -34,8 +35,7 @@ class LabelledCollection:
self.labels = np.asarray(labels) self.labels = np.asarray(labels)
n_docs = len(self) n_docs = len(self)
if classes is None: if classes is None:
self.classes_ = np.unique(self.labels) self.classes_ = F.classes_from_labels(self.labels)
self.classes_.sort()
else: else:
self.classes_ = np.unique(np.asarray(classes)) self.classes_ = np.unique(np.asarray(classes))
self.classes_.sort() self.classes_.sort()

View File

@ -7,6 +7,20 @@ import scipy
import numpy as np import numpy as np
# ------------------------------------------------------------------------------------------
# General utils
# ------------------------------------------------------------------------------------------
def classes_from_labels(labels):
    """
    Obtains a np.ndarray with the (sorted) classes found in a collection of labels.

    :param labels: array-like containing the class labels (one per instance)
    :return: `np.ndarray` with the distinct values found in `labels`, in ascending order
    """
    # np.unique already returns the unique values sorted in ascending order,
    # so no additional in-place sort is required
    return np.unique(labels)
# ------------------------------------------------------------------------------------------ # ------------------------------------------------------------------------------------------
# Counter utils # Counter utils
# ------------------------------------------------------------------------------------------ # ------------------------------------------------------------------------------------------

View File

@ -149,13 +149,13 @@ class QuaNetTrainer(BaseQuantifier):
train_data_embed = LabelledCollection(self.classifier.transform(train_data.instances), train_data.labels, self._classes_) train_data_embed = LabelledCollection(self.classifier.transform(train_data.instances), train_data.labels, self._classes_)
self.quantifiers = { self.quantifiers = {
'cc': CC(self.classifier).fit(None, fit_classifier=False), 'cc': CC(self.classifier, fit_classifier=False).fit(*valid_data.Xy),
'acc': ACC(self.classifier).fit(None, fit_classifier=False, val_split=valid_data), 'acc': ACC(self.classifier, fit_classifier=False).fit(*valid_data.Xy),
'pcc': PCC(self.classifier).fit(None, fit_classifier=False), 'pcc': PCC(self.classifier, fit_classifier=False).fit(*valid_data.Xy),
'pacc': PACC(self.classifier).fit(None, fit_classifier=False, val_split=valid_data), 'pacc': PACC(self.classifier, fit_classifier=False).fit(*valid_data.Xy),
} }
if classifier_data is not None: if classifier_data is not None:
self.quantifiers['emq'] = EMQ(self.classifier).fit(classifier_data, fit_classifier=False) self.quantifiers['emq'] = EMQ(self.classifier, fit_classifier=False).fit(*valid_data.Xy)
self.status = { self.status = {
'tr-loss': -1, 'tr-loss': -1,

View File

@ -100,7 +100,7 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
# consistency checks: fit_classifier? # consistency checks: fit_classifier?
if self.fit_classifier: if self.fit_classifier:
if fitted: if fitted:
raise RuntimeWarning(f'the classifier is already fitted, by {fit_classifier=} was requested') raise RuntimeWarning(f'the classifier is already fitted, but {fit_classifier=} was requested')
else: else:
assert fitted, (f'{fit_classifier=} requires the classifier to be already trained, ' assert fitted, (f'{fit_classifier=} requires the classifier to be already trained, '
f'but this does not seem to be') f'but this does not seem to be')
@ -158,7 +158,7 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
predictions, labels = None, None predictions, labels = None, None
if isinstance(self.val_split, int): if isinstance(self.val_split, int):
assert self.fit_classifier, f'unexpected value for {self.fit_classifier=}' assert self.fit_classifier, f'{self.__class__}: unexpected value for {self.fit_classifier=}'
num_folds = self.val_split num_folds = self.val_split
n_jobs = self.n_jobs if hasattr(self, 'n_jobs') else qp._get_njobs(None) n_jobs = self.n_jobs if hasattr(self, 'n_jobs') else qp._get_njobs(None)
predictions = cross_val_predict(self.classifier, X, y, cv=num_folds, n_jobs=n_jobs, method=self._classifier_method()) predictions = cross_val_predict(self.classifier, X, y, cv=num_folds, n_jobs=n_jobs, method=self._classifier_method())

View File

@ -46,7 +46,7 @@ class BaseQuantifier(BaseEstimator):
:param X: array-like :param X: array-like
:return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates. :return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates.
""" """
... return self.predict(X)
class BinaryQuantifier(BaseQuantifier): class BinaryQuantifier(BaseQuantifier):

View File

@ -450,17 +450,13 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
:param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be
the one indicated in `qp.environ['DEFAULT_CLS']` the one indicated in `qp.environ['DEFAULT_CLS']`
:param fit_classifier: whether to train the learner (default is True). Set to False if the
learner has been trained outside the quantifier.
:param val_split: specifies the data used for generating classifier predictions. This specification :param val_split: specifies the data used for generating classifier predictions. This specification
can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
be extracted from the training set; or as an integer (default 5), indicating that the predictions be extracted from the training set; or as an integer (default 5), indicating that the predictions
are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. for `k`); or as a tuple `(X,y)` defining the specific set of data to use for validation. Set to
This hyperparameter is only meant to be used when the heuristics are to be applied, i.e., if a None when the method does not require any validation data, in order to avoid that some portion of
calibration is required. The default value is None (meaning the calibration is not required). In the training data be wasted.
case this hyperparameter is set to a value other than None, but the calibration is not required
(calib=None), a warning message will be raised.
:param num_warmup: number of warmup iterations for the MCMC sampler (default 500) :param num_warmup: number of warmup iterations for the MCMC sampler (default 500)
:param num_samples: number of samples to draw from the posterior (default 1000) :param num_samples: number of samples to draw from the posterior (default 1000)
:param mcmc_seed: random seed for the MCMC sampler (default 0) :param mcmc_seed: random seed for the MCMC sampler (default 0)
@ -484,11 +480,9 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
if num_samples <= 0: if num_samples <= 0:
raise ValueError(f'parameter {num_samples=} must be a positive integer') raise ValueError(f'parameter {num_samples=} must be a positive integer')
# if (not isinstance(val_split, float)) or val_split <= 0 or val_split >= 1:
# raise ValueError(f'val_split must be a float in (0, 1), got {val_split}')
if _bayesian.DEPENDENCIES_INSTALLED is False: if _bayesian.DEPENDENCIES_INSTALLED is False:
raise ImportError("Auxiliary dependencies are required. Run `$ pip install quapy[bayes]` to install them.") raise ImportError("Auxiliary dependencies are required. "
"Run `$ pip install quapy[bayes]` to install them.")
super().__init__(classifier, fit_classifier, val_split) super().__init__(classifier, fit_classifier, val_split)
self.num_warmup = num_warmup self.num_warmup = num_warmup
@ -514,8 +508,11 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
""" """
pred_labels = classif_predictions pred_labels = classif_predictions
true_labels = labels true_labels = labels
self._n_and_c_labeled = confusion_matrix(y_true=true_labels, y_pred=pred_labels, self._n_and_c_labeled = confusion_matrix(
labels=self.classifier.classes_) y_true=true_labels,
y_pred=pred_labels,
labels=self.classifier.classes_
).astype(float)
def sample_from_posterior(self, classif_predictions): def sample_from_posterior(self, classif_predictions):
if self._n_and_c_labeled is None: if self._n_and_c_labeled is None:

View File

@ -414,15 +414,15 @@ def _delayed_new_instance(args):
sample = data.sampling_from_index(sample_index) sample = data.sampling_from_index(sample_index)
if val_split is not None: if val_split is not None:
model.fit(sample, val_split=val_split) model.fit(*sample.Xy, val_split=val_split)
else: else:
model.fit(sample) model.fit(*sample.Xy)
tr_prevalence = sample.prevalence() tr_prevalence = sample.prevalence()
tr_distribution = get_probability_distribution(posteriors[sample_index]) if (posteriors is not None) else None tr_distribution = get_probability_distribution(posteriors[sample_index]) if (posteriors is not None) else None
if verbose: if verbose:
print(f'\t\--fit-ended for prev {F.strprev(prev)}') print(f'\t--fit-ended for prev {F.strprev(prev)}')
return (model, tr_prevalence, tr_distribution, sample if keep_samples else None) return (model, tr_prevalence, tr_distribution, sample if keep_samples else None)

View File

@ -20,14 +20,16 @@ class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
def __init__(self): def __init__(self):
self._classes_ = None self._classes_ = None
def fit(self, data: LabelledCollection): def fit(self, X, y):
""" """
Computes the training prevalence and stores it. Computes the training prevalence and stores it.
:param data: the training sample :param X: array-like of shape `(n_samples, n_features)`, the training instances
:param y: array-like of shape `(n_samples,)`, the labels
:return: self :return: self
""" """
self.estimated_prevalence = data.prevalence() self._classes_ = F.classes_from_labels(labels=y)
self.estimated_prevalence = F.prevalence_from_labels(y, classes=self._classes_)
return self return self
def predict(self, X): def predict(self, X):
@ -114,9 +116,10 @@ class DMx(BaseQuantifier):
""" """
self.nfeats = X.shape[1] self.nfeats = X.shape[1]
self.feat_ranges = _get_features_range(X) self.feat_ranges = _get_features_range(X)
n_classes = len(np.unique(y))
self.validation_distribution = np.asarray( self.validation_distribution = np.asarray(
[self.__get_distributions(X[y==cat]) for cat in range(data.n_classes)] [self.__get_distributions(X[y==cat]) for cat in range(n_classes)]
) )
return self return self

View File

@ -80,7 +80,7 @@ class TestMethods(unittest.TestCase):
print(f'testing {base_quantifier} on dataset {dataset.name} with {policy=}') print(f'testing {base_quantifier} on dataset {dataset.name} with {policy=}')
ensemble = Ensemble(quantifier=base_quantifier, size=3, policy=policy, n_jobs=-1) ensemble = Ensemble(quantifier=base_quantifier, size=3, policy=policy, n_jobs=-1)
ensemble.fit(dataset.training) ensemble.fit(*dataset.training.Xy)
estim_prevalences = ensemble.predict(dataset.test.instances) estim_prevalences = ensemble.predict(dataset.test.instances)
self.assertTrue(check_prevalence_vector(estim_prevalences)) self.assertTrue(check_prevalence_vector(estim_prevalences))
@ -116,6 +116,7 @@ class TestMethods(unittest.TestCase):
print('testing', q) print('testing', q)
q.fit(*dataset.training.Xy) q.fit(*dataset.training.Xy)
estim_prevalences = q.predict(dataset.test.X) estim_prevalences = q.predict(dataset.test.X)
print(estim_prevalences)
self.assertTrue(check_prevalence_vector(estim_prevalences)) self.assertTrue(check_prevalence_vector(estim_prevalences))