Alejandro Moreo Fernandez 2025-06-15 12:02:40 +02:00
commit 934750ea44
9 changed files with 47 additions and 32 deletions

View File

@@ -9,6 +9,7 @@ from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from numpy.random import RandomState
from quapy.functional import strprev
from quapy.util import temp_seed
import functional as F
class LabelledCollection:
@@ -34,8 +35,7 @@ class LabelledCollection:
self.labels = np.asarray(labels)
n_docs = len(self)
if classes is None:
self.classes_ = np.unique(self.labels)
self.classes_.sort()
self.classes_ = F.classes_from_labels(self.labels)
else:
self.classes_ = np.unique(np.asarray(classes))
self.classes_.sort()
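Note: this hunk swaps the inline unique-and-sort for the new F.classes_from_labels helper defined later in this commit. A minimal sketch of the equivalence (helper body copied from the functional.py hunk below):

import numpy as np

def classes_from_labels(labels):  # copied from the functional.py hunk below
    classes = np.unique(labels)
    classes.sort()  # np.unique already sorts, so this is a defensive no-op
    return classes

labels = np.asarray(['b', 'a', 'c', 'b'])
inline = np.unique(labels)
inline.sort()                                         # old inline version
assert (inline == classes_from_labels(labels)).all()  # new helper agrees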

View File

@@ -7,6 +7,20 @@ import scipy
import numpy as np
# ------------------------------------------------------------------------------------------
# General utils
# ------------------------------------------------------------------------------------------
def classes_from_labels(labels):
"""
Obtains a np.ndarray with the (sorted) classes appearing in `labels`
:param labels: array-like with the labels
:return: np.ndarray with the unique classes, in ascending order
"""
classes = np.unique(labels)
classes.sort()
return classes
# ------------------------------------------------------------------------------------------
# Counter utils
# ------------------------------------------------------------------------------------------
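Usage sketch for the new helper (the import path is an assumption based on this commit's `import functional as F`; quapy ships the module as quapy.functional):

import quapy.functional as F

print(F.classes_from_labels([2, 0, 2, 1]))     # -> array([0, 1, 2])
print(F.classes_from_labels(['b', 'a', 'b']))  # -> array(['a', 'b'], dtype='<U1')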

View File

@@ -149,13 +149,13 @@ class QuaNetTrainer(BaseQuantifier):
train_data_embed = LabelledCollection(self.classifier.transform(train_data.instances), train_data.labels, self._classes_)
self.quantifiers = {
'cc': CC(self.classifier).fit(None, fit_classifier=False),
'acc': ACC(self.classifier).fit(None, fit_classifier=False, val_split=valid_data),
'pcc': PCC(self.classifier).fit(None, fit_classifier=False),
'pacc': PACC(self.classifier).fit(None, fit_classifier=False, val_split=valid_data),
'cc': CC(self.classifier, fit_classifier=False).fit(*valid_data.Xy),
'acc': ACC(self.classifier, fit_classifier=False).fit(*valid_data.Xy),
'pcc': PCC(self.classifier, fit_classifier=False).fit(*valid_data.Xy),
'pacc': PACC(self.classifier, fit_classifier=False).fit(*valid_data.Xy),
}
if classifier_data is not None:
self.quantifiers['emq'] = EMQ(self.classifier).fit(classifier_data, fit_classifier=False)
self.quantifiers['emq'] = EMQ(self.classifier, fit_classifier=False).fit(*valid_data.Xy)
self.status = {
'tr-loss': -1,
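This hunk follows the API change that runs through the whole commit: fit_classifier moves from fit() into the constructor, and fit() takes a raw (X, y) pair instead of a LabelledCollection. A minimal sketch of the new calling convention under that assumption (the dataset and classifier here are placeholders, not part of the diff):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from quapy.method.aggregative import PACC

X, y = make_classification(n_samples=300, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, random_state=0)

clf = LogisticRegression().fit(X_train, y_train)  # classifier trained outside the quantifier
pacc = PACC(clf, fit_classifier=False)            # the flag now lives in the constructor
pacc.fit(X_val, y_val)                            # validation data used only for the aggregation step
prevalence = pacc.predict(X_val)                  # np.ndarray of shape (n_classes,)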

View File

@@ -100,7 +100,7 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
# consistency checks: fit_classifier?
if self.fit_classifier:
if fitted:
raise RuntimeWarning(f'the classifier is already fitted, by {fit_classifier=} was requested')
raise RuntimeWarning(f'the classifier is already fitted, but {fit_classifier=} was requested')
else:
assert fitted, (f'{fit_classifier=} requires the classifier to be already trained, '
f'but this does not seem to be the case')
@@ -158,7 +158,7 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
predictions, labels = None, None
if isinstance(self.val_split, int):
assert self.fit_classifier, f'unexpected value for {self.fit_classifier=}'
assert self.fit_classifier, f'{self.__class__}: unexpected value for {self.fit_classifier=}'
num_folds = self.val_split
n_jobs = self.n_jobs if hasattr(self, 'n_jobs') else qp._get_njobs(None)
predictions = cross_val_predict(self.classifier, X, y, cv=num_folds, n_jobs=n_jobs, method=self._classifier_method())
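For reference, the k-fold branch above relies on scikit-learn's cross_val_predict; a self-contained sketch of the same mechanism (the quantifier routes cv through self.val_split and the method name through self._classifier_method()):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

X, y = make_classification(n_samples=200, random_state=0)
# out-of-fold posteriors: each row is predicted by a model that never saw it in training
posteriors = cross_val_predict(LogisticRegression(), X, y, cv=5, method='predict_proba')
print(posteriors.shape)  # (200, 2)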

View File

@@ -46,7 +46,7 @@ class BaseQuantifier(BaseEstimator):
:param X: array-like
:return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates.
"""
...
return self.predict(X)
class BinaryQuantifier(BaseQuantifier):
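The body change above turns the legacy entry point (presumably quantify(), given the docstring) into a thin alias for predict(). A sketch of the resulting equivalence, assuming the post-commit fit(X, y) API:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import CC

X, y = make_classification(random_state=0)
q = CC(LogisticRegression()).fit(X, y)
assert (q.quantify(X) == q.predict(X)).all()  # quantify() now just forwards to predict()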

View File

@@ -450,17 +450,13 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
:param classifier: a scikit-learn BaseEstimator, or None, in which case the classifier is taken to be
the one indicated in `qp.environ['DEFAULT_CLS']`
:param fit_classifier: whether to train the learner (default is True). Set to False if the
learner has been trained outside the quantifier.
:param val_split: specifies the data used for generating classifier predictions. This specification
can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
be extracted from the training set; or as an integer (default 5), indicating that the predictions
are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
for `k`); or as a tuple (X,y) defining the specific set of data to use for validation.
This hyperparameter is only meant to be used when the heuristics are to be applied, i.e., if a
calibration is required. The default value is None (meaning the calibration is not required). In
case this hyperparameter is set to a value other than None, but the calibration is not required
(calib=None), a warning message will be raised.
for `k`); or as a tuple `(X,y)` defining the specific set of data to use for validation. Set to
None when the method does not require any validation data, so that no portion of the
training data is wasted.
:param num_warmup: number of warmup iterations for the MCMC sampler (default 500)
:param num_samples: number of samples to draw from the posterior (default 1000)
:param mcmc_seed: random seed for the MCMC sampler (default 0)
@@ -484,11 +480,9 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
if num_samples <= 0:
raise ValueError(f'parameter {num_samples=} must be a positive integer')
# if (not isinstance(val_split, float)) or val_split <= 0 or val_split >= 1:
# raise ValueError(f'val_split must be a float in (0, 1), got {val_split}')
if _bayesian.DEPENDENCIES_INSTALLED is False:
raise ImportError("Auxiliary dependencies are required. Run `$ pip install quapy[bayes]` to install them.")
raise ImportError("Auxiliary dependencies are required. "
"Run `$ pip install quapy[bayes]` to install them.")
super().__init__(classifier, fit_classifier, val_split)
self.num_warmup = num_warmup
@@ -514,8 +508,11 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC):
"""
pred_labels = classif_predictions
true_labels = labels
self._n_and_c_labeled = confusion_matrix(y_true=true_labels, y_pred=pred_labels,
labels=self.classifier.classes_)
self._n_and_c_labeled = confusion_matrix(
y_true=true_labels,
y_pred=pred_labels,
labels=self.classifier.classes_
).astype(float)
def sample_from_posterior(self, classif_predictions):
if self._n_and_c_labeled is None:
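The reformatted call also adds an .astype(float) cast that was not there before; sklearn's confusion_matrix returns integer counts, and the float cast presumably suits the downstream MCMC sampler. A self-contained sketch of the stored statistic:

import numpy as np
from sklearn.metrics import confusion_matrix

y_true = np.array([0, 0, 1, 1, 1])
y_pred = np.array([0, 1, 1, 1, 0])
# rows index the true class, columns the predicted class, as float counts
n_and_c = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1]).astype(float)
print(n_and_c)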

View File

@@ -414,15 +414,15 @@ def _delayed_new_instance(args):
sample = data.sampling_from_index(sample_index)
if val_split is not None:
model.fit(sample, val_split=val_split)
model.fit(*sample.Xy, val_split=val_split)
else:
model.fit(sample)
model.fit(*sample.Xy)
tr_prevalence = sample.prevalence()
tr_distribution = get_probability_distribution(posteriors[sample_index]) if (posteriors is not None) else None
if verbose:
print(f'\t\--fit-ended for prev {F.strprev(prev)}')
print(f'\t--fit-ended for prev {F.strprev(prev)}')
return (model, tr_prevalence, tr_distribution, sample if keep_samples else None)
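The ensemble members are now fitted through the (X, y) interface by unpacking the sample's Xy property. A minimal sketch of the pattern, assuming LabelledCollection.Xy returns the (instances, labels) pair:

from quapy.data import LabelledCollection

sample = LabelledCollection(instances=[[0.1], [0.9], [0.4]], labels=[0, 1, 0])
X, y = sample.Xy  # equivalent to (sample.instances, sample.labels)
# hence model.fit(*sample.Xy) is the same call as model.fit(X, y)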

View File

@@ -20,14 +20,16 @@ class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
def __init__(self):
self._classes_ = None
def fit(self, data: LabelledCollection):
def fit(self, X, y):
"""
Computes the training prevalence and stores it.
:param data: the training sample
:param X: array-like of shape `(n_samples, n_features)`, the training instances
:param y: array-like of shape `(n_samples,)`, the labels
:return: self
"""
self.estimated_prevalence = data.prevalence()
self._classes_ = F.classes_from_labels(labels=y)
self.estimated_prevalence = F.prevalence_from_labels(y, classes=self._classes_)
return self
def predict(self, X):
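With the new signature, the baseline just memorizes the training prevalence via the two functional helpers named in the hunk. A sketch of what fit() now computes:

import quapy.functional as F

y_train = [0, 0, 0, 1]
classes = F.classes_from_labels(y_train)                   # -> array([0, 1])
prev = F.prevalence_from_labels(y_train, classes=classes)  # -> array([0.75, 0.25])
# predict(X) then returns `prev` unchanged, whatever X is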
@@ -114,9 +116,10 @@ class DMx(BaseQuantifier):
"""
self.nfeats = X.shape[1]
self.feat_ranges = _get_features_range(X)
n_classes = len(np.unique(y))
self.validation_distribution = np.asarray(
[self.__get_distributions(X[y==cat]) for cat in range(data.n_classes)]
[self.__get_distributions(X[y==cat]) for cat in range(n_classes)]
)
return self
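The hunk fixes a leftover reference to data.n_classes (the old LabelledCollection argument) by deriving the class count from y directly. Note the implicit assumption this keeps: labels must be encoded as 0..n_classes-1 for range(n_classes) to index every class, since len(np.unique(y)) only counts distinct values. A sketch of the per-class slicing:

import numpy as np

X = np.random.rand(6, 3)
y = np.array([0, 1, 2, 1, 0, 2])
n_classes = len(np.unique(y))  # replaces the stale data.n_classes
for cat in range(n_classes):
    X_cat = X[y == cat]        # per-class slice, as fed to __get_distributions
    print(cat, X_cat.shape)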

View File

@@ -80,7 +80,7 @@ class TestMethods(unittest.TestCase):
print(f'testing {base_quantifier} on dataset {dataset.name} with {policy=}')
ensemble = Ensemble(quantifier=base_quantifier, size=3, policy=policy, n_jobs=-1)
ensemble.fit(dataset.training)
ensemble.fit(*dataset.training.Xy)
estim_prevalences = ensemble.predict(dataset.test.instances)
self.assertTrue(check_prevalence_vector(estim_prevalences))
@@ -116,6 +116,7 @@ class TestMethods(unittest.TestCase):
print('testing', q)
q.fit(*dataset.training.Xy)
estim_prevalences = q.predict(dataset.test.X)
print(estim_prevalences)
self.assertTrue(check_prevalence_vector(estim_prevalences))