refactoring the aggregative quantifiers

This commit is contained in:
Alejandro Moreo Fernandez 2023-11-12 14:45:03 +01:00
parent 25f1cc29a3
commit 0a6185d908
2 changed files with 85 additions and 174 deletions

View File

@ -44,12 +44,11 @@ class AggregativeQuantifier(ABC, BaseQuantifier):
learner has been trained outside the quantifier. learner has been trained outside the quantifier.
:return: self :return: self
""" """
classif_predictions = self.classification_fit(data, fit_classifier) classif_predictions = self.classifier_fit_predict(data, fit_classifier)
self.aggregation_fit(classif_predictions) self.aggregation_fit(classif_predictions)
return self return self
@abstractmethod def classifier_fit_predict(self, data: LabelledCollection, fit_classifier=True, predict_on=None):
def classification_fit(self, data: LabelledCollection, fit_classifier=True):
""" """
Trains the classifier if requested (`fit_classifier=True`) and generate the necessary predictions to Trains the classifier if requested (`fit_classifier=True`) and generate the necessary predictions to
train the aggregation function. train the aggregation function.
@ -57,11 +56,62 @@ class AggregativeQuantifier(ABC, BaseQuantifier):
:param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
:param fit_classifier: whether to train the learner (default is True). Set to False if the :param fit_classifier: whether to train the learner (default is True). Set to False if the
learner has been trained outside the quantifier. learner has been trained outside the quantifier.
:param predict_on: specifies the set on which predictions need to be issued. This parameter can
be specified as None (default) to indicate no prediction is needed; a float in (0, 1) to
indicate the proportion of instances to be used for predictions (the remainder is used for
training); an integer >1 to indicate that the predictions must be generated via k-fold
cross-validation, using this integer as k; or the data sample itself on which to generate
the predictions.
""" """
... assert isinstance(fit_classifier, bool), 'unexpected type for "fit_classifier", must be boolean'
self.__check_classifier()
if predict_on is None:
if fit_classifier:
self.classifier.fit(*data.Xy)
predictions = None
elif isinstance(predict_on, float):
if fit_classifier:
if not (0. < predict_on < 1.):
raise ValueError(f'proportion {predict_on=} out of range, must be in (0,1)')
train, val = data.split_stratified(train_prop=(1 - predict_on))
self.classifier.fit(*train.Xy)
predictions = (self.classify(val.X), val.y)
else:
raise ValueError(f'wrong type for predict_on: since fit_classifier=False, '
f'the set on which predictions have to be issued must be '
f'explicitly indicated')
elif isinstance(predict_on, LabelledCollection):
if fit_classifier:
self.classifier.fit(*data.Xy)
predictions = (self.classify(predict_on.X), predict_on.y)
elif isinstance(predict_on, int):
if fit_classifier:
if not predict_on > 1:
raise ValueError(f'invalid value {predict_on} in fit. '
f'Specify a integer >1 for kFCV estimation.')
predictions = cross_val_predict(
classifier, *data.Xy, cv=predict_on, n_jobs=self.n_jobs, method=self.__classifier_method())
self.classifier.fit(*data.Xy)
else:
raise ValueError(f'wrong type for predict_on: since fit_classifier=False, '
f'the set on which predictions have to be issued must be '
f'explicitly indicated')
else:
raise ValueError(
f'error: param "predict_on" ({type(predict_on)}) not understood; '
f'use either a float indicating the split proportion, or a '
f'tuple (X,y) indicating the validation partition')
return predictions
@abstractmethod @abstractmethod
def aggregation_fit(self, classif_predictions): def aggregation_fit(self, classif_predictions: LabelledCollection):
""" """
Trains the aggregation function. Trains the aggregation function.
@ -99,6 +149,13 @@ class AggregativeQuantifier(ABC, BaseQuantifier):
""" """
return self.classifier.predict(instances) return self.classifier.predict(instances)
@property
def __classifier_method(self):
return 'predict'
def __check_classifier(self, adapt_if_necessary=False):
assert hasattr(self.classifier, 'predict')
def quantify(self, instances): def quantify(self, instances):
""" """
Generate class prevalence estimates for the sample's instances by aggregating the label predictions generated Generate class prevalence estimates for the sample's instances by aggregating the label predictions generated
@ -142,106 +199,20 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier, ABC):
def classify(self, instances): def classify(self, instances):
return self.classifier.predict_proba(instances) return self.classifier.predict_proba(instances)
@property
def __classifier_method(self):
return 'predict_proba'
# Helper def __check_classifier(self, adapt_if_necessary=False):
# ------------------------------------ if not hasattr(self.classifier, 'predict_proba'):
def _ensure_probabilistic(classifier): if adapt_if_necessary:
if not hasattr(classifier, 'predict_proba'): print(f'warning: The learner {self.classifier.__class__.__name__} does not seem to be '
print(f'The learner {classifier.__class__.__name__} does not seem to be probabilistic. ' f'probabilistic. The learner will be calibrated (using CalibratedClassifierCV).')
f'The learner will be calibrated.') self.classifier = CalibratedClassifierCV(self.classifier, cv=5)
classifier = CalibratedClassifierCV(classifier, cv=5)
return classifier
def _training_helper(classifier,
data: LabelledCollection,
fit_classifier: bool = True,
ensure_probabilistic=False,
val_split: Union[LabelledCollection, float] = None):
"""
Training procedure common to all Aggregative Quantifiers.
:param classifier: the learner to be fit
:param data: the data on which to fit the learner. If requested, the data will be split before fitting the learner.
:param fit_classifier: whether or not to fit the learner (if False, then bypasses any action)
:param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the
learner is not probabilistic, then a CalibratedCV instance of it is trained)
:param val_split: if specified as a float, indicates the proportion of training instances that will define the
validation split (e.g., 0.3 for using 30% of the training set as validation data); if specified as a
LabelledCollection, represents the validation split itself
:return: the learner trained on the training set, and the unused data (a _LabelledCollection_ if train_val_split>0
or None otherwise) to be used as a validation set for any subsequent parameter fitting
"""
if fit_classifier:
if ensure_probabilistic:
classifier = _ensure_probabilistic(classifier)
if val_split is not None:
if isinstance(val_split, float):
if not (0 < val_split < 1):
raise ValueError(f'train/val split {val_split} out of range, must be in (0,1)')
train, unused = data.split_stratified(train_prop=1 - val_split)
elif isinstance(val_split, LabelledCollection):
train = data
unused = val_split
else: else:
raise ValueError( raise AssertionError(f'error: The learner {self.classifier.__class__.__name__} does not '
f'param "val_split" ({type(val_split)}) not understood; use either a float indicating the split ' f'seem to be probabilistic. The learner cannot be calibrated since '
'proportion, or a LabelledCollection indicating the validation split') f'fit_classifier is set to False')
else:
train, unused = data, None
if isinstance(classifier, BaseQuantifier):
classifier.fit(train)
else:
classifier.fit(*train.Xy)
else:
if ensure_probabilistic:
if not hasattr(classifier, 'predict_proba'):
raise AssertionError('error: the learner cannot be calibrated since fit_classifier is set to False')
unused = None
if isinstance(val_split, LabelledCollection):
unused = val_split
return classifier, unused
def cross_generate_predictions(
data,
classifier,
val_split,
probabilistic,
fit_classifier,
n_jobs
):
n_jobs = qp._get_njobs(n_jobs)
if isinstance(val_split, int):
assert fit_classifier == True, \
'the parameters for the adjustment cannot be estimated with kFCV with fit_classifier=False'
if probabilistic:
classifier = _ensure_probabilistic(classifier)
predict = 'predict_proba'
else:
predict = 'predict'
y_pred = cross_val_predict(classifier, *data.Xy, cv=val_split, n_jobs=n_jobs, method=predict)
class_count = data.counts()
# fit the learner on all data
classifier.fit(*data.Xy)
y = data.y
classes = data.classes_
else:
classifier, val_data = _training_helper(
classifier, data, fit_classifier, ensure_probabilistic=probabilistic, val_split=val_split
)
y_pred = classifier.predict_proba(val_data.instances) if probabilistic else classifier.predict(val_data.instances)
y = val_data.labels
classes = val_data.classes_
class_count = val_data.counts()
return classifier, y, y_pred, classes, class_count
# Methods # Methods
@ -257,19 +228,7 @@ class CC(AggregativeQuantifier):
def __init__(self, classifier: BaseEstimator): def __init__(self, classifier: BaseEstimator):
self.classifier = classifier self.classifier = classifier
def classification_fit(self, data: LabelledCollection, fit_classifier=True): def aggregation_fit(self, classif_predictions: LabelledCollection):
"""
Trains the classifier unless `fit_classifier` is False, in which case, the classifier is assumed to
be already fit and there is nothing else to do.
:param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
:param fit_classifier: if False, the classifier is assumed to be fit
:return: self
"""
self.classifier, _ = _training_helper(self.classifier, data, fit_classifier)
return None
def aggregation_fit(self, classif_predictions: np.ndarray):
""" """
Nothing to do here! Nothing to do here!
@ -307,33 +266,11 @@ class ACC(AggregativeQuantifier):
self.val_split = val_split self.val_split = val_split
self.n_jobs = qp._get_njobs(n_jobs) self.n_jobs = qp._get_njobs(n_jobs)
def classification_fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, int, LabelledCollection] = None): def aggregation_fit(self, classif_predictions: LabelledCollection):
""" """
Trains the classifier and generates, optionally through a cross-validation procedure, the predictions Estimates the misclassification rates.
needed for estimating the misclassification rates matrix.
:param data: the training set :param classif_predictions: classifier predictions with true labels
:param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
indicating the validation set itself, or an int indicating the number `k` of folds to be used in `k`-fold
cross validation to estimate the parameters
:return: self
"""
if val_split is None:
val_split = self.val_split
self.classifier, true_labels, pred_labels, classes, class_count = cross_generate_predictions(
data, self.classifier, val_split, probabilistic=False, fit_classifier=fit_classifier, n_jobs=self.n_jobs
)
return (true_labels, pred_labels)
def aggregation_fit(self, classif_predictions):
"""
Nothing to do here!
:param classif_predictions: this is actually None
""" """
true_labels, pred_labels = classif_predictions true_labels, pred_labels = classif_predictions
self.cc = CC(self.classifier) self.cc = CC(self.classifier)
@ -393,11 +330,7 @@ class PCC(AggregativeProbabilisticQuantifier):
def __init__(self, classifier: BaseEstimator): def __init__(self, classifier: BaseEstimator):
self.classifier = classifier self.classifier = classifier
def classification_fit(self, data: LabelledCollection, fit_classifier=True): def aggregation_fit(self, classif_predictions: LabelledCollection):
self.classifier, _ = _training_helper(self.classifier, data, fit_classifier, ensure_probabilistic=True)
return self
def aggregation_fit(self, classif_predictions: np.ndarray):
""" """
Nothing to do here! Nothing to do here!
@ -429,33 +362,11 @@ class PACC(AggregativeProbabilisticQuantifier):
self.val_split = val_split self.val_split = val_split
self.n_jobs = qp._get_njobs(n_jobs) self.n_jobs = qp._get_njobs(n_jobs)
def classification_fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, int, LabelledCollection] = None): def aggregation_fit(self, classif_predictions: LabelledCollection):
""" """
Trains the soft classifier and generates, optionally through a cross-validation procedure, the posterior Estimates the misclassification rates
probabilities needed for estimating the misclassification rates matrix.
:param data: the training set :param classif_predictions: classifier predictions with true labels
:param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
indicating the validation set itself, or an int indicating the number `k` of folds to be used in `k`-fold
cross validation to estimate the parameters
:return: self
"""
if val_split is None:
val_split = self.val_split
self.classifier, true_labels, posteriors, classes, class_count = cross_generate_predictions(
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
)
return (true_labels, posteriors)
def aggregation_fit(self, classif_predictions):
"""
Nothing to do here!
:param classif_predictions: this is actually None
""" """
true_labels, posteriors = classif_predictions true_labels, posteriors = classif_predictions
self.pcc = PCC(self.classifier) self.pcc = PCC(self.classifier)
@ -509,7 +420,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
self.exact_train_prev = exact_train_prev self.exact_train_prev = exact_train_prev
self.recalib = recalib self.recalib = recalib
def classification_fit(self, data: LabelledCollection, fit_classifier=True): def classifier_fit_predict(self, data: LabelledCollection, fit_classifier=True):
self.classifier, true_labels, posteriors, classes, class_count = cross_generate_predictions( self.classifier, true_labels, posteriors, classes, class_count = cross_generate_predictions(
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
) )
@ -842,7 +753,7 @@ class DMy(AggregativeProbabilisticQuantifier):
distributions = np.cumsum(distributions, axis=1) distributions = np.cumsum(distributions, axis=1)
return distributions return distributions
def classification_fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None): def classifier_fit_predict(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
""" """
Trains the classifier (if requested) and generates the validation distributions out of the training data. Trains the classifier (if requested) and generates the validation distributions out of the training data.
The validation distributions have shape `(n, ch, nbins)`, with `n` the number of classes, `ch` the number of The validation distributions have shape `(n, ch, nbins)`, with `n` the number of classes, `ch` the number of

View File

@ -63,7 +63,7 @@ def newOneVsAll(binary_quantifier, n_jobs=None):
return OneVsAllGeneric(binary_quantifier, n_jobs) return OneVsAllGeneric(binary_quantifier, n_jobs)
class OneVsAllGeneric(OneVsAll,BaseQuantifier): class OneVsAllGeneric(OneVsAll, BaseQuantifier):
""" """
Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
quantifier for each class, and then l1-normalizes the outputs so that the class prevelence values sum up to 1. quantifier for each class, and then l1-normalizes the outputs so that the class prevelence values sum up to 1.