From 8ee5e499f52b2e588168cd36b00c920ce1e71cf4 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Mon, 14 Mar 2022 16:42:41 +0100 Subject: [PATCH 1/2] bugfix when the number of positive elemnts for one of the classes is 0 --- quapy/method/aggregative.py | 551 +++++++++--------------------------- 1 file changed, 130 insertions(+), 421 deletions(-) diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index c0280a2..59dd868 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -23,109 +23,49 @@ from quapy.method.base import BaseQuantifier, BinaryQuantifier class AggregativeQuantifier(BaseQuantifier): """ Abstract class for quantification methods that base their estimations on the aggregation of classification - results. Aggregative Quantifiers thus implement a :meth:`classify` method and maintain a :attr:`learner` attribute. - Subclasses of this abstract class must implement the method :meth:`aggregate` which computes the aggregation - of label predictions. The method :meth:`quantify` comes with a default implementation based on - :meth:`classify` and :meth:`aggregate`. + results. Aggregative Quantifiers thus implement a _classify_ method and maintain a _learner_ attribute. """ @abstractmethod - def fit(self, data: LabelledCollection, fit_learner=True): - """ - Trains the aggregative quantifier - - :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data - :param fit_learner: whether or not to train the learner (default is True). Set to False if the - learner has been trained outside the quantifier. - :return: self - """ - ... + def fit(self, data: LabelledCollection, fit_learner=True): ... @property def learner(self): - """ - Gives access to the classifier - - :return: the classifier (typically an sklearn's Estimator) - """ return self.learner_ @learner.setter - def learner(self, classifier): - """ - Setter for the classifier + def learner(self, value): + self.learner_ = value - :param classifier: the classifier - """ - self.learner_ = classifier + def preclassify(self, instances): + return self.classify(instances) def classify(self, instances): - """ - Provides the label predictions for the given instances. - - :param instances: array-like - :return: np.ndarray of shape `(n_instances,)` with label predictions - """ return self.learner.predict(instances) def quantify(self, instances): - """ - Generate class prevalence estimates for the sample's instances by aggregating the label predictions generated - by the classifier. - - :param instances: array-like - :return: `np.ndarray` of shape `(self.n_classes_,)` with class prevalence estimates. - """ classif_predictions = self.classify(instances) return self.aggregate(classif_predictions) @abstractmethod - def aggregate(self, classif_predictions: np.ndarray): - """ - Implements the aggregation of label predictions. - - :param classif_predictions: `np.ndarray` of label predictions - :return: `np.ndarray` of shape `(self.n_classes_,)` with class prevalence estimates. - """ - ... + def aggregate(self, classif_predictions: np.ndarray): ... def get_params(self, deep=True): - """ - Return the current parameters of the quantifier. - - :param deep: for compatibility with sklearn - :return: a dictionary of param-value pairs - """ - return self.learner.get_params() def set_params(self, **parameters): - """ - Set the parameters of the quantifier. 
- - :param parameters: dictionary of param-value pairs - """ - self.learner.set_params(**parameters) @property - def classes_(self): - """ - Class labels, in the same order in which class prevalence values are to be computed. - This default implementation actually returns the class labels of the learner. + def n_classes(self): + return len(self.classes_) - :return: array-like - """ + @property + def classes_(self): return self.learner.classes_ @property def aggregative(self): - """ - Returns True, indicating the quantifier is of type aggregative. - - :return: True - """ - return True @@ -137,6 +77,9 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier): probabilities. """ + def preclassify(self, instances): + return self.predict_proba(instances) + def posterior_probabilities(self, instances): return self.learner.predict_proba(instances) @@ -159,24 +102,23 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier): # Helper # ------------------------------------ -def _training_helper(learner, - data: LabelledCollection, - fit_learner: bool = True, - ensure_probabilistic=False, - val_split: Union[LabelledCollection, float] = None): +def training_helper(learner, + data: LabelledCollection, + fit_learner: bool = True, + ensure_probabilistic=False, + val_split: Union[LabelledCollection, float] = None): """ Training procedure common to all Aggregative Quantifiers. - :param learner: the learner to be fit :param data: the data on which to fit the learner. If requested, the data will be split before fitting the learner. :param fit_learner: whether or not to fit the learner (if False, then bypasses any action) :param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the - learner is not probabilistic, then a CalibratedCV instance of it is trained) + learner is not probabilistic, then a CalibratedCV instance of it is trained) :param val_split: if specified as a float, indicates the proportion of training instances that will define the - validation split (e.g., 0.3 for using 30% of the training set as validation data); if specified as a - LabelledCollection, represents the validation split itself + validation split (e.g., 0.3 for using 30% of the training set as validation data); if specified as a + LabelledCollection, represents the validation split itself :return: the learner trained on the training set, and the unused data (a _LabelledCollection_ if train_val_split>0 - or None otherwise) to be used as a validation set for any subsequent parameter fitting + or None otherwise) to be used as a validation set for any subsequent parameter fitting """ if fit_learner: if ensure_probabilistic: @@ -218,10 +160,8 @@ def _training_helper(learner, # ------------------------------------ class CC(AggregativeQuantifier): """ - The most basic Quantification method. One that simply classifies all instances and counts how many have been - attributed to each of the classes in order to compute class prevalence estimates. - - :param learner: a sklearn's Estimator that generates a classifier + The most basic Quantification method. One that simply classifies all instances and countes how many have been + attributed each of the classes in order to compute class prevalence estimates. 
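For intuition, here is a minimal, self-contained sketch of the Classify & Count idea (the synthetic data, the 70/30 split and the counting code are illustrative assumptions; the class itself delegates the counting to F.prevalence_from_labels):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# illustrative data standing in for a LabelledCollection
X, y = make_classification(n_samples=1000, n_classes=2, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X[:700], y[:700])

# classify every test instance and count how many were attributed to each class
predictions = clf.predict(X[700:])
counts = np.array([(predictions == c).sum() for c in clf.classes_])
prevalence_estimate = counts / counts.sum()  # a vector that sums to 1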
""" def __init__(self, learner: BaseEstimator): @@ -229,40 +169,19 @@ class CC(AggregativeQuantifier): def fit(self, data: LabelledCollection, fit_learner=True): """ - Trains the Classify & Count method unless `fit_learner` is False, in which case, the classifier is assumed to - be already fit and there is nothing else to do. - - :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data + Trains the Classify & Count method unless _fit_learner_ is False, in which case it is assumed to be already fit. + :param data: training data :param fit_learner: if False, the classifier is assumed to be fit :return: self """ - self.learner, _ = _training_helper(self.learner, data, fit_learner) + self.learner, _ = training_helper(self.learner, data, fit_learner) return self - def aggregate(self, classif_predictions: np.ndarray): - """ - Computes class prevalence estimates by counting the prevalence of each of the predicted labels. - - :param classif_predictions: array-like with label predictions - :return: `np.ndarray` of shape `(self.n_classes_,)` with class prevalence estimates. - """ + def aggregate(self, classif_predictions): return F.prevalence_from_labels(classif_predictions, self.classes_) class ACC(AggregativeQuantifier): - """ - `Adjusted Classify & Count `_, - the "adjusted" variant of :class:`CC`, that corrects the predictions of CC - according to the `misclassification rates`. - - :param learner: a sklearn's Estimator that generates a classifier - :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the - misclassification rates are to be estimated. - This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of - validation data, or as an integer, indicating that the misclassification rates should be estimated via - `k`-fold cross validation (this integer stands for the number of folds `k`), or as a - :class:`quapy.data.base.LabelledCollection` (the split itself). - """ def __init__(self, learner: BaseEstimator, val_split=0.4): self.learner = learner @@ -270,14 +189,13 @@ class ACC(AggregativeQuantifier): def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None): """ - Trains a ACC quantifier. 
- + Trains a ACC quantifier :param data: the training set :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit) :param val_split: either a float in (0,1) indicating the proportion of training instances to use for - validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection - indicating the validation set itself, or an int indicating the number `k` of folds to be used in `k`-fold - cross validation to estimate the parameters + validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection + indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV + to estimate the parameters :return: self """ if val_split is None: @@ -293,7 +211,7 @@ class ACC(AggregativeQuantifier): pbar.set_description(f'{self.__class__.__name__} fitting fold {k}') training = data.sampling_from_index(training_idx) validation = data.sampling_from_index(validation_idx) - learner, val_data = _training_helper(self.learner, training, fit_learner, val_split=validation) + learner, val_data = training_helper(self.learner, training, fit_learner, val_split=validation) y_.append(learner.predict(val_data.instances)) y.append(val_data.labels) @@ -302,22 +220,35 @@ class ACC(AggregativeQuantifier): class_count = data.counts() # fit the learner on all data - self.learner, _ = _training_helper(self.learner, data, fit_learner, val_split=None) + self.learner, _ = training_helper(self.learner, data, fit_learner, val_split=None) else: - self.learner, val_data = _training_helper(self.learner, data, fit_learner, val_split=val_split) + self.learner, val_data = training_helper(self.learner, data, fit_learner, val_split=val_split) y_ = self.learner.predict(val_data.instances) y = val_data.labels - class_count = val_data.counts() self.cc = CC(self.learner) # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a # document that belongs to yj ends up being classified as belonging to yi - self.Pte_cond_estim_ = confusion_matrix(y, y_).T / class_count + self.Pte_cond_estim_ = self.getPteCondEstim(data.classes_, y, y_) return self + @classmethod + def getPteCondEstim(cls, classes, y, y_): + # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a + # document that belongs to yj ends up being classified as belonging to yi + conf = confusion_matrix(y, y_, labels=classes).T + conf = conf.astype(np.float) + class_counts = conf.sum(axis=0) + for i, _ in enumerate(classes): + if class_counts[i] == 0: + conf[i, i] = 1 + else: + conf[:, i] /= class_counts[i] + return conf + def classify(self, data): return self.cc.classify(data) @@ -327,15 +258,7 @@ class ACC(AggregativeQuantifier): @classmethod def solve_adjustment(cls, PteCondEstim, prevs_estim): - """ - Solves the system linear system :math:`Ax = B` with :math:`A` = `PteCondEstim` and :math:`B` = `prevs_estim` - - :param PteCondEstim: a `np.ndarray` of shape `(n_classes,n_classes,)` with entry `(i,j)` being the estimate - of :math:`P(y_i|y_j)`, that is, the probability that an instance that belongs to :math:`y_j` ends up being - classified as belonging to :math:`y_i` - :param prevs_estim: a `np.ndarray` of shape `(n_classes,)` with the class prevalence estimates - :return: an adjusted `np.ndarray` of shape `(n_classes,)` with the corrected class prevalence estimates - """ + # solve for the linear system Ax = B with A=PteCondEstim and B = prevs_estim A 
= PteCondEstim B = prevs_estim try: @@ -348,18 +271,11 @@ class ACC(AggregativeQuantifier): class PCC(AggregativeProbabilisticQuantifier): - """ - `Probabilistic Classify & Count `_, - the probabilistic variant of CC that relies on the posterior probabilities returned by a probabilistic classifier. - - :param learner: a sklearn's Estimator that generates a classifier - """ - def __init__(self, learner: BaseEstimator): self.learner = learner def fit(self, data: LabelledCollection, fit_learner=True): - self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True) + self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True) return self def aggregate(self, classif_posteriors): @@ -367,18 +283,6 @@ class PCC(AggregativeProbabilisticQuantifier): class PACC(AggregativeProbabilisticQuantifier): - """ - `Probabilistic Adjusted Classify & Count `_, - the probabilistic variant of ACC that relies on the posterior probabilities returned by a probabilistic classifier. - - :param learner: a sklearn's Estimator that generates a classifier - :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the - misclassification rates are to be estimated. - This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of - validation data, or as an integer, indicating that the misclassification rates should be estimated via - `k`-fold cross validation (this integer stands for the number of folds `k`), or as a - :class:`quapy.data.base.LabelledCollection` (the split itself). - """ def __init__(self, learner: BaseEstimator, val_split=0.4): self.learner = learner @@ -386,8 +290,7 @@ class PACC(AggregativeProbabilisticQuantifier): def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None): """ - Trains a PACC quantifier. 
- + Trains a PACC quantifier :param data: the training set :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit) :param val_split: either a float in (0,1) indicating the proportion of training instances to use for @@ -410,7 +313,7 @@ class PACC(AggregativeProbabilisticQuantifier): pbar.set_description(f'{self.__class__.__name__} fitting fold {k}') training = data.sampling_from_index(training_idx) validation = data.sampling_from_index(validation_idx) - learner, val_data = _training_helper( + learner, val_data = training_helper( self.learner, training, fit_learner, ensure_probabilistic=True, val_split=validation) y_.append(learner.predict_proba(val_data.instances)) y.append(val_data.labels) @@ -419,12 +322,12 @@ class PACC(AggregativeProbabilisticQuantifier): y_ = np.vstack(y_) # fit the learner on all data - self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True, - val_split=None) + self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True, + val_split=None) classes = data.classes_ else: - self.learner, val_data = _training_helper( + self.learner, val_data = training_helper( self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split) y_ = self.learner.predict_proba(val_data.instances) y = val_data.labels @@ -432,16 +335,23 @@ class PACC(AggregativeProbabilisticQuantifier): self.pcc = PCC(self.learner) + self.Pte_cond_estim_ = self.getPteCondEstim(classes, y, y_) + + return self + + @classmethod + def getPteCondEstim(cls, classes, y, y_): # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a # document that belongs to yj ends up being classified as belonging to yi n_classes = len(classes) - confusion = np.empty(shape=(n_classes, n_classes)) + # confusion = np.zeros(shape=(n_classes, n_classes)) + confusion = np.eye(n_classes) for i, class_ in enumerate(classes): - confusion[i] = y_[y == class_].mean(axis=0) + idx = y == class_ + if idx.any(): + confusion[i] = y_[idx].mean(axis=0) - self.Pte_cond_estim_ = confusion.T - - return self + return confusion.T def aggregate(self, classif_posteriors): prevs_estim = self.pcc.aggregate(classif_posteriors) @@ -453,13 +363,10 @@ class PACC(AggregativeProbabilisticQuantifier): class EMQ(AggregativeProbabilisticQuantifier): """ - `Expectation Maximization for Quantification `_ (EMQ), - aka `Saerens-Latinne-Decaestecker` (SLD) algorithm. - EMQ consists of using the well-known `Expectation Maximization algorithm` to iteratively update the posterior - probabilities generated by a probabilistic classifier and the class prevalence estimates obtained via - maximum-likelihood estimation, in a mutually recursive way, until convergence. - - :param learner: a sklearn's Estimator that generates a classifier + The method is described in: + Saerens, M., Latinne, P., and Decaestecker, C. (2002). + Adjusting the outputs of a classifier to new a priori probabilities: A simple procedure. + Neural Computation, 14(1): 21–41. 
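In essence, each EM iteration re-weights the classifier posteriors by the ratio between the current prevalence estimate and the training prevalence, renormalizes them, and averages them to obtain the next estimate. A compact sketch of a single iteration (a simplification under these assumptions, not the EM routine of this class):

import numpy as np

def em_iteration(train_prev, posteriors, current_prev):
    # E-step: re-weight the posteriors P(y|x) by current_prev / train_prev and renormalize
    unnormalized = (current_prev / train_prev) * posteriors
    corrected = unnormalized / unnormalized.sum(axis=1, keepdims=True)
    # M-step: the new prevalence estimate is the mean of the corrected posteriors
    return corrected.mean(axis=0), corrected

# toy example with 3 instances and 2 classes
posteriors = np.array([[0.9, 0.1], [0.6, 0.4], [0.2, 0.8]])
train_prev = np.array([0.5, 0.5])
prev, corrected = em_iteration(train_prev, posteriors, current_prev=train_prev.copy())

The full method repeats this update until the estimate changes by less than epsilon or MAX_ITER is reached.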
""" MAX_ITER = 1000 @@ -469,7 +376,7 @@ class EMQ(AggregativeProbabilisticQuantifier): self.learner = learner def fit(self, data: LabelledCollection, fit_learner=True): - self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True) + self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True) self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_) return self @@ -484,17 +391,6 @@ class EMQ(AggregativeProbabilisticQuantifier): @classmethod def EM(cls, tr_prev, posterior_probabilities, epsilon=EPSILON): - """ - Computes the `Expectation Maximization` routine. - - :param tr_prev: array-like, the training prevalence - :param posterior_probabilities: `np.ndarray` of shape `(n_instances, n_classes,)` with the - posterior probabilities - :param epsilon: float, the threshold different between two consecutive iterations - to reach before stopping the loop - :return: a tuple with the estimated prevalence values (shape `(n_classes,)`) and - the corrected posterior probabilities (shape `(n_instances, n_classes,)`) - """ Px = posterior_probabilities Ptr = np.copy(tr_prev) qs = np.copy(Ptr) # qs (the running estimate) is initialized as the training prevalence @@ -523,17 +419,9 @@ class EMQ(AggregativeProbabilisticQuantifier): class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier): """ - `Hellinger Distance y `_ (HDy). - HDy is a probabilistic method for training binary quantifiers, that models quantification as the problem of - minimizing the divergence (in terms of the Hellinger Distance) between two cumulative distributions of posterior - probabilities returned by the classifier. One of the distributions is generated from the unlabelled examples and - the other is generated from a validation set. This latter distribution is defined as a mixture of the - class-conditional distributions of the posterior probabilities returned for the positive and negative validation - examples, respectively. The parameters of the mixture thus represent the estimates of the class prevalence values. - - :param learner: a sklearn's Estimator that generates a binary classifier - :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out - validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself). + Implementation of the method based on the Hellinger Distance y (HDy) proposed by + González-Castro, V., Alaiz-Rodrı́guez, R., and Alegre, E. (2013). Class distribution + estimation based on the Hellinger distance. Information Sciences, 218:146–164. """ def __init__(self, learner: BaseEstimator, val_split=0.4): @@ -542,20 +430,19 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier): def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection] = None): """ - Trains a HDy quantifier. 
- + Trains a HDy quantifier :param data: the training set :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit) :param val_split: either a float in (0,1) indicating the proportion of training instances to use for - validation (e.g., 0.3 for using 30% of the training set as validation data), or a - :class:`quapy.data.base.LabelledCollection` indicating the validation set itself + validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection + indicating the validation set itself :return: self """ if val_split is None: val_split = self.val_split self._check_binary(data, self.__class__.__name__) - self.learner, validation = _training_helper( + self.learner, validation = training_helper( self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split) Px = self.posterior_probabilities(validation.instances)[:, 1] # takes only the P(y=+1|x) self.Pxy1 = Px[validation.labels == self.learner.classes_[1]] @@ -586,7 +473,7 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier): Px_test, _ = np.histogram(Px, bins=bins, range=(0, 1), density=True) prev_selected, min_dist = None, None - for prev in F.prevalence_linspace(n_prevalences=100, repeats=1, smooth_limits_epsilon=0.0): + for prev in F.prevalence_linspace(n_prevalences=100, repeat=1, smooth_limits_epsilon=0.0): Px_train = prev * Pxy1_density + (1 - prev) * Pxy0_density hdy = F.HellingerDistance(Px_train, Px_test) if prev_selected is None or hdy < min_dist: @@ -598,19 +485,6 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier): class ELM(AggregativeQuantifier, BinaryQuantifier): - """ - Class of Explicit Loss Minimization (ELM) quantifiers. - Quantifiers based on ELM represent a family of methods based on structured output learning; - these quantifiers rely on classifiers that have been optimized using a quantification-oriented loss - measure. This implementation relies on - `Joachims’ SVM perf `_ structured output - learning algorithm, which has to be installed and patched for the purpose (see this - `script `_). - - :param svmperf_base: path to the folder containing the binary files of `SVM perf` - :param loss: the loss to optimize (see :attr:`quapy.classification.svmperf.SVMperf.valid_losses`) - :param kwargs: rest of SVM perf's parameters - """ def __init__(self, svmperf_base=None, loss='01', **kwargs): self.svmperf_base = svmperf_base if svmperf_base is not None else qp.environ['SVMPERF_HOME'] @@ -633,15 +507,9 @@ class ELM(AggregativeQuantifier, BinaryQuantifier): class SVMQ(ELM): """ - SVM(Q), which attempts to minimize the `Q` loss combining a classification-oriented loss and a - quantification-oriented loss, as proposed by - `Barranquero et al. 2015 `_. - Equivalent to: - - >>> ELM(svmperf_base, loss='q', **kwargs) - - :param svmperf_base: path to the folder containing the binary files of `SVM perf` - :param kwargs: rest of SVM perf's parameters + Barranquero, J., Díez, J., and del Coz, J. J. (2015). + Quantification-oriented learning based on reliable classifiers. + Pattern Recognition, 48(2):591–604. """ def __init__(self, svmperf_base=None, **kwargs): @@ -650,14 +518,9 @@ class SVMQ(ELM): class SVMKLD(ELM): """ - SVM(KLD), which attempts to minimize the Kullback-Leibler Divergence as proposed by - `Esuli et al. 2015 `_. 
- Equivalent to: - - >>> ELM(svmperf_base, loss='kld', **kwargs) - - :param svmperf_base: path to the folder containing the binary files of `SVM perf` - :param kwargs: rest of SVM perf's parameters + Esuli, A. and Sebastiani, F. (2015). + Optimizing text quantifiers for multivariate loss functions. + ACM Transactions on Knowledge Discovery and Data, 9(4):Article 27. """ def __init__(self, svmperf_base=None, **kwargs): @@ -666,15 +529,9 @@ class SVMKLD(ELM): class SVMNKLD(ELM): """ - SVM(NKLD), which attempts to minimize a version of the the Kullback-Leibler Divergence normalized - via the logistic function, as proposed by - `Esuli et al. 2015 `_. - Equivalent to: - - >>> ELM(svmperf_base, loss='nkld', **kwargs) - - :param svmperf_base: path to the folder containing the binary files of `SVM perf` - :param kwargs: rest of SVM perf's parameters + Esuli, A. and Sebastiani, F. (2015). + Optimizing text quantifiers for multivariate loss functions. + ACM Transactions on Knowledge Discovery and Data, 9(4):Article 27. """ def __init__(self, svmperf_base=None, **kwargs): @@ -682,60 +539,25 @@ class SVMNKLD(ELM): class SVMAE(ELM): - """ - SVM(AE), which attempts to minimize Absolute Error as first used by - `Moreo and Sebastiani, 2021 `_. - Equivalent to: - - >>> ELM(svmperf_base, loss='mae', **kwargs) - - :param svmperf_base: path to the folder containing the binary files of `SVM perf` - :param kwargs: rest of SVM perf's parameters - """ - def __init__(self, svmperf_base=None, **kwargs): super(SVMAE, self).__init__(svmperf_base, loss='mae', **kwargs) class SVMRAE(ELM): - """ - SVM(RAE), which attempts to minimize Relative Absolute Error as first used by - `Moreo and Sebastiani, 2021 `_. - Equivalent to: - - >>> ELM(svmperf_base, loss='mrae', **kwargs) - - :param svmperf_base: path to the folder containing the binary files of `SVM perf` - :param kwargs: rest of SVM perf's parameters - """ - def __init__(self, svmperf_base=None, **kwargs): super(SVMRAE, self).__init__(svmperf_base, loss='mrae', **kwargs) class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier): - """ - Abstract class of Threshold Optimization variants for :class:`ACC` as proposed by - `Forman 2006 `_ and - `Forman 2008 `_. - The goal is to bring improved stability to the denominator of the adjustment. - The different variants are based on different heuristics for choosing a decision threshold - that would allow for more true positives and many more false positives, on the grounds this - would deliver larger denominators. - - :param learner: a sklearn's Estimator that generates a classifier - :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the - misclassification rates are to be estimated. - This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of - validation data, or as an integer, indicating that the misclassification rates should be estimated via - `k`-fold cross validation (this integer stands for the number of folds `k`), or as a - :class:`quapy.data.base.LabelledCollection` (the split itself). - """ def __init__(self, learner: BaseEstimator, val_split=0.4): self.learner = learner self.val_split = val_split + @abstractmethod + def optimize_threshold(self, y, probabilities): + ... 
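The threshold-optimization variants that follow differ only in the criterion used to select the decision threshold; the shared search can be sketched as follows (a simplification that mirrors compute_table, compute_tpr and compute_fpr of this class, with the criterion passed in as a function; the toy labels and scores are made up):

import numpy as np

def sweep_thresholds(y, pos_scores, pos, neg, criterion):
    # try every observed posterior for the positive class as a candidate threshold
    best = None
    for t in np.unique(pos_scores):
        y_ = np.where(pos_scores > t, pos, neg)
        TP = np.sum((y == y_) & (y == pos))
        FP = np.sum((y != y_) & (y == neg))
        TN = np.sum((y == y_) & (y == neg))
        tpr = TP / (TP + FP) if (TP + FP) > 0 else 0
        fpr = FP / (FP + TN) if (FP + TN) > 0 else 0
        score = criterion(tpr, fpr)
        if best is None or score < best[0]:
            best = (score, tpr, fpr)
    return best[1], best[2]

# e.g., the T50 variant keeps the threshold whose tpr is closest to 0.5
tpr, fpr = sweep_thresholds(np.array([1, 0, 1, 0, 1]),
                            np.array([0.9, 0.4, 0.7, 0.2, 0.6]),
                            pos=1, neg=0,
                            criterion=lambda tpr, fpr: abs(tpr - 0.5))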
+ def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None): self._check_binary(data, "Threshold Optimization") @@ -752,7 +574,7 @@ class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier): pbar.set_description(f'{self.__class__.__name__} fitting fold {k}') training = data.sampling_from_index(training_idx) validation = data.sampling_from_index(validation_idx) - learner, val_data = _training_helper(self.learner, training, fit_learner, val_split=validation) + learner, val_data = training_helper(self.learner, training, fit_learner, val_split=validation) probabilities.append(learner.predict_proba(val_data.instances)) y.append(val_data.labels) @@ -760,16 +582,16 @@ class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier): probabilities = np.concatenate(probabilities) # fit the learner on all data - self.learner, _ = _training_helper(self.learner, data, fit_learner, val_split=None) + self.learner, _ = training_helper(self.learner, data, fit_learner, val_split=None) else: - self.learner, val_data = _training_helper(self.learner, data, fit_learner, val_split=val_split) + self.learner, val_data = training_helper(self.learner, data, fit_learner, val_split=val_split) probabilities = self.learner.predict_proba(val_data.instances) y = val_data.labels self.cc = CC(self.learner) - self.tpr, self.fpr = self._optimize_threshold(y, probabilities) + self.tpr, self.fpr = self.optimize_threshold(y, probabilities) return self @@ -777,32 +599,20 @@ class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier): def _condition(self, tpr, fpr) -> float: """ Implements the criterion according to which the threshold should be selected. - This function should return the (float) score to be minimized. - - :param tpr: float, true positive rate - :param fpr: float, false positive rate - :return: float, a score for the given `tpr` and `fpr` + This function should return a (float) score to be minimized. """ ... - def _optimize_threshold(self, y, probabilities): - """ - Seeks for the best `tpr` and `fpr` according to the score obtained at different - decision thresholds. The scoring function is implemented in function `_condition`. 
- - :param y: predicted labels for the validation set (or for the training set via `k`-fold cross validation) - :param probabilities: array-like with the posterior probabilities - :return: best `tpr` and `fpr` according to `_condition` - """ + def optimize_threshold(self, y, probabilities): best_candidate_threshold_score = None best_tpr = 0 best_fpr = 0 candidate_thresholds = np.unique(probabilities[:, 1]) for candidate_threshold in candidate_thresholds: y_ = [self.classes_[1] if p > candidate_threshold else self.classes_[0] for p in probabilities[:, 1]] - TP, FP, FN, TN = self._compute_table(y, y_) - tpr = self._compute_tpr(TP, FP) - fpr = self._compute_fpr(FP, TN) + TP, FP, FN, TN = self.compute_table(y, y_) + tpr = self.compute_tpr(TP, FP) + fpr = self.compute_fpr(FP, TN) condition_score = self._condition(tpr, fpr) if best_candidate_threshold_score is None or condition_score < best_candidate_threshold_score: best_candidate_threshold_score = condition_score @@ -819,40 +629,25 @@ class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier): adjusted_prevs_estim = np.array((1 - adjusted_prevs_estim, adjusted_prevs_estim)) return adjusted_prevs_estim - def _compute_table(self, y, y_): + def compute_table(self, y, y_): TP = np.logical_and(y == y_, y == self.classes_[1]).sum() FP = np.logical_and(y != y_, y == self.classes_[0]).sum() FN = np.logical_and(y != y_, y == self.classes_[1]).sum() TN = np.logical_and(y == y_, y == self.classes_[0]).sum() return TP, FP, FN, TN - def _compute_tpr(self, TP, FP): + def compute_tpr(self, TP, FP): if TP + FP == 0: return 0 return TP / (TP + FP) - def _compute_fpr(self, FP, TN): + def compute_fpr(self, FP, TN): if FP + TN == 0: return 0 return FP / (FP + TN) class T50(ThresholdOptimization): - """ - Threshold Optimization variant for :class:`ACC` as proposed by - `Forman 2006 `_ and - `Forman 2008 `_ that looks - for the threshold that makes `tpr` cosest to 0.5. - The goal is to bring improved stability to the denominator of the adjustment. - - :param learner: a sklearn's Estimator that generates a classifier - :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the - misclassification rates are to be estimated. - This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of - validation data, or as an integer, indicating that the misclassification rates should be estimated via - `k`-fold cross validation (this integer stands for the number of folds `k`), or as a - :class:`quapy.data.base.LabelledCollection` (the split itself). - """ def __init__(self, learner: BaseEstimator, val_split=0.4): super().__init__(learner, val_split) @@ -862,21 +657,6 @@ class T50(ThresholdOptimization): class MAX(ThresholdOptimization): - """ - Threshold Optimization variant for :class:`ACC` as proposed by - `Forman 2006 `_ and - `Forman 2008 `_ that looks - for the threshold that maximizes `tpr-fpr`. - The goal is to bring improved stability to the denominator of the adjustment. - - :param learner: a sklearn's Estimator that generates a classifier - :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the - misclassification rates are to be estimated. 
- This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of - validation data, or as an integer, indicating that the misclassification rates should be estimated via - `k`-fold cross validation (this integer stands for the number of folds `k`), or as a - :class:`quapy.data.base.LabelledCollection` (the split itself). - """ def __init__(self, learner: BaseEstimator, val_split=0.4): super().__init__(learner, val_split) @@ -887,21 +667,6 @@ class MAX(ThresholdOptimization): class X(ThresholdOptimization): - """ - Threshold Optimization variant for :class:`ACC` as proposed by - `Forman 2006 `_ and - `Forman 2008 `_ that looks - for the threshold that yields `tpr=1-fpr`. - The goal is to bring improved stability to the denominator of the adjustment. - - :param learner: a sklearn's Estimator that generates a classifier - :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the - misclassification rates are to be estimated. - This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of - validation data, or as an integer, indicating that the misclassification rates should be estimated via - `k`-fold cross validation (this integer stands for the number of folds `k`), or as a - :class:`quapy.data.base.LabelledCollection` (the split itself). - """ def __init__(self, learner: BaseEstimator, val_split=0.4): super().__init__(learner, val_split) @@ -911,70 +676,41 @@ class X(ThresholdOptimization): class MS(ThresholdOptimization): - """ - Median Sweep. Threshold Optimization variant for :class:`ACC` as proposed by - `Forman 2006 `_ and - `Forman 2008 `_ that generates - class prevalence estimates for all decision thresholds and returns the median of them all. - The goal is to bring improved stability to the denominator of the adjustment. - :param learner: a sklearn's Estimator that generates a classifier - :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the - misclassification rates are to be estimated. - This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of - validation data, or as an integer, indicating that the misclassification rates should be estimated via - `k`-fold cross validation (this integer stands for the number of folds `k`), or as a - :class:`quapy.data.base.LabelledCollection` (the split itself). - """ def __init__(self, learner: BaseEstimator, val_split=0.4): super().__init__(learner, val_split) def _condition(self, tpr, fpr) -> float: pass - def _optimize_threshold(self, y, probabilities): + def optimize_threshold(self, y, probabilities): tprs = [] fprs = [] candidate_thresholds = np.unique(probabilities[:, 1]) for candidate_threshold in candidate_thresholds: y_ = [self.classes_[1] if p > candidate_threshold else self.classes_[0] for p in probabilities[:, 1]] - TP, FP, FN, TN = self._compute_table(y, y_) - tpr = self._compute_tpr(TP, FP) - fpr = self._compute_fpr(FP, TN) + TP, FP, FN, TN = self.compute_table(y, y_) + tpr = self.compute_tpr(TP, FP) + fpr = self.compute_fpr(FP, TN) tprs.append(tpr) fprs.append(fpr) return np.median(tprs), np.median(fprs) class MS2(MS): - """ - Median Sweep 2. 
Threshold Optimization variant for :class:`ACC` as proposed by - `Forman 2006 `_ and - `Forman 2008 `_ that generates - class prevalence estimates for all decision thresholds and returns the median of for cases in - which `tpr-fpr>0.25` - The goal is to bring improved stability to the denominator of the adjustment. - :param learner: a sklearn's Estimator that generates a classifier - :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the - misclassification rates are to be estimated. - This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of - validation data, or as an integer, indicating that the misclassification rates should be estimated via - `k`-fold cross validation (this integer stands for the number of folds `k`), or as a - :class:`quapy.data.base.LabelledCollection` (the split itself). - """ def __init__(self, learner: BaseEstimator, val_split=0.4): super().__init__(learner, val_split) - def _optimize_threshold(self, y, probabilities): + def optimize_threshold(self, y, probabilities): tprs = [0, 1] fprs = [0, 1] candidate_thresholds = np.unique(probabilities[:, 1]) for candidate_threshold in candidate_thresholds: y_ = [self.classes_[1] if p > candidate_threshold else self.classes_[0] for p in probabilities[:, 1]] - TP, FP, FN, TN = self._compute_table(y, y_) - tpr = self._compute_tpr(TP, FP) - fpr = self._compute_fpr(FP, TN) + TP, FP, FN, TN = self.compute_table(y, y_) + tpr = self.compute_tpr(TP, FP) + fpr = self.compute_fpr(FP, TN) if (tpr - fpr) > 0.25: tprs.append(tpr) fprs.append(fpr) @@ -986,7 +722,6 @@ AdjustedClassifyAndCount = ACC ProbabilisticClassifyAndCount = PCC ProbabilisticAdjustedClassifyAndCount = PACC ExpectationMaximizationQuantifier = EMQ -SLD = EMQ HellingerDistanceY = HDy ExplicitLossMinimisation = ELM MedianSweep = MS @@ -995,14 +730,11 @@ MedianSweep2 = MS2 class OneVsAll(AggregativeQuantifier): """ - Allows any binary quantifier to perform quantification on single-label datasets. - The method maintains one binary quantifier for each class, and then l1-normalizes the outputs so that the - class prevelences sum up to 1. - This variant was used, along with the :class:`EMQ` quantifier, in - `Gao and Sebastiani, 2016 `_. - - :param learner: a sklearn's Estimator that generates a binary classifier - :param n_jobs: number of parallel workers + Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary + quantifier for each class, and then l1-normalizes the outputs so that the class prevelences sum up to 1. + This variant was used, along with the ExplicitLossMinimization quantifier in + Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis. + Social Network Analysis and Mining 6(19), 1–22 (2016) """ def __init__(self, binary_quantifier, n_jobs=-1): @@ -1021,30 +753,18 @@ class OneVsAll(AggregativeQuantifier): return self def classify(self, instances): - """ - Returns a matrix of shape `(n,m,)` with `n` the number of instances and `m` the number of classes. The entry - `(i,j)` is a binary value indicating whether instance `i `belongs to class `j`. The binary classifications are - independent of each other, meaning that an instance can end up be attributed to 0, 1, or more classes. - - :param instances: array-like - :return: `np.ndarray` - """ - + # returns a matrix of shape (n,m) with n the number of instances and m the number of classes. 
The entry + # (i,j) is a binary value indicating whether instance i belongs to class j. The binary classifications are + # independent of each other, meaning that an instance can end up be attributed to 0, 1, or more classes. classif_predictions_bin = self.__parallel(self._delayed_binary_classification, instances) return classif_predictions_bin.T def posterior_probabilities(self, instances): - """ - Returns a matrix of shape `(n,m,2)` with `n` the number of instances and `m` the number of classes. The entry - `(i,j,1)` (resp. `(i,j,0)`) is a value in [0,1] indicating the posterior probability that instance `i` belongs - (resp. does not belong) to class `j`. - The posterior probabilities are independent of each other, meaning that, in general, they do not sum - up to one. - - :param instances: array-like - :return: `np.ndarray` - """ - + # returns a matrix of shape (n,m,2) with n the number of instances and m the number of classes. The entry + # (i,j,1) (resp. (i,j,0)) is a value in [0,1] indicating the posterior probability that instance i belongs + # (resp. does not belong) to class j. + # The posterior probabilities are independent of each other, meaning that, in general, they do not sum + # up to one. if not self.binary_quantifier.probabilistic: raise NotImplementedError(f'{self.__class__.__name__} does not implement posterior_probabilities because ' f'the base quantifier {self.binary_quantifier.__class__.__name__} is not ' @@ -1091,7 +811,7 @@ class OneVsAll(AggregativeQuantifier): return self.binary_quantifier.get_params() def _delayed_binary_classification(self, c, X): - return self.dict_binary_quantifiers[c].classify(X) + return self.dict_binary_quantifiers[c].preclassify(X) def _delayed_binary_posteriors(self, c, X): return self.dict_binary_quantifiers[c].posterior_probabilities(X) @@ -1106,19 +826,8 @@ class OneVsAll(AggregativeQuantifier): @property def binary(self): - """ - Informs that the classifier is not binary - - :return: False - """ return False @property def probabilistic(self): - """ - Indicates if the classifier is probabilistic or not (depending on the nature of the base classifier). - - :return: boolean - """ - return self.binary_quantifier.probabilistic From fa577abdd217848a5eef3f4742bc4fd5229b877c Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Tue, 15 Mar 2022 14:16:37 +0100 Subject: [PATCH 2/2] merging from pool request and adding documentation --- quapy/method/aggregative.py | 547 +++++++++++++++++++++++++++++------- 1 file changed, 439 insertions(+), 108 deletions(-) diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index 59dd868..19969c6 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -23,49 +23,109 @@ from quapy.method.base import BaseQuantifier, BinaryQuantifier class AggregativeQuantifier(BaseQuantifier): """ Abstract class for quantification methods that base their estimations on the aggregation of classification - results. Aggregative Quantifiers thus implement a _classify_ method and maintain a _learner_ attribute. + results. Aggregative Quantifiers thus implement a :meth:`classify` method and maintain a :attr:`learner` attribute. + Subclasses of this abstract class must implement the method :meth:`aggregate` which computes the aggregation + of label predictions. The method :meth:`quantify` comes with a default implementation based on + :meth:`classify` and :meth:`aggregate`. """ @abstractmethod - def fit(self, data: LabelledCollection, fit_learner=True): ... 
+ def fit(self, data: LabelledCollection, fit_learner=True): + """ + Trains the aggregative quantifier + + :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data + :param fit_learner: whether or not to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + :return: self + """ + ... @property def learner(self): + """ + Gives access to the classifier + + :return: the classifier (typically an sklearn's Estimator) + """ return self.learner_ @learner.setter - def learner(self, value): - self.learner_ = value + def learner(self, classifier): + """ + Setter for the classifier - def preclassify(self, instances): - return self.classify(instances) + :param classifier: the classifier + """ + self.learner_ = classifier def classify(self, instances): + """ + Provides the label predictions for the given instances. + + :param instances: array-like + :return: np.ndarray of shape `(n_instances,)` with label predictions + """ return self.learner.predict(instances) def quantify(self, instances): + """ + Generate class prevalence estimates for the sample's instances by aggregating the label predictions generated + by the classifier. + + :param instances: array-like + :return: `np.ndarray` of shape `(self.n_classes_,)` with class prevalence estimates. + """ classif_predictions = self.classify(instances) return self.aggregate(classif_predictions) @abstractmethod - def aggregate(self, classif_predictions: np.ndarray): ... + def aggregate(self, classif_predictions: np.ndarray): + """ + Implements the aggregation of label predictions. + + :param classif_predictions: `np.ndarray` of label predictions + :return: `np.ndarray` of shape `(self.n_classes_,)` with class prevalence estimates. + """ + ... def get_params(self, deep=True): + """ + Return the current parameters of the quantifier. + + :param deep: for compatibility with sklearn + :return: a dictionary of param-value pairs + """ + return self.learner.get_params() def set_params(self, **parameters): + """ + Set the parameters of the quantifier. + + :param parameters: dictionary of param-value pairs + """ + self.learner.set_params(**parameters) - @property - def n_classes(self): - return len(self.classes_) - @property def classes_(self): + """ + Class labels, in the same order in which class prevalence values are to be computed. + This default implementation actually returns the class labels of the learner. + + :return: array-like + """ return self.learner.classes_ @property def aggregative(self): + """ + Returns True, indicating the quantifier is of type aggregative. + + :return: True + """ + return True @@ -77,9 +137,6 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier): probabilities. """ - def preclassify(self, instances): - return self.predict_proba(instances) - def posterior_probabilities(self, instances): return self.learner.predict_proba(instances) @@ -102,23 +159,24 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier): # Helper # ------------------------------------ -def training_helper(learner, - data: LabelledCollection, - fit_learner: bool = True, - ensure_probabilistic=False, - val_split: Union[LabelledCollection, float] = None): +def _training_helper(learner, + data: LabelledCollection, + fit_learner: bool = True, + ensure_probabilistic=False, + val_split: Union[LabelledCollection, float] = None): """ Training procedure common to all Aggregative Quantifiers. 
+ :param learner: the learner to be fit :param data: the data on which to fit the learner. If requested, the data will be split before fitting the learner. :param fit_learner: whether or not to fit the learner (if False, then bypasses any action) :param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the - learner is not probabilistic, then a CalibratedCV instance of it is trained) + learner is not probabilistic, then a CalibratedCV instance of it is trained) :param val_split: if specified as a float, indicates the proportion of training instances that will define the - validation split (e.g., 0.3 for using 30% of the training set as validation data); if specified as a - LabelledCollection, represents the validation split itself + validation split (e.g., 0.3 for using 30% of the training set as validation data); if specified as a + LabelledCollection, represents the validation split itself :return: the learner trained on the training set, and the unused data (a _LabelledCollection_ if train_val_split>0 - or None otherwise) to be used as a validation set for any subsequent parameter fitting + or None otherwise) to be used as a validation set for any subsequent parameter fitting """ if fit_learner: if ensure_probabilistic: @@ -160,8 +218,10 @@ def training_helper(learner, # ------------------------------------ class CC(AggregativeQuantifier): """ - The most basic Quantification method. One that simply classifies all instances and countes how many have been - attributed each of the classes in order to compute class prevalence estimates. + The most basic Quantification method. One that simply classifies all instances and counts how many have been + attributed to each of the classes in order to compute class prevalence estimates. + + :param learner: a sklearn's Estimator that generates a classifier """ def __init__(self, learner: BaseEstimator): @@ -169,19 +229,40 @@ class CC(AggregativeQuantifier): def fit(self, data: LabelledCollection, fit_learner=True): """ - Trains the Classify & Count method unless _fit_learner_ is False, in which case it is assumed to be already fit. - :param data: training data + Trains the Classify & Count method unless `fit_learner` is False, in which case, the classifier is assumed to + be already fit and there is nothing else to do. + + :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data :param fit_learner: if False, the classifier is assumed to be fit :return: self """ - self.learner, _ = training_helper(self.learner, data, fit_learner) + self.learner, _ = _training_helper(self.learner, data, fit_learner) return self - def aggregate(self, classif_predictions): + def aggregate(self, classif_predictions: np.ndarray): + """ + Computes class prevalence estimates by counting the prevalence of each of the predicted labels. + + :param classif_predictions: array-like with label predictions + :return: `np.ndarray` of shape `(self.n_classes_,)` with class prevalence estimates. + """ return F.prevalence_from_labels(classif_predictions, self.classes_) class ACC(AggregativeQuantifier): + """ + `Adjusted Classify & Count `_, + the "adjusted" variant of :class:`CC`, that corrects the predictions of CC + according to the `misclassification rates`. + + :param learner: a sklearn's Estimator that generates a classifier + :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the + misclassification rates are to be estimated. 
+ This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of + validation data, or as an integer, indicating that the misclassification rates should be estimated via + `k`-fold cross validation (this integer stands for the number of folds `k`), or as a + :class:`quapy.data.base.LabelledCollection` (the split itself). + """ def __init__(self, learner: BaseEstimator, val_split=0.4): self.learner = learner @@ -189,13 +270,14 @@ class ACC(AggregativeQuantifier): def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None): """ - Trains a ACC quantifier + Trains a ACC quantifier. + :param data: the training set :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit) :param val_split: either a float in (0,1) indicating the proportion of training instances to use for - validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection - indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV - to estimate the parameters + validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection + indicating the validation set itself, or an int indicating the number `k` of folds to be used in `k`-fold + cross validation to estimate the parameters :return: self """ if val_split is None: @@ -211,7 +293,7 @@ class ACC(AggregativeQuantifier): pbar.set_description(f'{self.__class__.__name__} fitting fold {k}') training = data.sampling_from_index(training_idx) validation = data.sampling_from_index(validation_idx) - learner, val_data = training_helper(self.learner, training, fit_learner, val_split=validation) + learner, val_data = _training_helper(self.learner, training, fit_learner, val_split=validation) y_.append(learner.predict(val_data.instances)) y.append(val_data.labels) @@ -220,17 +302,15 @@ class ACC(AggregativeQuantifier): class_count = data.counts() # fit the learner on all data - self.learner, _ = training_helper(self.learner, data, fit_learner, val_split=None) + self.learner, _ = _training_helper(self.learner, data, fit_learner, val_split=None) else: - self.learner, val_data = training_helper(self.learner, data, fit_learner, val_split=val_split) + self.learner, val_data = _training_helper(self.learner, data, fit_learner, val_split=val_split) y_ = self.learner.predict(val_data.instances) y = val_data.labels self.cc = CC(self.learner) - # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a - # document that belongs to yj ends up being classified as belonging to yi self.Pte_cond_estim_ = self.getPteCondEstim(data.classes_, y, y_) return self @@ -258,7 +338,15 @@ class ACC(AggregativeQuantifier): @classmethod def solve_adjustment(cls, PteCondEstim, prevs_estim): - # solve for the linear system Ax = B with A=PteCondEstim and B = prevs_estim + """ + Solves the system linear system :math:`Ax = B` with :math:`A` = `PteCondEstim` and :math:`B` = `prevs_estim` + + :param PteCondEstim: a `np.ndarray` of shape `(n_classes,n_classes,)` with entry `(i,j)` being the estimate + of :math:`P(y_i|y_j)`, that is, the probability that an instance that belongs to :math:`y_j` ends up being + classified as belonging to :math:`y_i` + :param prevs_estim: a `np.ndarray` of shape `(n_classes,)` with the class prevalence estimates + :return: an adjusted `np.ndarray` of shape `(n_classes,)` with the corrected class prevalence 
estimates + """ A = PteCondEstim B = prevs_estim try: @@ -271,11 +359,18 @@ class ACC(AggregativeQuantifier): class PCC(AggregativeProbabilisticQuantifier): + """ + `Probabilistic Classify & Count `_, + the probabilistic variant of CC that relies on the posterior probabilities returned by a probabilistic classifier. + + :param learner: a sklearn's Estimator that generates a classifier + """ + def __init__(self, learner: BaseEstimator): self.learner = learner def fit(self, data: LabelledCollection, fit_learner=True): - self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True) + self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True) return self def aggregate(self, classif_posteriors): @@ -283,6 +378,18 @@ class PCC(AggregativeProbabilisticQuantifier): class PACC(AggregativeProbabilisticQuantifier): + """ + `Probabilistic Adjusted Classify & Count `_, + the probabilistic variant of ACC that relies on the posterior probabilities returned by a probabilistic classifier. + + :param learner: a sklearn's Estimator that generates a classifier + :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the + misclassification rates are to be estimated. + This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of + validation data, or as an integer, indicating that the misclassification rates should be estimated via + `k`-fold cross validation (this integer stands for the number of folds `k`), or as a + :class:`quapy.data.base.LabelledCollection` (the split itself). + """ def __init__(self, learner: BaseEstimator, val_split=0.4): self.learner = learner @@ -290,7 +397,8 @@ class PACC(AggregativeProbabilisticQuantifier): def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None): """ - Trains a PACC quantifier + Trains a PACC quantifier. 
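In short, PACC averages the posterior probabilities per true class on held-out data to estimate the matrix of expected classifier outputs, and then solves a linear system to correct the PCC estimate. A hedged sketch with made-up arrays (illustrative only; the class itself relies on getPteCondEstim and ACC.solve_adjustment):

import numpy as np

# illustrative validation outputs: true labels and classifier posteriors
classes = np.array([0, 1])
y = np.array([0, 0, 1, 1])
posteriors = np.array([[0.8, 0.2], [0.7, 0.3], [0.4, 0.6], [0.1, 0.9]])

# average posterior per true class; starting from the identity keeps a sensible
# column for any class with no validation examples (the case this patch guards against)
confusion = np.eye(len(classes))
for i, c in enumerate(classes):
    idx = (y == c)
    if idx.any():
        confusion[i] = posteriors[idx].mean(axis=0)
Pte_cond_estim = confusion.T

# correct a (made-up) PCC estimate for a test sample by solving the linear system
pcc_estimate = np.array([0.55, 0.45])
adjusted = np.linalg.solve(Pte_cond_estim, pcc_estimate)
adjusted = np.clip(adjusted, 0, 1)
adjusted /= adjusted.sum()  # back onto the probability simplex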
+ :param data: the training set :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit) :param val_split: either a float in (0,1) indicating the proportion of training instances to use for @@ -313,7 +421,7 @@ class PACC(AggregativeProbabilisticQuantifier): pbar.set_description(f'{self.__class__.__name__} fitting fold {k}') training = data.sampling_from_index(training_idx) validation = data.sampling_from_index(validation_idx) - learner, val_data = training_helper( + learner, val_data = _training_helper( self.learner, training, fit_learner, ensure_probabilistic=True, val_split=validation) y_.append(learner.predict_proba(val_data.instances)) y.append(val_data.labels) @@ -322,12 +430,12 @@ class PACC(AggregativeProbabilisticQuantifier): y_ = np.vstack(y_) # fit the learner on all data - self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True, - val_split=None) + self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True, + val_split=None) classes = data.classes_ else: - self.learner, val_data = training_helper( + self.learner, val_data = _training_helper( self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split) y_ = self.learner.predict_proba(val_data.instances) y = val_data.labels @@ -335,6 +443,13 @@ class PACC(AggregativeProbabilisticQuantifier): self.pcc = PCC(self.learner) + # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a + # document that belongs to yj ends up being classified as belonging to yi + n_classes = len(classes) + confusion = np.empty(shape=(n_classes, n_classes)) + for i, class_ in enumerate(classes): + confusion[i] = y_[y == class_].mean(axis=0) + self.Pte_cond_estim_ = self.getPteCondEstim(classes, y, y_) return self @@ -344,7 +459,6 @@ class PACC(AggregativeProbabilisticQuantifier): # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a # document that belongs to yj ends up being classified as belonging to yi n_classes = len(classes) - # confusion = np.zeros(shape=(n_classes, n_classes)) confusion = np.eye(n_classes) for i, class_ in enumerate(classes): idx = y == class_ @@ -363,34 +477,59 @@ class PACC(AggregativeProbabilisticQuantifier): class EMQ(AggregativeProbabilisticQuantifier): """ - The method is described in: + `Expectation Maximization for Quantification `_ (EMQ), + aka `Saerens-Latinne-Decaestecker` (SLD) algorithm. + EMQ consists of using the well-known `Expectation Maximization algorithm` to iteratively update the posterior + probabilities generated by a probabilistic classifier and the class prevalence estimates obtained via + maximum-likelihood estimation, in a mutually recursive way, until convergence. + The `transform_prior` callback allows you to introduce ad-hoc regularizations which are not part of the + original EMQ algorithm. This callback can, for instance, enhance or diminish small class prevalences if + sparse or dense solutions should be promoted. + + The original method is described in: Saerens, M., Latinne, P., and Decaestecker, C. (2002). Adjusting the outputs of a classifier to new a priori probabilities: A simple procedure. Neural Computation, 14(1): 21–41. 
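As an example of the kind of regularization the transform_prior callback enables (the sharpening function and its exponent are arbitrary illustrative choices, not part of the method):

import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import EMQ

def sharpen(prev, alpha=2.0):
    # push probability mass towards the dominant classes, then renormalize
    prev = prev ** alpha
    return prev / prev.sum()

quantifier = EMQ(LogisticRegression(), transform_prior=sharpen)

The callback receives each intermediate prevalence estimate and must return a vector of the same shape.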
+ + :param learner: a sklearn's Estimator that generates a classifier + :param transform_prior: an optional function :math:`R^c -> R^c` that transforms each intermediate estimate """ MAX_ITER = 1000 EPSILON = 1e-4 - def __init__(self, learner: BaseEstimator): + def __init__(self, learner: BaseEstimator, transform_prior=None): self.learner = learner + self.transform_prior = transform_prior def fit(self, data: LabelledCollection, fit_learner=True): - self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True) + self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True) self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_) return self def aggregate(self, classif_posteriors, epsilon=EPSILON): - priors, posteriors = self.EM(self.train_prevalence, classif_posteriors, epsilon) + priors, posteriors = self.EM(self.train_prevalence, classif_posteriors, epsilon, self.transform_prior) return priors def predict_proba(self, instances, epsilon=EPSILON): classif_posteriors = self.learner.predict_proba(instances) - priors, posteriors = self.EM(self.train_prevalence, classif_posteriors, epsilon) + priors, posteriors = self.EM(self.train_prevalence, classif_posteriors, epsilon, self.transform_prior) return posteriors @classmethod - def EM(cls, tr_prev, posterior_probabilities, epsilon=EPSILON): + def EM(cls, tr_prev, posterior_probabilities, epsilon=EPSILON, transform_prior=None): + """ + Computes the `Expectation Maximization` routine. + :param tr_prev: array-like, the training prevalence + :param posterior_probabilities: `np.ndarray` of shape `(n_instances, n_classes,)` with the + posterior probabilities + :param epsilon: float, the threshold different between two consecutive iterations + to reach before stopping the loop + :param transform_prior: an optional function :math:`R^c -> R^c` that transforms each intermediate estimate + :return: a tuple with the estimated prevalence values (shape `(n_classes,)`) and + the corrected posterior probabilities (shape `(n_instances, n_classes,)`) + """ + Px = posterior_probabilities Ptr = np.copy(tr_prev) qs = np.copy(Ptr) # qs (the running estimate) is initialized as the training prevalence @@ -411,17 +550,30 @@ class EMQ(AggregativeProbabilisticQuantifier): qs_prev_ = qs s += 1 + # transformation of intermediate estimates + if transform_prior is not None and not converged: + qs = transform_prior(qs) + if not converged: print('[warning] the method has reached the maximum number of iterations; it might have not converged') return qs, ps + class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier): """ - Implementation of the method based on the Hellinger Distance y (HDy) proposed by - González-Castro, V., Alaiz-Rodrı́guez, R., and Alegre, E. (2013). Class distribution - estimation based on the Hellinger distance. Information Sciences, 218:146–164. + `Hellinger Distance y `_ (HDy). + HDy is a probabilistic method for training binary quantifiers, that models quantification as the problem of + minimizing the divergence (in terms of the Hellinger Distance) between two cumulative distributions of posterior + probabilities returned by the classifier. One of the distributions is generated from the unlabelled examples and + the other is generated from a validation set. This latter distribution is defined as a mixture of the + class-conditional distributions of the posterior probabilities returned for the positive and negative validation + examples, respectively. 
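The EM routine implemented above (including the transform_prior hook this patch adds, which the patched code applies only while the loop has not yet converged) boils down to a short fixed-point iteration. A compact sketch of the idea, not the library code:

# Illustrative sketch (not part of the patch): the SLD/EMQ fixed-point iteration.
import numpy as np

def emq_sketch(tr_prev, posteriors, epsilon=1e-4, max_iter=1000, transform_prior=None):
    Ptr = np.asarray(tr_prev, dtype=float)
    qs = Ptr.copy()                                  # running prevalence estimate
    qs_prev = None
    for _ in range(max_iter):
        # E-step: rescale the posteriors by the ratio of current to training priors
        ps = posteriors * (qs / Ptr)
        ps /= ps.sum(axis=1, keepdims=True)
        # M-step: the new prevalence estimate is the mean of the corrected posteriors
        qs = ps.mean(axis=0)
        if transform_prior is not None:
            qs = transform_prior(qs)                 # optional ad-hoc regularization
        if qs_prev is not None and np.abs(qs - qs_prev).mean() < epsilon:
            break
        qs_prev = qs
    return qs, ps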
The parameters of the mixture thus represent the estimates of the class prevalence values. + + :param learner: a sklearn's Estimator that generates a binary classifier + :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out + validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself). """ def __init__(self, learner: BaseEstimator, val_split=0.4): @@ -430,19 +582,20 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier): def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection] = None): """ - Trains a HDy quantifier + Trains a HDy quantifier. + :param data: the training set :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit) :param val_split: either a float in (0,1) indicating the proportion of training instances to use for - validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection - indicating the validation set itself + validation (e.g., 0.3 for using 30% of the training set as validation data), or a + :class:`quapy.data.base.LabelledCollection` indicating the validation set itself :return: self """ if val_split is None: val_split = self.val_split self._check_binary(data, self.__class__.__name__) - self.learner, validation = training_helper( + self.learner, validation = _training_helper( self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split) Px = self.posterior_probabilities(validation.instances)[:, 1] # takes only the P(y=+1|x) self.Pxy1 = Px[validation.labels == self.learner.classes_[1]] @@ -473,7 +626,7 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier): Px_test, _ = np.histogram(Px, bins=bins, range=(0, 1), density=True) prev_selected, min_dist = None, None - for prev in F.prevalence_linspace(n_prevalences=100, repeat=1, smooth_limits_epsilon=0.0): + for prev in F.prevalence_linspace(n_prevalences=100, repeats=1, smooth_limits_epsilon=0.0): Px_train = prev * Pxy1_density + (1 - prev) * Pxy0_density hdy = F.HellingerDistance(Px_train, Px_test) if prev_selected is None or hdy < min_dist: @@ -485,6 +638,19 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier): class ELM(AggregativeQuantifier, BinaryQuantifier): + """ + Class of Explicit Loss Minimization (ELM) quantifiers. + Quantifiers based on ELM represent a family of methods based on structured output learning; + these quantifiers rely on classifiers that have been optimized using a quantification-oriented loss + measure. This implementation relies on + `Joachims’ SVM perf `_ structured output + learning algorithm, which has to be installed and patched for the purpose (see this + `script `_). + + :param svmperf_base: path to the folder containing the binary files of `SVM perf` + :param loss: the loss to optimize (see :attr:`quapy.classification.svmperf.SVMperf.valid_losses`) + :param kwargs: rest of SVM perf's parameters + """ def __init__(self, svmperf_base=None, loss='01', **kwargs): self.svmperf_base = svmperf_base if svmperf_base is not None else qp.environ['SVMPERF_HOME'] @@ -507,9 +673,15 @@ class ELM(AggregativeQuantifier, BinaryQuantifier): class SVMQ(ELM): """ - Barranquero, J., Díez, J., and del Coz, J. J. (2015). - Quantification-oriented learning based on reliable classifiers. - Pattern Recognition, 48(2):591–604. 
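The HDy search shown above compares histogram mixtures under the Hellinger distance. A rough sketch of a single-bin-size version (the function names here are made up for illustration; the library draws its candidate prevalences from F.prevalence_linspace and its F.HellingerDistance may be normalized differently):

# Illustrative sketch (not part of the patch): HDy's mixture search for one bin size.
import numpy as np

def hellinger(p, q):
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2))

def hdy_search_sketch(Pxy1, Pxy0, Px_test, bins=10):
    # histograms of P(y=1|x) for positive/negative validation examples and for the test sample
    dens = lambda scores: np.histogram(scores, bins=bins, range=(0, 1), density=True)[0]
    h_pos, h_neg, h_test = dens(Pxy1), dens(Pxy0), dens(Px_test)
    best_prev, best_dist = None, None
    for prev in np.linspace(0.0, 1.0, 101):          # candidate prevalence values
        mixture = prev * h_pos + (1 - prev) * h_neg
        dist = hellinger(mixture, h_test)
        if best_dist is None or dist < best_dist:
            best_prev, best_dist = prev, dist
    return best_prev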
+ SVM(Q), which attempts to minimize the `Q` loss combining a classification-oriented loss and a + quantification-oriented loss, as proposed by + `Barranquero et al. 2015 `_. + Equivalent to: + + >>> ELM(svmperf_base, loss='q', **kwargs) + + :param svmperf_base: path to the folder containing the binary files of `SVM perf` + :param kwargs: rest of SVM perf's parameters """ def __init__(self, svmperf_base=None, **kwargs): @@ -518,9 +690,14 @@ class SVMQ(ELM): class SVMKLD(ELM): """ - Esuli, A. and Sebastiani, F. (2015). - Optimizing text quantifiers for multivariate loss functions. - ACM Transactions on Knowledge Discovery and Data, 9(4):Article 27. + SVM(KLD), which attempts to minimize the Kullback-Leibler Divergence as proposed by + `Esuli et al. 2015 `_. + Equivalent to: + + >>> ELM(svmperf_base, loss='kld', **kwargs) + + :param svmperf_base: path to the folder containing the binary files of `SVM perf` + :param kwargs: rest of SVM perf's parameters """ def __init__(self, svmperf_base=None, **kwargs): @@ -529,9 +706,15 @@ class SVMKLD(ELM): class SVMNKLD(ELM): """ - Esuli, A. and Sebastiani, F. (2015). - Optimizing text quantifiers for multivariate loss functions. - ACM Transactions on Knowledge Discovery and Data, 9(4):Article 27. + SVM(NKLD), which attempts to minimize a version of the the Kullback-Leibler Divergence normalized + via the logistic function, as proposed by + `Esuli et al. 2015 `_. + Equivalent to: + + >>> ELM(svmperf_base, loss='nkld', **kwargs) + + :param svmperf_base: path to the folder containing the binary files of `SVM perf` + :param kwargs: rest of SVM perf's parameters """ def __init__(self, svmperf_base=None, **kwargs): @@ -539,25 +722,60 @@ class SVMNKLD(ELM): class SVMAE(ELM): + """ + SVM(AE), which attempts to minimize Absolute Error as first used by + `Moreo and Sebastiani, 2021 `_. + Equivalent to: + + >>> ELM(svmperf_base, loss='mae', **kwargs) + + :param svmperf_base: path to the folder containing the binary files of `SVM perf` + :param kwargs: rest of SVM perf's parameters + """ + def __init__(self, svmperf_base=None, **kwargs): super(SVMAE, self).__init__(svmperf_base, loss='mae', **kwargs) class SVMRAE(ELM): + """ + SVM(RAE), which attempts to minimize Relative Absolute Error as first used by + `Moreo and Sebastiani, 2021 `_. + Equivalent to: + + >>> ELM(svmperf_base, loss='mrae', **kwargs) + + :param svmperf_base: path to the folder containing the binary files of `SVM perf` + :param kwargs: rest of SVM perf's parameters + """ + def __init__(self, svmperf_base=None, **kwargs): super(SVMRAE, self).__init__(svmperf_base, loss='mrae', **kwargs) class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier): + """ + Abstract class of Threshold Optimization variants for :class:`ACC` as proposed by + `Forman 2006 `_ and + `Forman 2008 `_. + The goal is to bring improved stability to the denominator of the adjustment. + The different variants are based on different heuristics for choosing a decision threshold + that would allow for more true positives and many more false positives, on the grounds this + would deliver larger denominators. + + :param learner: a sklearn's Estimator that generates a classifier + :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the + misclassification rates are to be estimated. 
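As a usage note for the ELM family defined above, training one of these quantifiers only requires pointing it to a patched SVMperf installation. The path below is hypothetical, and C is assumed to be one of the SVMperf parameters forwarded through **kwargs:

# Illustrative sketch (not part of the patch): using an ELM-based quantifier.
from quapy.method.aggregative import SVMQ

svmq = SVMQ(svmperf_base='./svm_perf_quantification', C=1.0)   # hypothetical install path
# svmq.fit(train)                       # train: a binary LabelledCollection
# prevalence = svmq.quantify(test_instances)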
+ This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of + validation data, or as an integer, indicating that the misclassification rates should be estimated via + `k`-fold cross validation (this integer stands for the number of folds `k`), or as a + :class:`quapy.data.base.LabelledCollection` (the split itself). + """ def __init__(self, learner: BaseEstimator, val_split=0.4): self.learner = learner self.val_split = val_split - @abstractmethod - def optimize_threshold(self, y, probabilities): - ... - def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None): self._check_binary(data, "Threshold Optimization") @@ -574,7 +792,7 @@ class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier): pbar.set_description(f'{self.__class__.__name__} fitting fold {k}') training = data.sampling_from_index(training_idx) validation = data.sampling_from_index(validation_idx) - learner, val_data = training_helper(self.learner, training, fit_learner, val_split=validation) + learner, val_data = _training_helper(self.learner, training, fit_learner, val_split=validation) probabilities.append(learner.predict_proba(val_data.instances)) y.append(val_data.labels) @@ -582,16 +800,16 @@ class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier): probabilities = np.concatenate(probabilities) # fit the learner on all data - self.learner, _ = training_helper(self.learner, data, fit_learner, val_split=None) + self.learner, _ = _training_helper(self.learner, data, fit_learner, val_split=None) else: - self.learner, val_data = training_helper(self.learner, data, fit_learner, val_split=val_split) + self.learner, val_data = _training_helper(self.learner, data, fit_learner, val_split=val_split) probabilities = self.learner.predict_proba(val_data.instances) y = val_data.labels self.cc = CC(self.learner) - self.tpr, self.fpr = self.optimize_threshold(y, probabilities) + self.tpr, self.fpr = self._optimize_threshold(y, probabilities) return self @@ -599,20 +817,32 @@ class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier): def _condition(self, tpr, fpr) -> float: """ Implements the criterion according to which the threshold should be selected. - This function should return a (float) score to be minimized. + This function should return the (float) score to be minimized. + + :param tpr: float, true positive rate + :param fpr: float, false positive rate + :return: float, a score for the given `tpr` and `fpr` """ ... - def optimize_threshold(self, y, probabilities): + def _optimize_threshold(self, y, probabilities): + """ + Seeks for the best `tpr` and `fpr` according to the score obtained at different + decision thresholds. The scoring function is implemented in function `_condition`. 
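The k-fold branch of fit above collects out-of-fold posterior probabilities before estimating tpr and fpr. An equivalent-in-spirit sketch with scikit-learn (not how the library does it internally, which splits its own LabelledCollection):

# Illustrative sketch (not part of the patch): out-of-fold posteriors for rate estimation.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

X, y = make_classification(n_samples=500, random_state=0)
probabilities = cross_val_predict(LogisticRegression(), X, y, cv=5, method='predict_proba')
# probabilities[:, 1] can now be swept over candidate decision thresholds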
+ + :param y: predicted labels for the validation set (or for the training set via `k`-fold cross validation) + :param probabilities: array-like with the posterior probabilities + :return: best `tpr` and `fpr` according to `_condition` + """ best_candidate_threshold_score = None best_tpr = 0 best_fpr = 0 candidate_thresholds = np.unique(probabilities[:, 1]) for candidate_threshold in candidate_thresholds: y_ = [self.classes_[1] if p > candidate_threshold else self.classes_[0] for p in probabilities[:, 1]] - TP, FP, FN, TN = self.compute_table(y, y_) - tpr = self.compute_tpr(TP, FP) - fpr = self.compute_fpr(FP, TN) + TP, FP, FN, TN = self._compute_table(y, y_) + tpr = self._compute_tpr(TP, FP) + fpr = self._compute_fpr(FP, TN) condition_score = self._condition(tpr, fpr) if best_candidate_threshold_score is None or condition_score < best_candidate_threshold_score: best_candidate_threshold_score = condition_score @@ -629,25 +859,40 @@ class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier): adjusted_prevs_estim = np.array((1 - adjusted_prevs_estim, adjusted_prevs_estim)) return adjusted_prevs_estim - def compute_table(self, y, y_): + def _compute_table(self, y, y_): TP = np.logical_and(y == y_, y == self.classes_[1]).sum() FP = np.logical_and(y != y_, y == self.classes_[0]).sum() FN = np.logical_and(y != y_, y == self.classes_[1]).sum() TN = np.logical_and(y == y_, y == self.classes_[0]).sum() return TP, FP, FN, TN - def compute_tpr(self, TP, FP): + def _compute_tpr(self, TP, FP): if TP + FP == 0: return 0 return TP / (TP + FP) - def compute_fpr(self, FP, TN): + def _compute_fpr(self, FP, TN): if FP + TN == 0: return 0 return FP / (FP + TN) class T50(ThresholdOptimization): + """ + Threshold Optimization variant for :class:`ACC` as proposed by + `Forman 2006 `_ and + `Forman 2008 `_ that looks + for the threshold that makes `tpr` cosest to 0.5. + The goal is to bring improved stability to the denominator of the adjustment. + + :param learner: a sklearn's Estimator that generates a classifier + :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the + misclassification rates are to be estimated. + This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of + validation data, or as an integer, indicating that the misclassification rates should be estimated via + `k`-fold cross validation (this integer stands for the number of folds `k`), or as a + :class:`quapy.data.base.LabelledCollection` (the split itself). + """ def __init__(self, learner: BaseEstimator, val_split=0.4): super().__init__(learner, val_split) @@ -657,6 +902,21 @@ class T50(ThresholdOptimization): class MAX(ThresholdOptimization): + """ + Threshold Optimization variant for :class:`ACC` as proposed by + `Forman 2006 `_ and + `Forman 2008 `_ that looks + for the threshold that maximizes `tpr-fpr`. + The goal is to bring improved stability to the denominator of the adjustment. + + :param learner: a sklearn's Estimator that generates a classifier + :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the + misclassification rates are to be estimated. 
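The sweep-and-adjust logic above can be condensed into a toy example. This sketch uses the textbook tpr/fpr definitions and a T50-style criterion (|tpr - 0.5|); the class's own _compute_tpr/_compute_fpr helpers and its clipping of the corrected estimate may differ in detail:

# Illustrative sketch (not part of the patch): threshold sweep plus Forman-style correction.
import numpy as np

def sweep_sketch(y, scores, condition):
    best_score, best_tpr, best_fpr = None, 0, 0
    for t in np.unique(scores):
        y_pred = (scores > t).astype(int)
        TP = np.sum((y_pred == 1) & (y == 1)); FP = np.sum((y_pred == 1) & (y == 0))
        FN = np.sum((y_pred == 0) & (y == 1)); TN = np.sum((y_pred == 0) & (y == 0))
        tpr = TP / (TP + FN) if (TP + FN) else 0
        fpr = FP / (FP + TN) if (FP + TN) else 0
        score = condition(tpr, fpr)
        if best_score is None or score < best_score:
            best_score, best_tpr, best_fpr = score, tpr, fpr
    return best_tpr, best_fpr

def adjust_sketch(prev_cc, tpr, fpr):
    if tpr == fpr:
        return prev_cc                       # correction undefined; keep the raw count
    return float(np.clip((prev_cc - fpr) / (tpr - fpr), 0, 1))

rng = np.random.default_rng(0)
y = rng.integers(0, 2, size=200)
scores = y * 0.3 + rng.random(200) * 0.7     # toy posterior scores in [0, 1]
tpr, fpr = sweep_sketch(y, scores, lambda tpr, fpr: abs(tpr - 0.5))   # T50-style criterion
print(adjust_sketch(prev_cc=(scores > 0.5).mean(), tpr=tpr, fpr=fpr))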
+ This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of + validation data, or as an integer, indicating that the misclassification rates should be estimated via + `k`-fold cross validation (this integer stands for the number of folds `k`), or as a + :class:`quapy.data.base.LabelledCollection` (the split itself). + """ def __init__(self, learner: BaseEstimator, val_split=0.4): super().__init__(learner, val_split) @@ -667,6 +927,21 @@ class MAX(ThresholdOptimization): class X(ThresholdOptimization): + """ + Threshold Optimization variant for :class:`ACC` as proposed by + `Forman 2006 `_ and + `Forman 2008 `_ that looks + for the threshold that yields `tpr=1-fpr`. + The goal is to bring improved stability to the denominator of the adjustment. + + :param learner: a sklearn's Estimator that generates a classifier + :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the + misclassification rates are to be estimated. + This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of + validation data, or as an integer, indicating that the misclassification rates should be estimated via + `k`-fold cross validation (this integer stands for the number of folds `k`), or as a + :class:`quapy.data.base.LabelledCollection` (the split itself). + """ def __init__(self, learner: BaseEstimator, val_split=0.4): super().__init__(learner, val_split) @@ -676,41 +951,70 @@ class X(ThresholdOptimization): class MS(ThresholdOptimization): + """ + Median Sweep. Threshold Optimization variant for :class:`ACC` as proposed by + `Forman 2006 `_ and + `Forman 2008 `_ that generates + class prevalence estimates for all decision thresholds and returns the median of them all. + The goal is to bring improved stability to the denominator of the adjustment. + :param learner: a sklearn's Estimator that generates a classifier + :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the + misclassification rates are to be estimated. + This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of + validation data, or as an integer, indicating that the misclassification rates should be estimated via + `k`-fold cross validation (this integer stands for the number of folds `k`), or as a + :class:`quapy.data.base.LabelledCollection` (the split itself). + """ def __init__(self, learner: BaseEstimator, val_split=0.4): super().__init__(learner, val_split) def _condition(self, tpr, fpr) -> float: pass - def optimize_threshold(self, y, probabilities): + def _optimize_threshold(self, y, probabilities): tprs = [] fprs = [] candidate_thresholds = np.unique(probabilities[:, 1]) for candidate_threshold in candidate_thresholds: y_ = [self.classes_[1] if p > candidate_threshold else self.classes_[0] for p in probabilities[:, 1]] - TP, FP, FN, TN = self.compute_table(y, y_) - tpr = self.compute_tpr(TP, FP) - fpr = self.compute_fpr(FP, TN) + TP, FP, FN, TN = self._compute_table(y, y_) + tpr = self._compute_tpr(TP, FP) + fpr = self._compute_fpr(FP, TN) tprs.append(tpr) fprs.append(fpr) return np.median(tprs), np.median(fprs) class MS2(MS): + """ + Median Sweep 2. 
Threshold Optimization variant for :class:`ACC` as proposed by + `Forman 2006 `_ and + `Forman 2008 `_ that generates + class prevalence estimates for all decision thresholds and returns the median of for cases in + which `tpr-fpr>0.25` + The goal is to bring improved stability to the denominator of the adjustment. + :param learner: a sklearn's Estimator that generates a classifier + :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the + misclassification rates are to be estimated. + This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of + validation data, or as an integer, indicating that the misclassification rates should be estimated via + `k`-fold cross validation (this integer stands for the number of folds `k`), or as a + :class:`quapy.data.base.LabelledCollection` (the split itself). + """ def __init__(self, learner: BaseEstimator, val_split=0.4): super().__init__(learner, val_split) - def optimize_threshold(self, y, probabilities): + def _optimize_threshold(self, y, probabilities): tprs = [0, 1] fprs = [0, 1] candidate_thresholds = np.unique(probabilities[:, 1]) for candidate_threshold in candidate_thresholds: y_ = [self.classes_[1] if p > candidate_threshold else self.classes_[0] for p in probabilities[:, 1]] - TP, FP, FN, TN = self.compute_table(y, y_) - tpr = self.compute_tpr(TP, FP) - fpr = self.compute_fpr(FP, TN) + TP, FP, FN, TN = self._compute_table(y, y_) + tpr = self._compute_tpr(TP, FP) + fpr = self._compute_fpr(FP, TN) if (tpr - fpr) > 0.25: tprs.append(tpr) fprs.append(fpr) @@ -722,6 +1026,7 @@ AdjustedClassifyAndCount = ACC ProbabilisticClassifyAndCount = PCC ProbabilisticAdjustedClassifyAndCount = PACC ExpectationMaximizationQuantifier = EMQ +SLD = EMQ HellingerDistanceY = HDy ExplicitLossMinimisation = ELM MedianSweep = MS @@ -730,11 +1035,14 @@ MedianSweep2 = MS2 class OneVsAll(AggregativeQuantifier): """ - Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary - quantifier for each class, and then l1-normalizes the outputs so that the class prevelences sum up to 1. - This variant was used, along with the ExplicitLossMinimization quantifier in - Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis. - Social Network Analysis and Mining 6(19), 1–22 (2016) + Allows any binary quantifier to perform quantification on single-label datasets. + The method maintains one binary quantifier for each class, and then l1-normalizes the outputs so that the + class prevelences sum up to 1. + This variant was used, along with the :class:`EMQ` quantifier, in + `Gao and Sebastiani, 2016 `_. + + :param learner: a sklearn's Estimator that generates a binary classifier + :param n_jobs: number of parallel workers """ def __init__(self, binary_quantifier, n_jobs=-1): @@ -753,18 +1061,30 @@ class OneVsAll(AggregativeQuantifier): return self def classify(self, instances): - # returns a matrix of shape (n,m) with n the number of instances and m the number of classes. The entry - # (i,j) is a binary value indicating whether instance i belongs to class j. The binary classifications are - # independent of each other, meaning that an instance can end up be attributed to 0, 1, or more classes. + """ + Returns a matrix of shape `(n,m,)` with `n` the number of instances and `m` the number of classes. The entry + `(i,j)` is a binary value indicating whether instance `i `belongs to class `j`. 
The binary classifications are + independent of each other, meaning that an instance can end up be attributed to 0, 1, or more classes. + + :param instances: array-like + :return: `np.ndarray` + """ + classif_predictions_bin = self.__parallel(self._delayed_binary_classification, instances) return classif_predictions_bin.T def posterior_probabilities(self, instances): - # returns a matrix of shape (n,m,2) with n the number of instances and m the number of classes. The entry - # (i,j,1) (resp. (i,j,0)) is a value in [0,1] indicating the posterior probability that instance i belongs - # (resp. does not belong) to class j. - # The posterior probabilities are independent of each other, meaning that, in general, they do not sum - # up to one. + """ + Returns a matrix of shape `(n,m,2)` with `n` the number of instances and `m` the number of classes. The entry + `(i,j,1)` (resp. `(i,j,0)`) is a value in [0,1] indicating the posterior probability that instance `i` belongs + (resp. does not belong) to class `j`. + The posterior probabilities are independent of each other, meaning that, in general, they do not sum + up to one. + + :param instances: array-like + :return: `np.ndarray` + """ + if not self.binary_quantifier.probabilistic: raise NotImplementedError(f'{self.__class__.__name__} does not implement posterior_probabilities because ' f'the base quantifier {self.binary_quantifier.__class__.__name__} is not ' @@ -811,7 +1131,7 @@ class OneVsAll(AggregativeQuantifier): return self.binary_quantifier.get_params() def _delayed_binary_classification(self, c, X): - return self.dict_binary_quantifiers[c].preclassify(X) + return self.dict_binary_quantifiers[c].classify(X) def _delayed_binary_posteriors(self, c, X): return self.dict_binary_quantifiers[c].posterior_probabilities(X) @@ -826,8 +1146,19 @@ class OneVsAll(AggregativeQuantifier): @property def binary(self): + """ + Informs that the classifier is not binary + + :return: False + """ return False @property def probabilistic(self): - return self.binary_quantifier.probabilistic + """ + Indicates if the classifier is probabilistic or not (depending on the nature of the base classifier). + + :return: boolean + """ + + return self.binary_quantifier.probabilistic \ No newline at end of file
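To close, an end-to-end sketch of the one-vs-all decomposition described above. It assumes quapy is importable, that the LabelledCollection(instances, labels) constructor and the fit/quantify interface behave as elsewhere in this module, and uses a synthetic dataset:

# Illustrative sketch (not part of the patch): OneVsAll wrapping a binary quantifier.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from quapy.data import LabelledCollection              # import path assumed
from quapy.method.aggregative import OneVsAll, HDy

X, y = make_classification(n_samples=1500, n_classes=3, n_informative=6, random_state=0)
train = LabelledCollection(X[:1000], y[:1000])
test_instances = X[1000:]

# one binary HDy quantifier per class; the outputs are l1-normalized so they sum up to 1
ova = OneVsAll(HDy(LogisticRegression()), n_jobs=1)
ova.fit(train)
print(ova.quantify(test_instances))                    # estimated prevalence per class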