improving the custom quantifier example
parent b3ccf71edb
commit b43eafa36f
import quapy as qp
from quapy.data import LabelledCollection
from quapy.method.base import BinaryQuantifier, BaseQuantifier
from quapy.model_selection import GridSearchQ
from quapy.method.aggregative import AggregativeSoftQuantifier
from quapy.protocol import APP
import numpy as np
from sklearn.linear_model import LogisticRegression
from time import time


# Define a custom quantifier: for this example, we will consider a new quantification algorithm that uses a
# logistic regressor for generating posterior probabilities, and then applies a custom threshold value to the
# posteriors. Since the quantifier internally uses a classifier, it is an aggregative quantifier; and since it
# relies on posterior probabilities, it is a probabilistic-aggregative quantifier (aka AggregativeSoftQuantifier).
# Note also that it has an internal hyperparameter (let's say, alpha), which is the decision threshold.
#
# Let's also assume the quantifier is binary, for simplicity. Any quantifier (i.e., any subclass of BaseQuantifier)
# is required to implement the "fit" and "quantify" methods. Aggregative quantifiers are special subtypes of base
# quantifiers, i.e., quantifiers that undertake a classification phase followed by an aggregation phase. QuaPy
# already implements most of the common functionality, and only requires the developer to implement the
# "aggregation_fit" and "aggregate" methods.
#
# We provide two implementations of the same method to illustrate this characteristic of QuaPy. Let us begin
# with the general case, in which we implement a (base) quantifier.

class MyQuantifier(BaseQuantifier):

    def __init__(self, classifier, alpha=0.5):
        self.alpha = alpha
        self.classifier = classifier

    # in general, we would need to implement the method fit(self, data: LabelledCollection, fit_classifier=True,
    # val_split=None); this would amount to:
    def fit(self, data: LabelledCollection):
        assert data.n_classes == 2, \
            'this quantifier is only valid for binary problems [abort]'
        self.classifier.fit(*data.Xy)
        return self

    # in general, we would need to implement the method quantify(self, instances); this would amount to:
    def quantify(self, instances):
        assert hasattr(self.classifier, 'predict_proba'), \
            'the underlying classifier is not probabilistic! [abort]'
        posterior_probabilities = self.classifier.predict_proba(instances)
        positive_probabilities = posterior_probabilities[:, 1]
        crisp_decisions = positive_probabilities > self.alpha
        pos_prev = crisp_decisions.mean()
        neg_prev = 1 - pos_prev
        return np.asarray([neg_prev, pos_prev])


# Note that the above implementation contains a lot of boilerplate code. Many parts can be omitted, since QuaPy
# provides implementations for them. Some of these routines (like, for example, training a classifier and generating
# posterior probabilities) are often carried out in a k-fold cross-validation manner. These, along with many other
# common routines, are already provided by highly optimized implementations in QuaPy. Let's see a much better
# implementation of the method, now adhering to AggregativeSoftQuantifier:

class MyAggregativeSoftQuantifier(AggregativeSoftQuantifier, BinaryQuantifier):

    def __init__(self, classifier, alpha=0.5):
        # aggregative quantifiers have an internal attribute called self.classifier
        self.classifier = classifier
        self.alpha = alpha

    # since this method is of type aggregative, we can simply implement the method aggregation_fit, which
    # assumes the classifier has already been fitted properly and the predictions for the training set required
    # to train the aggregation function have been properly generated (i.e., on a validation split, or using a
    # k-fold cross-validation strategy). What remains ahead is to learn an aggregation function. In our case,
    # this amounts to doing... nothing, since our method is pretty basic. BinaryQuantifier also adds some
    # basic functionality for checking binary consistency.
    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
        pass

    # since this method is of type aggregative, we can simply implement the method aggregate (i.e., we should
    # only describe what to do with the classifier predictions, which in this case are posterior probabilities
    # because we are inheriting from the "Soft" subtype). This comes down to:
    def aggregate(self, classif_predictions: np.ndarray):
        # the posterior probabilities have already been generated by the quantify method; we only need to
        # specify what to do with them
        positive_probabilities = classif_predictions[:, 1]
        crisp_decisions = positive_probabilities > self.alpha
        pos_prev = crisp_decisions.mean()
        neg_prev = 1 - pos_prev
        return np.asarray([neg_prev, pos_prev])

# a small example using these two implementations of our method

if __name__ == '__main__':

    qp.environ['SAMPLE_SIZE'] = 250

    # load the IMDb dataset
    train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
    train, val = train.split_stratified(train_prop=0.75)  # let's create a validation set for optimizing hyperparams

    def test_implementation(quantifier):
        class_name = quantifier.__class__.__name__
        print(f'\ntesting implementation {class_name}...')
        # model selection
        # let us assume we want to explore our hyperparameter alpha along with one hyperparameter of the classifier
        tinit = time()
        param_grid = {
            'alpha': np.linspace(0, 1, 11),  # quantifier-dependent hyperparameter
            'classifier__C': np.logspace(-2, 2, 5)  # classifier-dependent hyperparameter
        }
        gridsearch = GridSearchQ(quantifier, param_grid, protocol=APP(val), n_jobs=-1, verbose=False).fit(train)
        t_modsel = time() - tinit
        print(f'\tmodel selection took {t_modsel:.2f}s', flush=True)

        # evaluation
        optimized_model = gridsearch.best_model_
        mae = qp.evaluation.evaluate(
            optimized_model,
            protocol=APP(test, repeats=5000, sanity_check=None),  # disable the check, we want to generate many tests!
            error_metric='mae',
            verbose=True)
        t_eval = time() - t_modsel - tinit
        print(f'\tevaluation took {t_eval:.2f}s [MAE = {mae:.4f}]')

    # define an instance of our custom quantifier and test it!
    quantifier = MyQuantifier(LogisticRegression(), alpha=0.5)
    test_implementation(quantifier)

    # define an instance of our custom quantifier, with the second implementation, and test it!
    quantifier = MyAggregativeSoftQuantifier(LogisticRegression(), alpha=0.5)
    test_implementation(quantifier)

    # the output should look like this:
    """
    testing implementation MyQuantifier...
        model selection took 12.86s
    predicting: 100%|██████████| 105000/105000 [00:22<00:00, 4626.30it/s]
        evaluation took 22.75s [MAE = 0.0630]

    testing implementation MyAggregativeSoftQuantifier...
        model selection took 3.10s
    speeding up the prediction for the aggregative quantifier, total classifications 25000 instead of 26250000
    predicting: 100%|██████████| 105000/105000 [00:04<00:00, 22779.62it/s]
        evaluation took 4.66s [MAE = 0.0630]
    """

    # Note that the first implementation is much slower, both in terms of grid-search optimization and in terms of
    # evaluation. The reason is that QuaPy is highly optimized for aggregative quantifiers (by far, the most
    # popular type of quantification method), thus significantly speeding up model selection and test routines.
    # Furthermore, it is simpler to extend an aggregative quantifier, since QuaPy implements the boilerplate
    # functions for you.
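
    # (where do those numbers come from? assuming APP's default grid of 21 prevalence points, the evaluation
    # protocol with repeats=5000 draws 5000 x 21 = 105000 samples of SAMPLE_SIZE=250 instances each;
    # a non-aggregative quantifier must classify 105000 x 250 = 26,250,000 instances, whereas an aggregative one
    # classifies the 25000 test documents once and reuses those predictions across all samples)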

# Final remarks: this method is only for demonstration purposes and makes little sense in general. The method relies
# on a hyperparameter alpha for binarizing the posterior probabilities. A much better way of fulfilling this
# goal would be to calibrate the classifier (LogisticRegression is already reasonably well calibrated) and then
# simply cut at 0.5.
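
# A minimal sketch of that alternative (not part of the original example): wrap the logistic regressor in
# sklearn's CalibratedClassifierCV and keep the default cut at 0.5, so that alpha needs no tuning at all.
# The helper name make_calibrated_quantifier is hypothetical and only meant for illustration.
from sklearn.calibration import CalibratedClassifierCV

def make_calibrated_quantifier():
    # sigmoid (Platt) calibration of the posteriors, estimated via 5-fold cross-validation
    calibrated_lr = CalibratedClassifierCV(LogisticRegression(), method='sigmoid', cv=5)
    # with calibrated posteriors, cutting at 0.5 is the natural choice, so alpha is fixed rather than tuned
    return MyAggregativeSoftQuantifier(calibrated_lr, alpha=0.5)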