diff --git a/examples/custom_quantifier.py b/examples/custom_quantifier.py
index fa014de..9c89714 100644
--- a/examples/custom_quantifier.py
+++ b/examples/custom_quantifier.py
@@ -1,33 +1,79 @@
 import quapy as qp
 from quapy.data import LabelledCollection
-from quapy.method.base import BinaryQuantifier
+from quapy.method.base import BinaryQuantifier, BaseQuantifier
 from quapy.model_selection import GridSearchQ
 from quapy.method.aggregative import AggregativeSoftQuantifier
 from quapy.protocol import APP
 import numpy as np
 from sklearn.linear_model import LogisticRegression
+from time import time
 
 
 # Define a custom quantifier: for this example, we will consider a new quantification algorithm that uses a
 # logistic regressor for generating posterior probabilities, and then applies a custom threshold value to the
 # posteriors. Since the quantifier internally uses a classifier, it is an aggregative quantifier; and since it
-# relies on posterior probabilities, it is a probabilistic-aggregative quantifier. Note also it has an
-# internal hyperparameter (let say, alpha) which is the decision threshold. Let's also assume the quantifier
-# is binary, for simplicity.
+# relies on posterior probabilities, it is a probabilistic-aggregative quantifier (aka AggregativeSoftQuantifier).
+# Note also that it has an internal hyperparameter (let's say, alpha) which is the decision threshold.
+#
+# Let's also assume the quantifier is binary, for simplicity. Any quantifier (i.e., any subclass of BaseQuantifier)
+# is required to implement the "fit" and "quantify" methods. Aggregative quantifiers are special subtypes of base
+# quantifiers, i.e., quantifiers that undertake a classification phase followed by an aggregation phase. QuaPy
+# already implements most of the common functionality, and requires the developer to simply implement the
+# "aggregation_fit" and the "aggregate" methods.
+#
+# We are providing two implementations of the same method to illustrate this characteristic of QuaPy. Let us begin
+# with the general case, in which we implement a (base) quantifier.
+
+class MyQuantifier(BaseQuantifier):
 
-class MyQuantifier(AggregativeSoftQuantifier, BinaryQuantifier):
     def __init__(self, classifier, alpha=0.5):
         self.alpha = alpha
-        # aggregative quantifiers have an internal self.classifier attribute
         self.classifier = classifier
 
-    def fit(self, data: LabelledCollection, fit_classifier=True):
-        assert fit_classifier, 'this quantifier needs to fit the classifier!'
+    # in general, we would need to implement the method fit(self, data: LabelledCollection, fit_classifier=True,
+    # val_split=None); this would amount to:
+    def fit(self, data: LabelledCollection):
+        assert data.n_classes == 2, \
+            'this quantifier is only valid for binary problems [abort]'
         self.classifier.fit(*data.Xy)
         return self
 
-    # in general, we would need to implement the method quantify(self, instances) but, since this method is of
-    # type aggregative, we can simply implement the method aggregate, which has the following interface
+    # in general, we would need to implement the method quantify(self, instances); this would amount to:
+    def quantify(self, instances):
+        assert hasattr(self.classifier, 'predict_proba'), \
+            'the underlying classifier is not probabilistic! [abort]'
+        posterior_probabilities = self.classifier.predict_proba(instances)
+        positive_probabilities = posterior_probabilities[:, 1]
+        crisp_decisions = positive_probabilities > self.alpha
+        pos_prev = crisp_decisions.mean()
+        neg_prev = 1 - pos_prev
+        return np.asarray([neg_prev, pos_prev])
+
+
+# Note that the above implementation contains a lot of boilerplate code. Many parts can be omitted since QuaPy
+# provides implementations for them. Some of these routines (like, for example, training a classifier and generating
+# posterior probabilities) are often carried out in a k-fold cross-validation manner. These, along with many other
+# common routines, are already provided by highly optimized implementations in QuaPy. Let's see a much better
+# implementation of the method, now adhering to the AggregativeSoftQuantifier interface:
+
+class MyAggregativeSoftQuantifier(AggregativeSoftQuantifier, BinaryQuantifier):
+    def __init__(self, classifier, alpha=0.5):
+        # aggregative quantifiers have an internal attribute called self.classifier
+        self.classifier = classifier
+        self.alpha = alpha
+
+    # since this method is of type aggregative, we can simply implement the method aggregation_fit, which
+    # assumes the classifier has already been fitted properly and that the predictions for the training set
+    # required to train the aggregation function have already been generated (i.e., on a validation split, or
+    # using a k-fold cross-validation strategy). What remains is to learn the aggregation function; in our case
+    # this amounts to doing... nothing, since our method was pretty basic. BinaryQuantifier also adds some
+    # basic functionality for checking binary consistency.
+    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+        pass
+
+    # since this method is of type aggregative, we can simply implement the method aggregate (i.e., we should
+    # only describe what to do with the classifier predictions --which in this case are posterior probabilities,
+    # because we are inheriting from the "Soft" subtype). This comes down to:
     def aggregate(self, classif_predictions: np.ndarray):
         # the posterior probabilities have already been generated by the quantify method; we only need to
         # specify what to do with them
@@ -38,31 +84,68 @@ class MyQuantifier(AggregativeSoftQuantifier, BinaryQuantifier):
         return np.asarray([neg_prev, pos_prev])
 
 
+# a small example using these two implementations of our method
+
 if __name__ == '__main__':
-    qp.environ['SAMPLE_SIZE'] = 100
-
-    # define an instance of our custom quantifier
-    quantifier = MyQuantifier(LogisticRegression(), alpha=0.5)
+    qp.environ['SAMPLE_SIZE'] = 250
 
     # load the IMDb dataset
     train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
+    train, val = train.split_stratified(train_prop=0.75)  # let's create a validation set for optimizing hyperparams
 
-    # model selection
-    # let us assume we want to explore our hyperparameter alpha along with one hyperparameter of the classifier
-    train, val = train.split_stratified(train_prop=0.75)
-    param_grid = {
-        'alpha': np.linspace(0, 1, 11),  # quantifier-dependent hyperparameter
-        'classifier__C': np.logspace(-2, 2, 5)  # classifier-dependent hyperparameter
-    }
-    quantifier = GridSearchQ(quantifier, param_grid, protocol=APP(val), n_jobs=-1, verbose=True).fit(train)
+    def test_implementation(quantifier):
+        class_name = quantifier.__class__.__name__
+        print(f'\ntesting implementation {class_name}...')
+        # model selection
+        # let us assume we want to explore our hyperparameter alpha along with one hyperparameter of the classifier
+        tinit = time()
+        param_grid = {
+            'alpha': np.linspace(0, 1, 11),  # quantifier-dependent hyperparameter
+            'classifier__C': np.logspace(-2, 2, 5)  # classifier-dependent hyperparameter
+        }
+        gridsearch = GridSearchQ(quantifier, param_grid, protocol=APP(val), n_jobs=-1, verbose=False).fit(train)
+        t_modsel = time() - tinit
+        print(f'\tmodel selection took {t_modsel:.2f}s', flush=True)
 
-    # evaluation
-    mae = qp.evaluation.evaluate(quantifier, protocol=APP(test), error_metric='mae')
+        # evaluation
+        optimized_model = gridsearch.best_model_
+        mae = qp.evaluation.evaluate(
+            optimized_model,
+            protocol=APP(test, repeats=5000, sanity_check=None),  # disable the check, we want to generate many tests!
+            error_metric='mae',
+            verbose=True)
 
-    print(f'MAE = {mae:.4f}')
+        t_eval = time() - t_modsel - tinit
+        print(f'\tevaluation took {t_eval:.2f}s [MAE = {mae:.4f}]')
 
-    # final remarks: this method is only for demonstration purposes and makes little sense in general. The method relies
+    # define an instance of our custom quantifier and test it!
+    quantifier = MyQuantifier(LogisticRegression(), alpha=0.5)
+    test_implementation(quantifier)
+
+    # define an instance of our custom quantifier, with the second implementation, and test it!
+    quantifier = MyAggregativeSoftQuantifier(LogisticRegression(), alpha=0.5)
+    test_implementation(quantifier)
+
+    # the output should look like this:
+    """
+    testing implementation MyQuantifier...
+        model selection took 12.86s
+    predicting: 100%|██████████| 105000/105000 [00:22<00:00, 4626.30it/s]
+        evaluation took 22.75s [MAE = 0.0630]
+
+    testing implementation MyAggregativeSoftQuantifier...
+        model selection took 3.10s
+    speeding up the prediction for the aggregative quantifier, total classifications 25000 instead of 26250000
+    predicting: 100%|██████████| 105000/105000 [00:04<00:00, 22779.62it/s]
+        evaluation took 4.66s [MAE = 0.0630]
+    """
+    # Note that the first implementation is much slower, both in terms of grid-search optimization and in terms of
+    # evaluation. The reason is that QuaPy is highly optimized for aggregative quantifiers (by far, the most
+    # popular type of quantification methods), thus significantly speeding up model selection and test routines.
+    # Furthermore, it is simpler to extend an aggregative quantifier, since QuaPy implements the boilerplate functions for you.
+
+    # Final remarks: this method is only for demonstration purposes and makes little sense in general. The method relies
     # on an hyperparameter alpha for binarizing the posterior probabilities. A much better way for fulfilling this
     # goal would be to calibrate the classifier (LogisticRegression is already reasonably well calibrated) and then
    # simply cut at 0.5.
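The calibration-based alternative hinted at in the closing comment is not shown in the example file itself. The following is a minimal, illustrative sketch of what it could look like, not part of the diff above: it reuses the MyAggregativeSoftQuantifier class and the train/test splits defined in the example's main block, and the choice of sklearn's CalibratedClassifierCV with sigmoid scaling and 5-fold CV is an assumption made here for illustration only.

```python
# Illustrative sketch (not part of the example file): calibrate the classifier and then
# simply cut the posteriors at the default threshold alpha=0.5, as the final remarks suggest.
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression

# wrap the logistic regressor with a probability-calibration layer (5-fold CV, sigmoid scaling)
calibrated_lr = CalibratedClassifierCV(LogisticRegression(), method='sigmoid', cv=5)

# reuse the aggregative implementation from the example; alpha=0.5 now acts on calibrated posteriors
quantifier = MyAggregativeSoftQuantifier(calibrated_lr, alpha=0.5)
quantifier.fit(train)
estimated_prevalence = quantifier.quantify(test.instances)
print(estimated_prevalence)  # [neg_prev, pos_prev]
```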