import quapy as qp
from quapy.data import LabelledCollection
from quapy.method.base import BinaryQuantifier, BaseQuantifier
from quapy.model_selection import GridSearchQ
from quapy.method.aggregative import AggregativeSoftQuantifier
from quapy.protocol import APP
import numpy as np
from sklearn.linear_model import LogisticRegression
from time import time


# Define a custom quantifier: for this example, we will consider a new quantification algorithm that uses a
# logistic regressor for generating posterior probabilities, and then applies a custom threshold value to the
# posteriors. Since the quantifier internally uses a classifier, it is an aggregative quantifier; and since it
# relies on posterior probabilities, it is a probabilistic-aggregative quantifier (aka AggregativeSoftQuantifier).
# Note also that it has an internal hyperparameter (let us say, alpha) which is the decision threshold.
#
# Let's also assume the quantifier is binary, for simplicity. Any quantifier (i.e., any subclass of BaseQuantifier)
# is required to implement the "fit" and "quantify" methods. Aggregative quantifiers are special subtypes of base
# quantifiers, i.e., quantifiers that undertake a classification phase followed by an aggregation phase. QuaPy
# already implements most of the common functionality, and only requires the developer to implement the
# "aggregation_fit" and "aggregate" methods.
#
# We are providing two implementations of the same method to illustrate this characteristic of QuaPy. Let us begin
# with the general case, in which we implement a (base) quantifier.

class MyQuantifier(BaseQuantifier):
    def __init__(self, classifier, alpha=0.5):
        self.alpha = alpha
        self.classifier = classifier

    # in general, we would need to implement the method fit(self, data: LabelledCollection, fit_classifier=True,
    # val_split=None); this would amount to:
    def fit(self, data: LabelledCollection):
        assert data.n_classes == 2, \
            'this quantifier is only valid for binary problems [abort]'
        self.classifier.fit(*data.Xy)
        return self

    # in general, we would need to implement the method quantify(self, instances); this would amount to:
    def quantify(self, instances):
        assert hasattr(self.classifier, 'predict_proba'), \
            'the underlying classifier is not probabilistic! [abort]'
        posterior_probabilities = self.classifier.predict_proba(instances)
        positive_probabilities = posterior_probabilities[:, 1]
        crisp_decisions = positive_probabilities > self.alpha
        pos_prev = crisp_decisions.mean()
        neg_prev = 1 - pos_prev
        return np.asarray([neg_prev, pos_prev])


# Note that the above implementation contains a lot of boilerplate code. Many parts can be omitted since QuaPy
# provides implementations for them. Some of these routines (like, for example, training a classifier and generating
# posterior probabilities) are often carried out in a k-fold cross-validation manner. These, along with many other
# common routines, are already provided by highly optimized routines in QuaPy.
# Let's see a much better implementation of the method, now adhering to the AggregativeSoftQuantifier interface:

class MyAggregativeSoftQuantifier(AggregativeSoftQuantifier, BinaryQuantifier):
    def __init__(self, classifier, alpha=0.5):
        # aggregative quantifiers have an internal attribute called self.classifier
        self.classifier = classifier
        self.alpha = alpha

    # since this method is of type aggregative, we can simply implement the method aggregation_fit, which
    # assumes the classifier has already been fitted properly, and that the predictions for the training set
    # required to train the aggregation function have been properly generated (i.e., on a validation split, or
    # using a k-fold cross-validation strategy). What remains ahead is to learn an aggregation function. In our
    # case this amounts to doing... nothing, since our method is pretty basic. BinaryQuantifier also adds some
    # basic functionality for checking binary consistency.
    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
        pass

    # since this method is of type aggregative, we can simply implement the method aggregate (i.e., we should
    # only describe what to do with the classifier predictions --which in this case are posterior probabilities,
    # because we are inheriting from the "Soft" subtype). This comes down to:
    def aggregate(self, classif_predictions: np.ndarray):
        # the posterior probabilities have already been generated by the quantify method; we only need to
        # specify what to do with them
        positive_probabilities = classif_predictions[:, 1]
        crisp_decisions = positive_probabilities > self.alpha
        pos_prev = crisp_decisions.mean()
        neg_prev = 1 - pos_prev
        return np.asarray([neg_prev, pos_prev])


# a small example using these two implementations of our method
if __name__ == '__main__':

    qp.environ['SAMPLE_SIZE'] = 250

    # load the IMDb dataset
    train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
    train, val = train.split_stratified(train_prop=0.75)  # let's create a validation set for optimizing hyperparams

    def test_implementation(quantifier):
        class_name = quantifier.__class__.__name__
        print(f'\ntesting implementation {class_name}...')

        # model selection:
        # let us assume we want to explore our hyperparameter alpha along with one hyperparameter of the classifier
        tinit = time()
        param_grid = {
            'alpha': np.linspace(0, 1, 11),         # quantifier-dependent hyperparameter
            'classifier__C': np.logspace(-2, 2, 5)  # classifier-dependent hyperparameter
        }
        gridsearch = GridSearchQ(quantifier, param_grid, protocol=APP(val), n_jobs=-1, verbose=False).fit(train)
        t_modsel = time() - tinit
        print(f'\tmodel selection took {t_modsel:.2f}s', flush=True)

        # evaluation
        optimized_model = gridsearch.best_model_
        mae = qp.evaluation.evaluate(
            optimized_model,
            protocol=APP(test, repeats=5000, sanity_check=None),  # disable the check, we want to generate many tests!
            error_metric='mae',
            verbose=True)
        t_eval = time() - t_modsel - tinit
        print(f'\tevaluation took {t_eval:.2f}s [MAE = {mae:.4f}]')

    # define an instance of our custom quantifier and test it!
    quantifier = MyQuantifier(LogisticRegression(), alpha=0.5)
    test_implementation(quantifier)

    # define an instance of our custom quantifier, with the second implementation, and test it!
    quantifier = MyAggregativeSoftQuantifier(LogisticRegression(), alpha=0.5)
    test_implementation(quantifier)

    # the output should look like this:
    """
    testing implementation MyQuantifier...
        model selection took 12.86s
    predicting: 100%|██████████| 105000/105000 [00:22<00:00, 4626.30it/s]
        evaluation took 22.75s [MAE = 0.0630]

    testing implementation MyAggregativeSoftQuantifier...
        model selection took 3.10s
    speeding up the prediction for the aggregative quantifier, total classifications 25000 instead of 26250000
    predicting: 100%|██████████| 105000/105000 [00:04<00:00, 22779.62it/s]
        evaluation took 4.66s [MAE = 0.0630]
    """

    # Note that the first implementation is much slower, both in terms of grid-search optimization and in terms of
    # evaluation. The reason is that QuaPy is highly optimized for aggregative quantifiers (by far the most
    # popular type of quantification method), which significantly speeds up model selection and test routines.
    # Furthermore, it is simpler to extend an aggregative type, since QuaPy implements the boilerplate functions
    # for you.

# Final remarks: this method is only for demonstration purposes and makes little sense in general. The method relies
# on a hyperparameter alpha for binarizing the posterior probabilities. A much better way of fulfilling this
# goal would be to calibrate the classifier (LogisticRegression is already reasonably well calibrated) and then
# simply cut at 0.5.
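
# As an illustrative sketch of that last remark (not part of the original example): one possible way to follow
# the advice is to wrap the classifier in scikit-learn's CalibratedClassifierCV and keep the threshold fixed at
# alpha=0.5. The helper name below (calibrated_variant) is hypothetical; the function is only defined, never
# called, and is meant as a sketch rather than a definitive implementation.
def calibrated_variant():
    from sklearn.calibration import CalibratedClassifierCV
    # calibrate the posteriors of LogisticRegression via cross-validation (sigmoid calibration by default)
    calibrated_lr = CalibratedClassifierCV(LogisticRegression(), cv=5)
    # with (approximately) calibrated posteriors, cutting at 0.5 is the natural choice, so alpha needs no tuning
    return MyAggregativeSoftQuantifier(calibrated_lr, alpha=0.5)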