QuaPy/examples/custom_quantifier.py

import quapy as qp
from quapy.data import LabelledCollection
from quapy.method.base import BinaryQuantifier
from quapy.model_selection import GridSearchQ
from quapy.method.aggregative import AggregativeProbabilisticQuantifier
from quapy.protocol import APP
import numpy as np
from sklearn.linear_model import LogisticRegression


# Define a custom quantifier: for this example, we will consider a new quantification algorithm that uses a
# logistic regressor to generate posterior probabilities, and then applies a custom threshold value to the
# posteriors. Since the quantifier internally uses a classifier, it is an aggregative quantifier; and since it
# relies on posterior probabilities, it is a probabilistic-aggregative quantifier. Note also that it has an
# internal hyperparameter (let's say, alpha), which is the decision threshold. Let's also assume the quantifier
# is binary, for simplicity.

class MyQuantifier(AggregativeProbabilisticQuantifier, BinaryQuantifier):
    """Binary quantifier that thresholds the posteriors of an internal classifier.

    An instance counts as positive when its posterior probability for the positive class (column 1 of the
    posteriors) is strictly greater than ``alpha``. The positive prevalence is the fraction of such instances.
    """

    def __init__(self, classifier, alpha=0.5):
        """Store the underlying classifier and the decision threshold ``alpha``."""
        # aggregative quantifiers keep their learner in self.classifier
        self.classifier = classifier
        # threshold later compared against the positive-class posteriors
        self.alpha = alpha

    def fit(self, data: LabelledCollection, fit_classifier=True):
        """Fit the internal classifier on ``data`` and return this quantifier.

        ``fit_classifier`` must be truthy: nothing else is learned here, so a pre-trained classifier
        is not supported and the assertion below rejects that request.
        """
        assert fit_classifier, 'this quantifier needs to fit the classifier!'
        training_pair = data.Xy
        self.classifier.fit(*training_pair)
        return self

    # the general entry point would be quantify(self, instances); since this is an aggregative method,
    # implementing aggregate (with the interface below) is enough
    def aggregate(self, classif_predictions: np.ndarray):
        """Turn posterior probabilities into the prevalence vector ``[negative, positive]``."""
        # the posteriors arrive already computed; all that is left is to decide how to combine them
        above_threshold = classif_predictions[:, 1] > self.alpha
        prevalence_pos = above_threshold.mean()
        return np.asarray([1 - prevalence_pos, prevalence_pos])


if __name__ == '__main__':

    # set the sample size in QuaPy's global environment
    qp.environ['SAMPLE_SIZE'] = 100

    # our custom quantifier, wrapping a logistic regressor, with an initial threshold of 0.5
    base_quantifier = MyQuantifier(LogisticRegression(), alpha=0.5)

    # fetch the IMDb reviews dataset (tf-idf features, min_df=5) and take its train/test partition
    dataset = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5)
    train, test = dataset.train_test

    # model selection: explore our own hyperparameter alpha together with one hyperparameter of the
    # classifier; the training set is first split (train_prop=0.75) into a training and a validation part
    train, val = train.split_stratified(train_prop=0.75)
    param_grid = dict()
    param_grid['alpha'] = np.linspace(0, 1, 11)          # quantifier-dependent hyperparameter
    param_grid['classifier__C'] = np.logspace(-2, 2, 5)  # classifier-dependent hyperparameter
    model_selection = GridSearchQ(base_quantifier, param_grid, protocol=APP(val), n_jobs=-1, verbose=True)
    quantifier = model_selection.fit(train)

    # evaluation on the test set, reported as mean absolute error
    mae = qp.evaluation.evaluate(quantifier, protocol=APP(test), error_metric='mae')
    print(f'MAE = {mae:.4f}')

    # Final remarks: this method is for demonstration purposes only and makes little sense in general. The method
    # relies on a hyperparameter alpha for binarizing the posterior probabilities. A much better way of achieving
    # this goal would be to calibrate the classifier (LogisticRegression is already reasonably well calibrated) and
    # then simply cut at 0.5.
adding documentation and adding one new example 2023-02-08 19:06:53 +01:00			`import quapy as qp`
import fix 2023-02-14 19:15:59 +01:00			`from quapy.data import LabelledCollection`
			`from quapy.method.base import BinaryQuantifier`
			`from quapy.model_selection import GridSearchQ`
			`from quapy.method.aggregative import AggregativeProbabilisticQuantifier`
adding documentation and adding one new example 2023-02-08 19:06:53 +01:00			`from quapy.protocol import APP`
			`import numpy as np`
			`from sklearn.linear_model import LogisticRegression`


			`# Define a custom quantifier: for this example, we will consider a new quantification algorithm that uses a`
			`# logistic regressor for generating posterior probabilities, and then applies a custom threshold value to the`
			`# posteriors. Since the quantifier internally uses a classifier, it is an aggregative quantifier; and since it`
fixing bugs in one-vs-all 2023-02-10 19:02:17 +01:00			`# relies on posterior probabilities, it is a probabilistic-aggregative quantifier. Note also it has an`
adding documentation and adding one new example 2023-02-08 19:06:53 +01:00			`# internal hyperparameter (let say, alpha) which is the decision threshold. Let's also assume the quantifier`
			`# is binary, for simplicity.`

			`class MyQuantifier(AggregativeProbabilisticQuantifier, BinaryQuantifier):`
			`def __init__(self, classifier, alpha=0.5):`
			`self.alpha = alpha`
			`# aggregative quantifiers have an internal self.classifier attribute`
			`self.classifier = classifier`

			`def fit(self, data: LabelledCollection, fit_classifier=True):`
			`assert fit_classifier, 'this quantifier needs to fit the classifier!'`
			`self.classifier.fit(*data.Xy)`
			`return self`

			`# in general, we would need to implement the method quantify(self, instances) but, since this method is of`
			`# type aggregative, we can simply implement the method aggregate, which has the following interface`
			`def aggregate(self, classif_predictions: np.ndarray):`
			`# the posterior probabilities have already been generated by the quantify method; we only need to`
			`# specify what to do with them`
			`positive_probabilities = classif_predictions[:, 1]`
			`crisp_decisions = positive_probabilities > self.alpha`
			`pos_prev = crisp_decisions.mean()`
			`neg_prev = 1-pos_prev`
			`return np.asarray([neg_prev, pos_prev])`


			`if __name__ == '__main__':`

			`qp.environ['SAMPLE_SIZE'] = 100`

			`# define an instance of our custom quantifier`
			`quantifier = MyQuantifier(LogisticRegression(), alpha=0.5)`

			`# load the IMDb dataset`
			`train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test`

			`# model selection`
			`# let us assume we want to explore our hyperparameter alpha along with one hyperparameter of the classifier`
fixing bugs in one-vs-all 2023-02-10 19:02:17 +01:00			`train, val = train.split_stratified(train_prop=0.75)`
adding documentation and adding one new example 2023-02-08 19:06:53 +01:00			`param_grid = {`
fixing bugs in one-vs-all 2023-02-10 19:02:17 +01:00			`'alpha': np.linspace(0, 1, 11), # quantifier-dependent hyperparameter`
			`'classifier__C': np.logspace(-2, 2, 5) # classifier-dependent hyperparameter`
adding documentation and adding one new example 2023-02-08 19:06:53 +01:00			`}`
			`quantifier = GridSearchQ(quantifier, param_grid, protocol=APP(val), n_jobs=-1, verbose=True).fit(train)`

			`# evaluation`
			`mae = qp.evaluation.evaluate(quantifier, protocol=APP(test), error_metric='mae')`

			`print(f'MAE = {mae:.4f}')`

			`# final remarks: this method is only for demonstration purposes and makes little sense in general. The method relies`
			`# on an hyperparameter alpha for binarizing the posterior probabilities. A much better way for fulfilling this`
			`# goal would be to calibrate the classifier (LogisticRegression is already reasonably well calibrated) and then`
			`# simply cut at 0.5.`