import quapy as qp
from quapy.method._kdey import KDEyML
from quapy.method.non_aggregative import DMx
from quapy.protocol import APP, UPP
from quapy.method.aggregative import DMy
from sklearn.linear_model import LogisticRegression
from examples.comparing_gridsearch import OLD_GridSearchQ
import numpy as np
from time import time

"""
In this example, we show how to perform model selection on a KDEy quantifier (a kernel-density
variant of the distribution-matching approach).
"""

model = KDEyML(LogisticRegression())

qp.environ['SAMPLE_SIZE'] = 100  # default size of the samples generated by the protocols
qp.environ['N_JOBS'] = -1        # use all available cores

# training, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
training, test = qp.datasets.fetch_UCIMulticlassDataset('letter').train_test
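
# A quick, optional look at the data before model selection (a sketch; it assumes the standard
# LabelledCollection API, which exposes the classes and their prevalence values).
print(f'classes: {training.classes_}')
print(f'training prevalence: {training.prevalence()}')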

# temp_seed fixes the random seed within the block, making the selection process reproducible
with qp.util.temp_seed(0):

    # The model will be returned by the fit method of GridSearchQ.
    # Every combination of hyper-parameters will be evaluated by confronting the
    # quantifier thus configured against a series of samples generated by means
    # of a sample generation protocol. For this example, we will use the
    # uniform-prevalence protocol (UPP), which generates samples whose prevalence
    # values are drawn uniformly at random from the simplex of valid prevalence vectors.
    # We devote 30% of the dataset to this exploration.
    training, validation = training.split_stratified(train_prop=0.7)
    protocol = UPP(validation)
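
    # Alternative sketch (not used here): the artificial-prevalence protocol (APP) generates
    # samples whose prevalence values come from a fixed grid (e.g., [0, 0.1, 0.2, ..., 1]);
    # the argument names below are assumed from QuaPy's APP signature.
    # protocol = APP(validation, n_prevalences=21, repeats=10)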

    # We will explore a classification-dependent hyper-parameter (e.g., the 'C'
    # hyper-parameter of LogisticRegression) and a quantification-dependent hyper-parameter
    # (e.g., the bandwidth of the kernel density estimator in a KDEy quantifier).
    # Classifier-dependent hyper-parameters have to be marked with the prefix "classifier__"
    # in order to let the quantifier know that the hyper-parameter belongs to its underlying
    # classifier.
    param_grid = {
        'classifier__C': np.logspace(-3, 3, 7),
        'classifier__class_weight': ['balanced', None],
        'bandwidth': np.linspace(0.01, 0.2, 20),
    }
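
    # For reference, this grid amounts to 7 (C) x 2 (class_weight) x 20 (bandwidth) = 280
    # configurations, each of which will be evaluated on every sample generated by the protocol.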

    tinit = time()

    # model = OLD_GridSearchQ(
    model = qp.model_selection.GridSearchQ(
        model=model,
        param_grid=param_grid,
        protocol=protocol,
        error='mae',  # the error to optimize is the MAE (a quantification-oriented loss)
        refit=False,  # do not retrain the best configuration on the whole labelled set once done
        # raise_errors=False,
        verbose=True  # show information as the process goes on
    ).fit(training)

tend = time()

print(f'model selection ended: best hyper-parameters={model.best_params_}')
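# The best score found during the search should also be available; the attribute name below
# (best_score_) is assumed by analogy with best_params_ and best_model_.
# print(f'best validation MAE={model.best_score_:.5f}')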
model = model.best_model_

# evaluation in terms of MAE
# we use the same evaluation protocol (UPP) on the test set
mae_score = qp.evaluation.evaluate(model, protocol=UPP(test), error_metric='mae')

print(f'MAE={mae_score:.5f}')
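
# Alternative sketch (assuming QuaPy's evaluation_report helper, which returns a pandas DataFrame
# with one row per test sample): several error metrics can be reported at once.
# report = qp.evaluation.evaluation_report(model, protocol=UPP(test), error_metrics=['mae', 'mrae'])
# print(report[['mae', 'mrae']].mean())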
print(f'model selection took {tend-tinit:.1f}s')