import quapy as qp
from quapy.method._kdey import KDEyML
from quapy.method.non_aggregative import DMx
from quapy.protocol import APP, UPP
from quapy.method.aggregative import DMy
from sklearn.linear_model import LogisticRegression
from examples.comparing_gridsearch import OLD_GridSearchQ
import numpy as np
from time import time

"""
In this example, we show how to perform model selection on a KDEy quantifier (a kernel-density
variant of the distribution-matching approach).
"""

model = KDEyML(LogisticRegression())

qp.environ['SAMPLE_SIZE'] = 100  # default size of the samples generated by the protocols
qp.environ['N_JOBS'] = -1        # use all available cores

# training, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
training, test = qp.datasets.fetch_UCIMulticlassDataset('letter').train_test
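
# A quick, optional look at the data before model selection (a sketch; it assumes the standard
# LabelledCollection API, which exposes the classes and their prevalence values).
print(f'classes: {training.classes_}')
print(f'training prevalence: {training.prevalence()}')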

# temp_seed fixes the random seed within the block, making the selection process reproducible
with qp.util.temp_seed(0):

    # The model will be returned by the fit method of GridSearchQ.
    # Every combination of hyper-parameters will be evaluated by confronting the
    # quantifier thus configured against a series of samples generated by means
    # of a sample generation protocol. For this example, we will use the
    # uniform-prevalence protocol (UPP), which generates samples whose prevalence
    # values are drawn uniformly at random from the simplex of valid prevalence vectors.
    # We devote 30% of the dataset to this exploration.
    training, validation = training.split_stratified(train_prop=0.7)
    protocol = UPP(validation)
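
    # Alternative sketch (not used here): the artificial-prevalence protocol (APP) generates
    # samples whose prevalence values come from a fixed grid (e.g., [0, 0.1, 0.2, ..., 1]);
    # the argument names below are assumed from QuaPy's APP signature.
    # protocol = APP(validation, n_prevalences=21, repeats=10)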

    # We will explore a classification-dependent hyper-parameter (e.g., the 'C'
    # hyper-parameter of LogisticRegression) and a quantification-dependent hyper-parameter
    # (e.g., the bandwidth of the kernel density estimator in a KDEy quantifier).
    # Classifier-dependent hyper-parameters have to be marked with the prefix "classifier__"
    # in order to let the quantifier know that the hyper-parameter belongs to its underlying
    # classifier.
    param_grid = {
        'classifier__C': np.logspace(-3, 3, 7),
        'classifier__class_weight': ['balanced', None],
        'bandwidth': np.linspace(0.01, 0.2, 20),
    }
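
    # For reference, this grid amounts to 7 (C) x 2 (class_weight) x 20 (bandwidth) = 280
    # configurations, each of which will be evaluated on every sample generated by the protocol.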

    tinit = time()

    # model = OLD_GridSearchQ(
    model = qp.model_selection.GridSearchQ(
        model=model,
        param_grid=param_grid,
        protocol=protocol,
        error='mae',  # the error to optimize is the MAE (a quantification-oriented loss)
        refit=False,  # do not retrain the best configuration on the whole labelled set once done
        # raise_errors=False,
        verbose=True  # show information as the process goes on
    ).fit(training)

tend = time()

print(f'model selection ended: best hyper-parameters={model.best_params_}')
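# The best score found during the search should also be available; the attribute name below
# (best_score_) is assumed by analogy with best_params_ and best_model_.
# print(f'best validation MAE={model.best_score_:.5f}')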
model = model.best_model_

# evaluation in terms of MAE
# we use the same evaluation protocol (UPP) on the test set
mae_score = qp.evaluation.evaluate(model, protocol=UPP(test), error_metric='mae')

print(f'MAE={mae_score:.5f}')
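
# Alternative sketch (assuming QuaPy's evaluation_report helper, which returns a pandas DataFrame
# with one row per test sample): several error metrics can be reported at once.
# report = qp.evaluation.evaluation_report(model, protocol=UPP(test), error_metrics=['mae', 'mrae'])
# print(report[['mae', 'mrae']].mean())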
print(f'model selection took {tend-tinit:.1f}s')