import quapy as qp
from quapy.protocol import APP
from quapy.method.aggregative import DistributionMatching
from sklearn.linear_model import LogisticRegression
import numpy as np

"""
In this example, we show how to perform model selection on a DistributionMatching quantifier.
"""

# Quantifier whose hyper-parameters are going to be tuned: a distribution-matching
# method built on top of a logistic-regression classifier.
base_quantifier = DistributionMatching(LogisticRegression())

# Global QuaPy settings used by the rest of the script.
qp.environ['SAMPLE_SIZE'] = 100
qp.environ['N_JOBS'] = -1

# IMDB reviews represented as tf-idf vectors, keeping terms with min_df=5.
reviews = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5)
full_training, test = reviews.train_test

# Model selection works as follows: each combination of hyper-parameters
# configures the quantifier, which is then confronted against a series of
# samples produced by a sample generation protocol. Here the protocol is the
# artificial-prevalence protocol (APP), which generates samples whose
# prevalence values cover the entire range of a grid (e.g., [0, 0.1, 0.2, ..., 1]).
# A stratified split keeps 70% of the labelled data for training; the remaining
# 30% is the validation set from which APP draws its samples.
training, validation = full_training.split_stratified(train_prop=0.7)
protocol = APP(validation)

# Two kinds of hyper-parameters are explored:
#   * a classification-dependent one (the 'C' hyper-parameter of
#     LogisticRegression), and
#   * a quantification-dependent one (the number of bins of the
#     DistributionMatching quantifier).
# The "classifier__" prefix tells the quantifier that a hyper-parameter
# belongs to its underlying classifier.
param_grid = {
    'classifier__C': np.logspace(start=-3, stop=3, num=7),
    'nbins': [8, 16, 32, 64],
}

# Set up the grid search:
#   error='mae'   -> the error to optimize is the MAE (a quantification-oriented loss)
#   refit=True    -> retrain on the whole labelled set once the search is done
#   verbose=True  -> show information as the process goes on
grid_search = qp.model_selection.GridSearchQ(
    model=base_quantifier,
    param_grid=param_grid,
    protocol=protocol,
    error='mae',
    refit=True,
    verbose=True,
)

# The object returned by fit exposes the outcome of the search.
fitted_search = grid_search.fit(training)

print(f'model selection ended: best hyper-parameters={fitted_search.best_params_}')
model = fitted_search.best_model_

# Evaluate the selected quantifier in terms of MAE, applying the same kind of
# protocol (APP) to the test set this time.
test_protocol = APP(test)
mae_score = qp.evaluation.evaluate(model, protocol=test_protocol, error_metric='mae')

print(f'MAE={mae_score:.5f}')