diff --git a/KDEy/kdey_devel.py b/KDEy/kdey_devel.py
index 0c8baae..ef16d2a 100644
--- a/KDEy/kdey_devel.py
+++ b/KDEy/kdey_devel.py
@@ -4,13 +4,15 @@ from sklearn.base import BaseEstimator
 from sklearn.neighbors import KernelDensity
 
 import quapy as qp
+from quapy.protocol import UPP
+from quapy.method._kdey import KDEBase
 from quapy.data import LabelledCollection
 from quapy.method.aggregative import AggregativeSoftQuantifier, KDEyML
 import quapy.functional as F
 from sklearn.metrics.pairwise import rbf_kernel
 from scipy import optimize
-
+from tqdm import tqdm
 
 
 epsilon = 1e-10
 
@@ -63,10 +65,6 @@ class KDEyMLauto(KDEyML):
                 current_prevalence, current_bandwidth = self.optim_minimize_both(current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes)
             elif self.optim == 'both_fine':
                 current_prevalence, current_bandwidth = self.optim_minimize_both_fine(current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes)
-            elif self.optim == 'both_fine':
-                current_prevalence, current_bandwidth = self.optim_minimize_both_fine(current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes)
-            # elif self.optim == 'max_likelihood':
-            #     current_prevalence, current_bandwidth = self.optim_minimize_like(current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes)
 
             # check convergence
             prev_convergence = all(np.isclose(previous_prevalence, current_prevalence, atol=0.0001))
@@ -256,3 +254,73 @@ def optim_minimize(loss: Callable, init_prev: np.ndarray, return_loss=False):
     else:
         return r.x
 
+
+
+class KDEyMLauto2(KDEyML):
+
+    def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, random_state=None, reduction=100, max_reduced=500, target='likelihood'):
+        """
+        reduction: number of examples per class for automatically setting the bandwidth
+        """
+        self.classifier = qp._get_classifier(classifier)
+        self.val_split = val_split
+        if bandwidth == 'auto':
+            self.bandwidth = bandwidth
+        else:
+            self.bandwidth = KDEBase._check_bandwidth(bandwidth)
+        self.reduction = reduction
+        self.max_reduced = max_reduced
+        self.random_state = random_state
+        assert target == 'likelihood' or target in qp.error.QUANTIFICATION_ERROR_NAMES, 'unknown target for auto'
+        self.target = target
+
+    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+        if self.bandwidth == 'auto':
+            self.auto_bandwidth_likelihood(classif_predictions)
+        else:
+            self.bandwidth_ = self.bandwidth
+        self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth_)
+        return self
+
+    def auto_bandwidth_likelihood(self, classif_predictions: LabelledCollection):
+        n_classes = classif_predictions.n_classes
+
+        train, val = classif_predictions.split_stratified(train_prop=0.5, random_state=self.random_state)
+
+        if self.reduction is not None:
+            # reduce samples to speed up computation
+            tr_length = min(self.reduction * n_classes, self.max_reduced)
+            if len(train) > tr_length:
+                train = train.sampling(tr_length)
+
+        best_band = None
+        best_loss_val = None
+        init_prev = np.full(fill_value=1 / n_classes, shape=(n_classes,))
+        for bandwidth in np.logspace(-4, np.log10(0.2), 20):
+            mix_densities = self.get_mixture_components(*train.Xy, train.classes_, bandwidth)
+
+            repeats = 25
+            loss_accum = 0
+            prot = UPP(val, sample_size=self.reduction, repeats=repeats, random_state=self.random_state)
+            for (sample, prev) in tqdm(prot(), total=repeats):
+                test_densities = [self.pdf(kde_i, sample) for kde_i in mix_densities]
+
+                def neg_loglikelihood_prev_(prev):
+                    test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
+                    test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
+                    return -np.sum(test_loglikelihood)
+
+                if self.target == 'likelihood':
+                    loss_fn = neg_loglikelihood_prev_
+                else:
+                    loss_fn = lambda prev_hat: qp.error.from_name(self.target)(prev, prev_hat)
+
+                pred_prev, loss_val = optim_minimize(loss_fn, init_prev, return_loss=True)
+                loss_accum += loss_val
+
+            if best_loss_val is None or loss_accum < best_loss_val:
+                best_loss_val = loss_accum
+                best_band = bandwidth
+
+        print(f'found bandwidth={best_band:.4f} (loss_val={best_loss_val:.5f})')
+        self.bandwidth_ = best_band
diff --git a/KDEy/quantification_evaluation.py b/KDEy/quantification_evaluation.py
index ecdde23..fd98dbd 100644
--- a/KDEy/quantification_evaluation.py
+++ b/KDEy/quantification_evaluation.py
@@ -7,7 +7,7 @@ import numpy as np
 from sklearn.linear_model import LogisticRegression
 
 import quapy as qp
-from KDEy.kdey_devel import KDEyMLauto
+from KDEy.kdey_devel import KDEyMLauto, KDEyMLauto2
 from quapy.method.aggregative import PACC, EMQ, KDEyML
 from quapy.model_selection import GridSearchQ
 from quapy.protocol import UPP
@@ -22,8 +22,8 @@ def newLR():
 
 # typical hyperparameters explored for Logistic Regression
 logreg_grid = {
-    'C': [1],
-    'class_weight': [None]
+    'C': np.logspace(-3,3,7),
+    'class_weight': [None, 'balanced']
 }
 
 
@@ -34,7 +34,12 @@ def wrap_hyper(classifier_hyper_grid: dict):
 
 METHODS = [
     ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)),
     ('EMQ', EMQ(newLR()), wrap_hyper(logreg_grid)),
-    ('KDEy-ML', KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.logspace(-3, 0.5, 50)}}),
+    ('KDEy-ML', KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.logspace(-4, np.log10(0.2), 20)}}),
+    ('KDEy-ML-scott', KDEyML(newLR(), bandwidth='scott'), wrap_hyper(logreg_grid)),
+    ('KDEy-ML-silver', KDEyML(newLR(), bandwidth='silverman'), wrap_hyper(logreg_grid)),
+    ('KDEy-ML-autoLike', KDEyMLauto2(newLR(), bandwidth='auto', target='likelihood'), wrap_hyper(logreg_grid)),
+    ('KDEy-ML-autoAE', KDEyMLauto2(newLR(), bandwidth='auto', target='mae'), wrap_hyper(logreg_grid)),
+    ('KDEy-ML-autoRAE', KDEyMLauto2(newLR(), bandwidth='auto', target='mrae'), wrap_hyper(logreg_grid)),
 ]
 
@@ -49,8 +54,8 @@ TRANSDUCTIVE_METHODS = [
     # ('TKDEy-MLboth', KDEyMLauto(newLR(), optim='both'), None),
     # ('TKDEy-MLbothfine', KDEyMLauto(newLR(), optim='both_fine'), None),
     # ('TKDEy-ML2', KDEyMLauto(newLR()), None),
-    ('TKDEy-MLike', KDEyMLauto(newLR(), optim='max_likelihood'), None),
-    ('TKDEy-MLike2', KDEyMLauto(newLR(), optim='max_likelihood2'), None),
+    # ('TKDEy-MLike', KDEyMLauto(newLR(), optim='max_likelihood'), None),
+    # ('TKDEy-MLike2', KDEyMLauto(newLR(), optim='max_likelihood2'), None),
     #('TKDEy-ML3', KDEyMLauto(newLR()), None),
     #('TKDEy-ML4', KDEyMLauto(newLR()), None),
 ]
@@ -111,23 +116,27 @@ if __name__ == '__main__':
 
         transductive_names = [name for (name, *_) in TRANSDUCTIVE_METHODS]
         if method_name not in transductive_names:
-            # model selection (train)
-            train, val = train.split_stratified(random_state=SEED)
-            protocol = UPP(val, repeats=n_bags_val)
-            modsel = GridSearchQ(
-                quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error='mae'
-            )
-            t_init = time()
-            try:
-                modsel.fit(train)
-                print(f'best params {modsel.best_params_}')
-                print(f'best score {modsel.best_score_}')
-                quantifier = modsel.best_model()
-            except:
-                print('something went wrong... trying to fit the default model')
+            if len(param_grid) == 0:
+                t_init = time()
                 quantifier.fit(train)
-            train_time = time() - t_init
-
+                train_time = time() - t_init
+            else:
+                # model selection (train)
+                train, val = train.split_stratified(random_state=SEED)
+                protocol = UPP(val, repeats=n_bags_val)
+                modsel = GridSearchQ(
+                    quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error='mae'
+                )
+                t_init = time()
+                try:
+                    modsel.fit(train)
+                    print(f'best params {modsel.best_params_}')
+                    print(f'best score {modsel.best_score_}')
+                    quantifier = modsel.best_model()
+                except:
+                    print('something went wrong... trying to fit the default model')
+                    quantifier.fit(train)
+                train_time = time() - t_init
         else:  # transductive
             t_init = time()
 
diff --git a/KDEy/quantification_evaluation_debug.py b/KDEy/quantification_evaluation_debug.py
index 3b4e917..ed2e438 100644
--- a/KDEy/quantification_evaluation_debug.py
+++ b/KDEy/quantification_evaluation_debug.py
@@ -26,8 +26,8 @@ qp.environ['SAMPLE_SIZE'] = SAMPLE_SIZE
 epsilon = 1e-10
 
 # n_bags_test = 2
-DATASETS = [qp.datasets.UCI_MULTICLASS_DATASETS[21]]
-# DATASETS = qp.datasets.UCI_MULTICLASS_DATASETS
+# DATASETS = [qp.datasets.UCI_MULTICLASS_DATASETS[21]]
+DATASETS = qp.datasets.UCI_MULTICLASS_DATASETS
 for i, dataset in enumerate(DATASETS):
     data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
     n_classes = data.n_classes
@@ -99,8 +99,8 @@ for i, dataset in enumerate(DATASETS):
 
        # plot the ae_error, rae_error, and kld_error series on the first Y axis
        ax1.plot(xaxis, ae_error, label='AE Error', marker='o', color='b')
-       ax1.plot(xaxis, rae_error, label='RAE Error', marker='s', color='g')
-       ax1.plot(xaxis, kld_error, label='KLD Error', marker='^', color='r')
+       # ax1.plot(xaxis, rae_error, label='RAE Error', marker='s', color='g')
+       # ax1.plot(xaxis, kld_error, label='KLD Error', marker='^', color='r')
        ax1.plot(xaxis, mse_error, label='MSE Error', marker='^', color='c')
 
        ax1.set_xscale('log')
@@ -124,7 +124,7 @@ for i, dataset in enumerate(DATASETS):
 
        plt.title('Error Metrics vs Bandwidth')
        # plt.show()
        os.makedirs('./plots/likelihood/', exist_ok=True)
-       plt.savefig(f'./plots/likelihood/fig{it}.png')
+       plt.savefig(f'./plots/likelihood/{dataset}-fig{it}.png')
        plt.close()
 
diff --git a/quapy/method/_kdey.py b/quapy/method/_kdey.py
index f2b9465..fa75197 100644
--- a/quapy/method/_kdey.py
+++ b/quapy/method/_kdey.py
@@ -70,7 +70,7 @@ class KDEBase:
             if selX.size == 0:
                 selX = [F.uniform_prevalence(len(classes))]
             class_cond_X.append(selX)
-        if isinstance(bandwidth, float):
+        if isinstance(bandwidth, float) or isinstance(bandwidth, str):
             bandwidth = np.full(fill_value=bandwidth, shape=(len(classes),))
         return [self.get_kde_function(X_cond_yi, band_i) for X_cond_yi, band_i in zip(class_cond_X, bandwidth)]
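
Usage sketch: a minimal end-to-end run of the new KDEyMLauto2 quantifier with bandwidth='auto', assuming the quapy UCI multiclass loader already used in quantification_evaluation.py; the 'dry-bean' dataset name and the LogisticRegression settings are illustrative only, not part of the patch.

    from sklearn.linear_model import LogisticRegression
    import quapy as qp
    from KDEy.kdey_devel import KDEyMLauto2

    # some quapy utilities expect SAMPLE_SIZE to be set in the environment
    qp.environ['SAMPLE_SIZE'] = 500

    # illustrative dataset; any multiclass dataset exposing train/test splits works
    data = qp.datasets.fetch_UCIMulticlassDataset('dry-bean')
    train, test = data.train_test

    # bandwidth='auto' scans np.logspace(-4, log10(0.2), 20) on a held-out split of
    # the training posteriors, keeping the bandwidth that minimizes the chosen
    # target ('likelihood' here; 'mae' and 'mrae' are also accepted)
    quantifier = KDEyMLauto2(LogisticRegression(), bandwidth='auto', target='likelihood')
    quantifier.fit(train)
    estim_prev = quantifier.quantify(test.X)
    print(estim_prev)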