switch

2024-09-18 10:33:58 +02:00 · 2024-09-18 10:33:58 +02:00 · 9fb208fe4c
parent 6ce5eea4f2
commit 9fb208fe4c
2 changed files with 80 additions and 69 deletions
--- a/KDEy/experiments.py
+++ b/KDEy/experiments.py
@@ -1,16 +1,16 @@
 import os
 import pickle
-
+import shutil
 import numpy as np
 from sklearn.linear_model import LogisticRegression
 from os.path import join
 import quapy as qp
 from quapy.protocol import UPP
 from kdey_devel import KDEyML
+from utils import measuretime


-
-DEBUG = False
+DEBUG = True

 qp.environ["SAMPLE_SIZE"] = 100 if DEBUG else 500
 val_repeats  = 100 if DEBUG else 500
@@ -23,20 +23,24 @@ val_choice = {}

 bandwidth_range = np.linspace(0.01, 0.20, 20)
 if DEBUG:
-    bandwidth_range = np.linspace(0.01, 0.20, 10)
+    bandwidth_range = np.linspace(0.01, 0.20, 5)


 def datasets():
-    for dataset_name in qp.datasets.UCI_MULTICLASS_DATASETS:
+    dataset_list = qp.datasets.UCI_MULTICLASS_DATASETS
+    if DEBUG:
+        dataset_list = dataset_list[:4]
+    for dataset_name in dataset_list:
        dataset = qp.datasets.fetch_UCIMulticlassDataset(dataset_name)
        if DEBUG:
            dataset = dataset.reduce(random_state=0)
        yield dataset


-def predict_b_modsel(train):
-    tinit = 0
+@measuretime
+def predict_b_modsel(dataset):
    # bandwidth chosen during model selection in validation
+    train = dataset.training
    train_tr, train_va = train.split_stratified(random_state=0)
    kdey = KDEyML(random_state=0)
    modsel = qp.model_selection.GridSearchQ(
@@ -49,74 +53,73 @@ def predict_b_modsel(train):
    ).fit(train_tr)
    chosen_bandwidth = modsel.best_params_['bandwidth']
    modsel_choice = float(chosen_bandwidth)
-    tend =
+    # kdey.set_params(bandwidth=chosen_bandwidth)
+    # kdey.fit(train)
+    # kdey.qua
    return modsel_choice

-def experiment_dataset(dataset):
+
+def in_test_search(dataset, n_jobs=-1):
    train, test = dataset.train_test
-    test_gen = UPP(test, repeats=test_repeats)

-    # bandwidth chosen during model selection in validation
-    train_tr, train_va = train.split_stratified(random_state=0)
-    kdey = KDEyML(random_state=0)
-    modsel = qp.model_selection.GridSearchQ(
-        model=kdey,
-        param_grid={'bandwidth': bandwidth_range},
-        protocol=UPP(train_va, repeats=val_repeats),
-        refit=False,
-        n_jobs=-1,
-        verbose=True
-    ).fit(train_tr)
-    chosen_bandwidth = modsel.best_params_['bandwidth']
-    modsel_choice = float(chosen_bandwidth)
-
-    # results in test
    print(f"testing KDEy in {dataset.name}")
-    dataset_results = []
-    for b in bandwidth_range:
-        kdey = KDEyML(bandwidth=b, random_state=0)
+
+    def experiment_job(bandwidth):
+        kdey = KDEyML(bandwidth=bandwidth, random_state=0)
        kdey.fit(train)
-
+        test_gen = UPP(test, repeats=test_repeats)
        mae = qp.evaluation.evaluate(kdey, protocol=test_gen, error_metric='mae', verbose=True)
-        print(f'bandwidth={b}: {mae:.5f}')
-        dataset_results.append((float(b), float(mae)))
+        print(f'{bandwidth=}: {mae:.5f}')
+        return float(mae)

-    return modsel_choice, dataset_results
+    dataset_results = qp.util.parallel(experiment_job, bandwidth_range, n_jobs=n_jobs)
+    return dataset_results, bandwidth_range

-def plot_bandwidth(val_choice, test_results):
-    for dataset_name in val_choice.keys():
-        import matplotlib.pyplot as plt

-        bandwidths, results = zip(*test_results[dataset_name])
+def plot_bandwidth(dataset_name, test_results, bandwidths, triplet_list_results):
+    import matplotlib.pyplot as plt

-        print(dataset_name)
-        print(bandwidths)
-        print(results)
+    print("PLOT", dataset_name)
+    print(dataset_name)

-        # Crear la gráfica
-        plt.figure(figsize=(8, 6))
+    plt.figure(figsize=(8, 6))

-        # Graficar los puntos de datos
-        plt.plot(bandwidths, results, marker='o')
+    # show test results
+    plt.plot(bandwidths, test_results, marker='o')

-        # Agregar la línea vertical en bandwidth_chosen
-        plt.axvline(x=val_choice[dataset_name], color='r', linestyle='--', label=f'bandwidth mod-sel: {val_choice[dataset_name]}')
+    for (method_name, method_choice, method_time) in triplet_list_results:
+        plt.axvline(x=method_choice, linestyle='--', label=method_name)

-        # Agregar etiquetas y título
-        plt.xlabel('Bandwidth')
-        plt.ylabel('MAE')
-        plt.title(dataset_name)
+    # Agregar etiquetas y título
+    plt.xlabel('Bandwidth')
+    plt.ylabel('MAE')
+    plt.title(dataset_name)

-        # Mostrar la leyenda
-        plt.legend()
+    # Mostrar la leyenda
+    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))

-        # Mostrar la gráfica
-        plt.grid(True)
-        # plt.show()
-        os.makedirs('./plots', exist_ok=True)
-        plt.savefig(f'./plots/{dataset_name}.png')
-        plt.close()
+    # Mostrar la gráfica
+    plt.grid(True)

+    plotdir = './plots'
+    if DEBUG:
+        plotdir = './plots_debug'
+    os.makedirs(plotdir, exist_ok=True)
+    plt.tight_layout()
+    plt.savefig(f'{plotdir}/{dataset_name}.png')
+    plt.close()
+
+def error_table(dataset_name, test_results, bandwidth_range, triplet_list_results):
+    best_bandwidth = bandwidth_range[np.argmin(test_results)]
+    print(f'Method\tChoice\tAE\tTime')
+    for method_name, method_choice, took in triplet_list_results:
+        if method_choice in bandwidth_range:
+            index = np.where(bandwidth_range == method_choice)[0][0]
+            method_score = test_results[index]
+        else:
+            method_score = 1
+        error = np.abs(best_bandwidth-method_score)
+        print(f'{method_name}\t{method_choice}\t{error}\t{took:.3}s')


 for dataset in datasets():
@@ -124,20 +127,25 @@ for dataset in datasets():
    print(len(dataset.training))
    print(len(dataset.test))

+    result_path = f'./results/{dataset.name}/'
    if DEBUG:
-        result_path = f'./results/debug/{dataset.name}.pkl'
-    else:
-        result_path = f'./results/{dataset.name}.pkl'
+        result_path = result_path.replace('results', 'results_debug')
+        if os.path.exists(result_path):
+            shutil.rmtree(result_path)

-    modsel_choice, dataset_results = qp.util.pickled_resource(result_path, experiment_dataset, dataset)
-    val_choice[dataset.name] = modsel_choice
-    test_results[dataset.name] = dataset_results
+    dataset_results, bandwidth_range = qp.util.pickled_resource(join(result_path, 'test.pkl'), in_test_search, dataset)
+
+    triplet_list_results = []
+    modsel_choice, modsel_time = qp.util.pickled_resource(join(result_path, 'modsel.pkl'), predict_b_modsel, dataset)
+    triplet_list_results.append(('modsel', modsel_choice, modsel_time,))

    print(f'Dataset = {dataset.name}')
    print(modsel_choice)
    print(dataset_results)

-plot_bandwidth(val_choice, test_results)
+    plot_bandwidth(dataset.name, dataset_results, bandwidth_range, triplet_list_results)
+    error_table(dataset.name, dataset_results, bandwidth_range, triplet_list_results)
+    # time_table(dataset.name, dataset_results, bandwidth_range, triplet_list_results)



--- a/KDEy/utils.py
+++ b/KDEy/utils.py
@@ -4,9 +4,12 @@ from functools import wraps
 def measuretime(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
-        start_time = time.time()  # inicia el contador de tiempo
-        result = func(*args, **kwargs)  # ejecuta la función original
-        end_time = time.time()  # finaliza el contador de tiempo
-        time_it_took = end_time - start_time  # calcula el tiempo total
-        return result, time_it_took  # devuelve el resultado y el tiempo
+        start_time = time.time()
+        result = func(*args, **kwargs)
+        end_time = time.time()
+        time_it_took = end_time - start_time
+        if isinstance(result, tuple):
+            return (*result, time_it_took)
+        else:
+            return result, time_it_took
    return wrapper