This commit is contained in:
Alejandro Moreo Fernandez 2024-09-18 10:33:58 +02:00
parent 6ce5eea4f2
commit 9fb208fe4c
2 changed files with 80 additions and 69 deletions

View File

@ -1,16 +1,16 @@
import os import os
import pickle import pickle
import shutil
import numpy as np import numpy as np
from sklearn.linear_model import LogisticRegression from sklearn.linear_model import LogisticRegression
from os.path import join from os.path import join
import quapy as qp import quapy as qp
from quapy.protocol import UPP from quapy.protocol import UPP
from kdey_devel import KDEyML from kdey_devel import KDEyML
from utils import measuretime
DEBUG = True
DEBUG = False
qp.environ["SAMPLE_SIZE"] = 100 if DEBUG else 500 qp.environ["SAMPLE_SIZE"] = 100 if DEBUG else 500
val_repeats = 100 if DEBUG else 500 val_repeats = 100 if DEBUG else 500
@ -23,20 +23,24 @@ val_choice = {}
bandwidth_range = np.linspace(0.01, 0.20, 20) bandwidth_range = np.linspace(0.01, 0.20, 20)
if DEBUG: if DEBUG:
bandwidth_range = np.linspace(0.01, 0.20, 10) bandwidth_range = np.linspace(0.01, 0.20, 5)
def datasets():
    """Yield the UCI multiclass datasets used in the experiment, one at a time.

    When the module-level ``DEBUG`` flag is set, only the first four dataset
    names are visited and each fetched dataset is shrunk with
    ``reduce(random_state=0)`` before being yielded.
    """
    names = qp.datasets.UCI_MULTICLASS_DATASETS
    selected = names[:4] if DEBUG else names
    for name in selected:
        data = qp.datasets.fetch_UCIMulticlassDataset(name)
        yield data.reduce(random_state=0) if DEBUG else data
def predict_b_modsel(train): @measuretime
tinit = 0 def predict_b_modsel(dataset):
# bandwidth chosen during model selection in validation # bandwidth chosen during model selection in validation
train = dataset.training
train_tr, train_va = train.split_stratified(random_state=0) train_tr, train_va = train.split_stratified(random_state=0)
kdey = KDEyML(random_state=0) kdey = KDEyML(random_state=0)
modsel = qp.model_selection.GridSearchQ( modsel = qp.model_selection.GridSearchQ(
@ -49,74 +53,73 @@ def predict_b_modsel(train):
).fit(train_tr) ).fit(train_tr)
chosen_bandwidth = modsel.best_params_['bandwidth'] chosen_bandwidth = modsel.best_params_['bandwidth']
modsel_choice = float(chosen_bandwidth) modsel_choice = float(chosen_bandwidth)
tend = # kdey.set_params(bandwidth=chosen_bandwidth)
# kdey.fit(train)
# kdey.qua
return modsel_choice return modsel_choice
def in_test_search(dataset, n_jobs=-1):
    """Evaluate KDEyML on the test split for every bandwidth in the grid.

    For each value in the module-level ``bandwidth_range`` a fresh ``KDEyML``
    is fitted on the training split and its MAE is measured over a ``UPP``
    protocol built on the test split. The per-bandwidth jobs are dispatched
    through ``qp.util.parallel``.

    :param dataset: object exposing ``train_test`` and ``name``
    :param n_jobs: forwarded to ``qp.util.parallel``
    :return: a pair ``(results, bandwidth_range)``; ``results`` is whatever
        ``qp.util.parallel`` returns for the jobs (presumably one MAE per
        bandwidth, in grid order -- confirm against quapy)
    """
    training_set, test_set = dataset.train_test
    print(f"testing KDEy in {dataset.name}")

    def experiment_job(bandwidth):
        # one job = fit with a fixed bandwidth, then score it on the test protocol
        quantifier = KDEyML(bandwidth=bandwidth, random_state=0)
        quantifier.fit(training_set)
        mae = qp.evaluation.evaluate(
            quantifier,
            protocol=UPP(test_set, repeats=test_repeats),
            error_metric='mae',
            verbose=True,
        )
        print(f'{bandwidth=}: {mae:.5f}')
        return float(mae)

    return qp.util.parallel(experiment_job, bandwidth_range, n_jobs=n_jobs), bandwidth_range
def plot_bandwidth(dataset_name, test_results, bandwidths, triplet_list_results):
    """Plot test MAE against bandwidth and mark the bandwidth each method chose.

    The figure is written to ``./plots/<dataset_name>.png`` (or to
    ``./plots_debug/`` when the module-level ``DEBUG`` flag is set); the
    directory is created if missing.

    :param dataset_name: used as the plot title and as the output file name
    :param test_results: MAE values, aligned with ``bandwidths``
    :param bandwidths: bandwidth values for the x axis
    :param triplet_list_results: iterable of ``(method_name, method_choice,
        method_time)``; only the name and the chosen bandwidth are drawn
    """
    import matplotlib.pyplot as plt

    print("PLOT", dataset_name)
    print(dataset_name)

    plt.figure(figsize=(8, 6))

    # show test results
    plt.plot(bandwidths, test_results, marker='o')

    # One vertical line per method. axvline does not advance the colour cycle,
    # so a colour is assigned explicitly; counting from C1 keeps the markers
    # distinct from the MAE curve (C0) and from each other in the legend.
    for position, (method_name, method_choice, _) in enumerate(triplet_list_results, start=1):
        plt.axvline(x=method_choice, linestyle='--', color=f'C{position}', label=method_name)

    # axis labels and title
    plt.xlabel('Bandwidth')
    plt.ylabel('MAE')
    plt.title(dataset_name)

    # legend is anchored outside the axes, to the right
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    plt.grid(True)

    plotdir = './plots_debug' if DEBUG else './plots'
    os.makedirs(plotdir, exist_ok=True)
    plt.tight_layout()
    plt.savefig(f'{plotdir}/{dataset_name}.png')
    plt.close()
def error_table(dataset_name, test_results, bandwidth_range, triplet_list_results):
    """Print, per method, how far its chosen bandwidth is from the best test score.

    Output is a tab-separated table with columns ``Method``, ``Choice``,
    ``AE`` and ``Time``. ``AE`` is the absolute difference between the test
    score obtained at the method's chosen bandwidth and the lowest test score
    found on the grid.

    :param dataset_name: accepted for signature symmetry with the other
        reporting helpers; not used in the output
    :param test_results: test scores, aligned with ``bandwidth_range``
    :param bandwidth_range: the grid of bandwidths that was evaluated
    :param triplet_list_results: iterable of ``(method_name, method_choice,
        took)`` where ``took`` is a float number of seconds
    """
    scores = np.asarray(test_results, dtype=float)
    grid = np.asarray(bandwidth_range, dtype=float)
    # Compare scores with scores: the reference is the best score on the grid,
    # not the bandwidth at which it was obtained.
    best_score = scores.min()

    print('Method\tChoice\tAE\tTime')
    for method_name, method_choice, took in triplet_list_results:
        # tolerant lookup, so a choice that went through float round-trips still hits the grid
        matches = np.flatnonzero(np.isclose(grid, method_choice))
        if matches.size:
            method_score = scores[matches[0]]
        else:
            # choice is not on the evaluated grid: fall back to a score of 1
            method_score = 1
        error = float(abs(method_score - best_score))
        print(f'{method_name}\t{method_choice}\t{error}\t{took:.3}s')
for dataset in datasets(): for dataset in datasets():
@ -124,20 +127,25 @@ for dataset in datasets():
print(len(dataset.training)) print(len(dataset.training))
print(len(dataset.test)) print(len(dataset.test))
result_path = f'./results/{dataset.name}/'
if DEBUG: if DEBUG:
result_path = f'./results/debug/{dataset.name}.pkl' result_path = result_path.replace('results', 'results_debug')
else: if os.path.exists(result_path):
result_path = f'./results/{dataset.name}.pkl' shutil.rmtree(result_path)
modsel_choice, dataset_results = qp.util.pickled_resource(result_path, experiment_dataset, dataset) dataset_results, bandwidth_range = qp.util.pickled_resource(join(result_path, 'test.pkl'), in_test_search, dataset)
val_choice[dataset.name] = modsel_choice
test_results[dataset.name] = dataset_results triplet_list_results = []
modsel_choice, modsel_time = qp.util.pickled_resource(join(result_path, 'modsel.pkl'), predict_b_modsel, dataset)
triplet_list_results.append(('modsel', modsel_choice, modsel_time,))
print(f'Dataset = {dataset.name}') print(f'Dataset = {dataset.name}')
print(modsel_choice) print(modsel_choice)
print(dataset_results) print(dataset_results)
plot_bandwidth(val_choice, test_results) plot_bandwidth(dataset.name, dataset_results, bandwidth_range, triplet_list_results)
error_table(dataset.name, dataset_results, bandwidth_range, triplet_list_results)
# time_table(dataset.name, dataset_results, bandwidth_range, triplet_list_results)

View File

@ -4,9 +4,12 @@ from functools import wraps
def measuretime(func):
    """Decorate *func* so that every call also returns its elapsed time.

    The elapsed time, in seconds, is appended as the last element of the
    returned tuple:

    * if *func* returns a tuple ``(a, b, ...)`` the wrapper returns
      ``(a, b, ..., seconds)`` -- the tuple is flattened, not nested;
    * otherwise the wrapper returns ``(result, seconds)``.

    Callers therefore cannot distinguish a function returning the 1-tuple
    ``(x,)`` from one returning ``x``.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is the clock meant for measuring intervals; time.time()
        # is wall-clock time and can jump if the system clock is adjusted.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        time_it_took = time.perf_counter() - start_time
        if isinstance(result, tuple):
            return (*result, time_it_took)
        return result, time_it_took
    return wrapper