cleaning last experiments for report

This commit is contained in:
Alejandro Moreo Fernandez 2024-10-17 12:28:24 +02:00
parent 9aecdad66f
commit 10595246a9
6 changed files with 203 additions and 39 deletions

View File

@ -74,12 +74,11 @@ def plot(xaxis, metrics_measurements, metrics_names, suffix):
plt.close() plt.close()
def plot_stack(xaxis, metrics_measurements, metrics_names, suffix): def plot_stack(xaxis, metrics_measurements, metrics_names, figname):
# Crear la figura y los ejes (4 bloques verticales) n_measures = len(metrics_measurements)//2
fig, axs = plt.subplots(4, 1, figsize=(8, 12))
x = xaxis fig, axs = plt.subplots(n_measures, 1, figsize=(8, 3*n_measures))
indexes = np.arange(len(metrics_measurements)) indexes = np.arange(len(metrics_measurements))
axs_idx = 0 axs_idx = 0
@ -105,6 +104,9 @@ def plot_stack(xaxis, metrics_measurements, metrics_names, suffix):
# axs[axs_idx].set_title(f'{metric_te_name} and {metric_tr_name}') # axs[axs_idx].set_title(f'{metric_te_name} and {metric_tr_name}')
axs[axs_idx].legend(loc='lower right') axs[axs_idx].legend(loc='lower right')
axs[axs_idx].set_xscale('log')
if axs_idx==0:
axs[axs_idx].set_title(dataset)
if axs_idx < len(indexes)//2 -1: if axs_idx < len(indexes)//2 -1:
axs[axs_idx].set_xticks([]) axs[axs_idx].set_xticks([])
@ -120,7 +122,7 @@ def plot_stack(xaxis, metrics_measurements, metrics_names, suffix):
# plt.show() # plt.show()
os.makedirs('./plots/likelihood/', exist_ok=True) os.makedirs('./plots/likelihood/', exist_ok=True)
plt.savefig(f'./plots/likelihood/{dataset}-fig{suffix}.png') plt.savefig(f'./plots/likelihood/{figname}.png')
plt.close() plt.close()
@ -199,7 +201,7 @@ qp.environ['SAMPLE_SIZE'] = SAMPLE_SIZE
show_ae = True show_ae = True
show_rae = True show_rae = True
show_mse = False show_mse = False
show_kld = True show_kld = False
normalize = True normalize = True
epsilon = 1e-10 epsilon = 1e-10
@ -259,7 +261,7 @@ for i, dataset in enumerate(tqdm(DATASETS, desc='processing datasets', total=len
# measurement_names.append('NLL(te)') # measurement_names.append('NLL(te)')
# measurement_names.append('NLL(tr)') # measurement_names.append('NLL(tr)')
# plot(xaxis, measurements, measurement_names, suffix='AVEtr') # plot(xaxis, measurements, measurement_names, suffix='AVEtr')
plot_stack(xaxis, measurements, measurement_names, suffix='AVEtr') plot_stack(xaxis, measurements, measurement_names, figname=f'{i}.png')

163
KDEy/gen_tables.py Normal file
View File

@ -0,0 +1,163 @@
import pickle
import os
from time import time
from collections import defaultdict
import numpy as np
from sklearn.linear_model import LogisticRegression
import quapy as qp
from KDEy.kdey_devel import KDEyMLauto, KDEyMLauto2
from quapy.method.aggregative import PACC, EMQ, KDEyML
from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP
from pathlib import Path
from result_table.src.table import Table
# Seed constant (presumably for reproducibility).
# NOTE(review): not referenced anywhere in the visible part of this script -- confirm it is still needed.
SEED = 1
def newLR():
    """Create a new logistic-regression classifier.

    A separate instance is built on every call, with ``max_iter`` set to 3000.

    :return: a ``LogisticRegression`` instance
    """
    classifier = LogisticRegression(max_iter=3000)
    return classifier
# Hyperparameter grid typically explored for Logistic Regression:
# nine log-spaced values of C between 1e-4 and 1e4, with and without
# balanced class weights.
logreg_grid = dict(
    C=np.logspace(-4, 4, 9),
    class_weight=[None, 'balanced'],
)
def wrap_hyper(classifier_hyper_grid: dict):
    """Prefix every key of a classifier grid with ``'classifier__'``.

    :param classifier_hyper_grid: dict mapping a hyperparameter name to its
        candidate values
    :return: a new dict with the same values (same order), where each key
        is the original key preceded by ``'classifier__'``
    """
    wrapped = {}
    for name, values in classifier_hyper_grid.items():
        wrapped['classifier__' + name] = values
    return wrapped
# (name, quantifier, hyperparameter grid) triples for the inductive methods.
# Entries that are currently disabled are kept here so they can be restored:
# ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)),
# ('EMQ', EMQ(newLR()), wrap_hyper(logreg_grid)),
# ('KDEy-MLred', KDEyMLred(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.logspace(-4, np.log10(0.2), 20)}}),
METHODS = [
    # original KDEy: the bandwidth is part of the explored grid
    ('KDEy', KDEyML(newLR()), {**wrap_hyper(logreg_grid), 'bandwidth': np.logspace(-4, np.log10(0.2), 20)}),
    # bandwidth given by name ('scott' / 'silverman')
    ('KDEy-scott', KDEyML(newLR(), bandwidth='scott'), wrap_hyper(logreg_grid)),
    ('KDEy-silver', KDEyML(newLR(), bandwidth='silverman'), wrap_hyper(logreg_grid)),
]
# Automatic-bandwidth variants: for each target, first the 'grid' search and
# then the 'optim' search (the latter marked with a trailing '+' in the name).
METHODS += [
    (
        f'KDEy-{tag}{mark}',
        KDEyMLauto2(newLR(), bandwidth='auto', target=target, search=search),
        wrap_hyper(logreg_grid),
    )
    for tag, target in (('NLL', 'likelihood'), ('AE', 'mae'), ('RAE', 'mrae'))
    for mark, search in (('', 'grid'), ('+', 'optim'))
]
"""
TKDEyML optimized first the bandwidth (init 0.05) and then the prevalence (init uniform)
TKDEyML2 optimized first the prevalence (init uniform) and then the bandwidth (init 0.05)
TKDEyML3 optimized first the prevalence (init uniform) and then the bandwidth (init 0.1)
TKDEyML4 is like ML2 but with at most 5 iterations per optimization
"""
# (name, quantifier, hyperparameter grid) triples for the transductive methods.
# Every entry is currently disabled, so the list is empty; the disabled
# entries are kept here so they can be restored:
# ('TKDEy-ML', KDEyMLauto(newLR()), None),
# ('TKDEy-both', KDEyMLauto(newLR(), optim='both'), None),
# ('TKDEy-bothfine', KDEyMLauto(newLR(), optim='both_fine'), None),
# ('TKDEy-two', KDEyMLauto(newLR(), optim='two_steps'), None),
# ('TKDEy-MLike', KDEyMLauto(newLR(), optim='max_likelihood'), None),
# ('TKDEy-MLike2', KDEyMLauto(newLR(), optim='max_likelihood2'), None),
# ('TKDEy-ML3', KDEyMLauto(newLR()), None),
# ('TKDEy-ML4', KDEyMLauto(newLR()), None),
TRANSDUCTIVE_METHODS = []
def show_results(result_path, tables, tables_path='./tables/main.pdf'):
    """Print one pivot table per reported column and render the tables to PDF.

    :param result_path: path of the tab-separated results file, without the
        ``.csv`` extension
    :param tables: dict whose values are the tables passed to ``Table.LatexPDF``
    :param tables_path: path of the PDF to generate; its parent directory is
        created when missing
    """
    import pandas as pd

    results = pd.read_csv(result_path + '.csv', sep='\t')

    # lift pandas' display limits so the pivots are printed in full
    # (the width option adjusts the maximum line width)
    display_options = (
        ('display.max_columns', None),
        ('display.max_rows', None),
        ('display.width', 1000),
    )
    for option, value in display_options:
        pd.set_option(option, value)

    # Dataset x Method pivot (with margins) for each column of the CSV
    for column in ('MAE', 'MRAE', 'KLD', 'TR-TIME', 'TE-TIME'):
        pivot = results.pivot_table(index='Dataset', columns="Method", values=[column], margins=True)
        print(pivot)

    os.makedirs(Path(tables_path).parent, exist_ok=True)

    # internal method names -> names shown in the generated tables
    display_names = {
        'KDEy': 'KDEy(orig)',
        'KDEy-scott': 'Scott',
        'KDEy-silver': 'Silver',
        'KDEy-NLL': 'NLL(grid)',
        'KDEy-NLL+': 'NLL(search)',
        'KDEy-AE': 'AE(grid)',
        'KDEy-AE+': 'AE(search)',
        'KDEy-RAE': 'RAE(grid)',
        'KDEy-RAE+': 'RAE(search)',
    }
    Table.LatexPDF(tables_path, list(tables.values()), method_replace=display_names, verbose=True, clean=False)
def collect_results(method_name, tables):
    """Collect the stored reports of one method into the tables and the global CSV.

    For every dataset in ``qp.datasets.UCI_MULTICLASS_DATASETS`` the report file
    ``<method_name>_<dataset>.dataframe`` is looked up in the directory of the
    global results file. Datasets without a report file are skipped. For the
    others, the report is loaded, the selected metrics are added to ``tables``,
    and one row with the report means is appended to
    ``global_result_path + '.csv'``.

    Reads the module-level name ``global_result_path``.

    :param method_name: name of the method whose reports are collected
    :param tables: dict mapping a metric name (a column of the report) to the
        table that accumulates it
    """
    print('Init method', method_name)

    with open(global_result_path + '.csv', 'at') as out:
        reports_dir = Path(global_result_path).parent
        for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
            print('init', dataset)
            # run_experiment(global_result_path, method_name, quantifier, param_grid, dataset)
            local_result_path = os.path.join(reports_dir, method_name + '_' + dataset + '.dataframe')
            if not os.path.exists(local_result_path):
                # nothing stored for this (method, dataset) pair
                continue

            print(f'result file {local_result_path} already exist; skipping')
            report = qp.util.load_report(local_result_path)

            for metric, table in tables.items():
                if metric in ('tr_time', 'te_time'):
                    # timings are reported for every method
                    wanted = True
                elif metric == 'mrae':
                    # methods with '-AE' in their name are left out of the MRAE table
                    wanted = '-AE' not in method_name
                elif metric == 'mae':
                    # methods with '-RAE' in their name are left out of the MAE table
                    wanted = '-RAE' not in method_name
                else:
                    wanted = False
                if wanted:
                    table.add(benchmark=dataset, method=method_name, v=report[metric])

            means = report.mean(numeric_only=True)
            row = (
                method_name,
                dataset,
                f'{means["mae"]:.5f}',
                f'{means["mrae"]:.5f}',
                f'{means["kld"]:.5f}',
                f'{means["tr_time"]:.3f}',
                f'{means["te_time"]:.3f}',
            )
            out.write('\t'.join(row) + '\n')
            out.flush()
if __name__ == '__main__':
    # Script entry point: rebuild the global CSV from the stored reports,
    # then print the pivots and render the tables.
    qp.environ['SAMPLE_SIZE'] = 500
    qp.environ['N_JOBS'] = -1
    n_bags_val = 100
    n_bags_test = 500

    result_dir = 'results_quantification/ucimulti'
    os.makedirs(result_dir, exist_ok=True)
    global_result_path = f'{result_dir}/allmethods'

    # one table per reported metric; the 'te_time' table is currently disabled
    tables = {
        metric: Table('inductive-' + metric.replace('_', '-'))
        for metric in ('mae', 'mrae', 'tr_time')
    }
    tables['tr_time'].format.show_std = False
    # tables['te_time'].format.show_std = False

    # start the global CSV afresh with only its header; it is closed here
    # before collect_results re-opens it in append mode
    header = ('Method', 'Dataset', 'MAE', 'MRAE', 'KLD', 'TR-TIME', 'TE-TIME')
    with open(global_result_path + '.csv', 'wt') as csv:
        csv.write('\t'.join(header) + '\n')

    for method_name, _, _ in METHODS + TRANSDUCTIVE_METHODS:
        collect_results(method_name, tables)

    show_results(global_result_path, tables)

View File

@ -40,7 +40,7 @@ class KDEyMLauto(KDEyML):
current_bandwidth = 0.05 current_bandwidth = 0.05
if self.optim == 'both_fine': if self.optim == 'both_fine':
current_bandwidth = np.full(fill_value=current_bandwidth, shape=(n_classes,)) current_bandwidth = np.full(fill_value=current_bandwidth, shape=(n_classes,))
current_prevalence = np.full(fill_value=1 / n_classes, shape=(n_classes,)) current_prevalence = F.uniform_prevalence(n_classes=n_classes)
if self.optim == 'max_likelihood': if self.optim == 'max_likelihood':
current_prevalence, current_bandwidth = self.optim_minimize_like(tr_posteriors, tr_y, te_posteriors, classes, grid=True) current_prevalence, current_bandwidth = self.optim_minimize_like(tr_posteriors, tr_y, te_posteriors, classes, grid=True)
@ -107,9 +107,9 @@ class KDEyMLauto(KDEyML):
# bounds = [(0.00001, 0.2)] # bounds = [(0.00001, 0.2)]
# r = optimize.minimize(neg_loglikelihood_bandwidth, x0=[current_bandwidth], method='SLSQP', bounds=bounds) # r = optimize.minimize(neg_loglikelihood_bandwidth, x0=[current_bandwidth], method='SLSQP', bounds=bounds)
r = optimize.minimize_scalar(neg_loglikelihood_bandwidth, bounds=(0.00001, 0.2)) r = optimize.minimize_scalar(neg_loglikelihood_bandwidth, bounds=(0.0001, 0.2), options={'xatol': 0.005})
# print(f'iterations-bandwidth={r.nit}') # print(f'iterations-bandwidth={r.nit}')
assert r.success, f'Process did not converge! {r.message}' # assert r.success, f'Process did not converge! {r.message}'
return r.x return r.x
def optim_minimize_both(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes): def optim_minimize_both(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes):
@ -128,7 +128,7 @@ class KDEyMLauto(KDEyML):
prevalence_bandwidth = np.append(current_prev, current_bandwidth) prevalence_bandwidth = np.append(current_prev, current_bandwidth)
r = optimize.minimize(neg_loglikelihood_bandwidth, x0=prevalence_bandwidth, method='SLSQP', bounds=bounds, constraints=constraints) r = optimize.minimize(neg_loglikelihood_bandwidth, x0=prevalence_bandwidth, method='SLSQP', bounds=bounds, constraints=constraints)
print(f'iterations-both={r.nit}') print(f'iterations-both={r.nit}')
assert r.success, 'Process did not converge!' # assert r.success, 'Process did not converge!'
prev_band = r.x prev_band = r.x
current_prevalence = prev_band[:-1] current_prevalence = prev_band[:-1]
current_bandwidth = prev_band[-1] current_bandwidth = prev_band[-1]
@ -145,12 +145,12 @@ class KDEyMLauto(KDEyML):
test_loglikelihood = np.log(test_mixture_likelihood + epsilon) test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
return -np.sum(test_loglikelihood) return -np.sum(test_loglikelihood)
bounds = [(0, 1) for _ in range(n_classes)] + [(0.00001, 1) for _ in range(n_classes)] bounds = [(0, 1) for _ in range(n_classes)] + [(0.0001, 0.2) for _ in range(n_classes)]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x[:n_classes])}) constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x[:n_classes])})
prevalence_bandwidth = np.concatenate((current_prev, current_bandwidth)) prevalence_bandwidth = np.concatenate((current_prev, current_bandwidth))
r = optimize.minimize(neg_loglikelihood_bandwidth, x0=prevalence_bandwidth, method='SLSQP', bounds=bounds, constraints=constraints) r = optimize.minimize(neg_loglikelihood_bandwidth, x0=prevalence_bandwidth, method='SLSQP', bounds=bounds, constraints=constraints)
print(f'iterations-both-fine={r.nit}') print(f'iterations-both-fine={r.nit}')
assert r.success, 'Process did not converge!' # assert r.success, 'Process did not converge!'
prev_band = r.x prev_band = r.x
current_prevalence = prev_band[:n_classes] current_prevalence = prev_band[:n_classes]
current_bandwidth = prev_band[n_classes:] current_bandwidth = prev_band[n_classes:]
@ -198,7 +198,7 @@ class KDEyMLauto(KDEyML):
best_like = None best_like = None
best_prev = None best_prev = None
init_prev = np.full(fill_value=1 / n_classes, shape=(n_classes,)) init_prev = np.full(fill_value=1 / n_classes, shape=(n_classes,))
for bandwidth in np.logspace(-4, 0.5, 50): for bandwidth in np.logspace(-4, np.log10(0.2), 50):
mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth) mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth)
test_densities = [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities] test_densities = [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities]
@ -239,7 +239,7 @@ class KDEyMLauto(KDEyML):
r = optimize.minimize(neglikelihood_band, x0=[0.001], method='SLSQP', bounds=bounds) r = optimize.minimize(neglikelihood_band, x0=[0.001], method='SLSQP', bounds=bounds)
best_band = r.x[0] best_band = r.x[0]
assert r.success, 'Process did not converge!' # assert r.success, 'Process did not converge!'
print(f'solved in nit={r.nit}') print(f'solved in nit={r.nit}')
return best_band return best_band
@ -333,11 +333,10 @@ class KDEyMLauto2(KDEyML):
return loss_accum return loss_accum
if self.search == 'optim': if self.search == 'optim':
r = optimize.minimize_scalar(eval_bandwidth, bounds=(0.001, 0.2), options={'xatol': 0.005}) r = optimize.minimize_scalar(eval_bandwidth, bounds=(0.0001, 0.2), options={'xatol': 0.005})
best_band = r.x best_band = r.x
best_loss_value = r.fun best_loss_value = r.fun
nit = r.nit nit = r.nit
# assert r.success, 'Process did not converge!'
elif self.search=='grid': elif self.search=='grid':
nit=20 nit=20
@ -348,20 +347,20 @@ class KDEyMLauto2(KDEyML):
self.bandwidth_ = best_band self.bandwidth_ = best_band
class KDEyMLred(KDEyML): # class KDEyMLred(KDEyML):
def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, random_state=None, reduction=100, max_reduced=500): # def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, random_state=None, reduction=100, max_reduced=500):
self.classifier = qp._get_classifier(classifier) # self.classifier = qp._get_classifier(classifier)
self.val_split = val_split # self.val_split = val_split
self.bandwidth = KDEBase._check_bandwidth(bandwidth) # self.bandwidth = KDEBase._check_bandwidth(bandwidth)
self.reduction = reduction # self.reduction = reduction
self.max_reduced = max_reduced # self.max_reduced = max_reduced
self.random_state = random_state # self.random_state = random_state
#
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): # def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
n_classes = classif_predictions.n_classes # n_classes = classif_predictions.n_classes
tr_length = min(self.reduction * n_classes, self.max_reduced) # tr_length = min(self.reduction * n_classes, self.max_reduced)
if len(classif_predictions) > tr_length: # if len(classif_predictions) > tr_length:
classif_predictions = classif_predictions.sampling(tr_length) # classif_predictions = classif_predictions.sampling(tr_length)
self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth) # self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)
return self # return self

View File

@ -7,7 +7,7 @@ import numpy as np
from sklearn.linear_model import LogisticRegression from sklearn.linear_model import LogisticRegression
import quapy as qp import quapy as qp
from KDEy.kdey_devel import KDEyMLauto, KDEyMLauto2, KDEyMLred from KDEy.kdey_devel import KDEyMLauto, KDEyMLauto2
from quapy.method.aggregative import PACC, EMQ, KDEyML from quapy.method.aggregative import PACC, EMQ, KDEyML
from quapy.model_selection import GridSearchQ from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP from quapy.protocol import UPP
@ -32,7 +32,7 @@ def wrap_hyper(classifier_hyper_grid: dict):
METHODS = [ METHODS = [
('PACC', PACC(newLR()), wrap_hyper(logreg_grid)), # ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)),
('EMQ', EMQ(newLR()), wrap_hyper(logreg_grid)), ('EMQ', EMQ(newLR()), wrap_hyper(logreg_grid)),
('KDEy', KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.logspace(-4, np.log10(0.2), 20)}}), ('KDEy', KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.logspace(-4, np.log10(0.2), 20)}}),
# ('KDEy-MLred', KDEyMLred(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.logspace(-4, np.log10(0.2), 20)}}), # ('KDEy-MLred', KDEyMLred(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.logspace(-4, np.log10(0.2), 20)}}),
@ -55,9 +55,9 @@ TKDEyML4 es como ML2 pero max 5 iteraciones por optimización
""" """
TRANSDUCTIVE_METHODS = [ TRANSDUCTIVE_METHODS = [
#('TKDEy-ML', KDEyMLauto(newLR()), None), #('TKDEy-ML', KDEyMLauto(newLR()), None),
# ('TKDEy-MLboth', KDEyMLauto(newLR(), optim='both'), None), ('TKDEy-both', KDEyMLauto(newLR(), optim='both'), None),
# ('TKDEy-MLbothfine', KDEyMLauto(newLR(), optim='both_fine'), None), ('TKDEy-bothfine', KDEyMLauto(newLR(), optim='both_fine'), None),
# ('TKDEy-ML2', KDEyMLauto(newLR(), optim='two_steps'), None), ('TKDEy-two', KDEyMLauto(newLR(), optim='two_steps'), None),
# ('TKDEy-MLike', KDEyMLauto(newLR(), optim='max_likelihood'), None), # ('TKDEy-MLike', KDEyMLauto(newLR(), optim='max_likelihood'), None),
# ('TKDEy-MLike2', KDEyMLauto(newLR(), optim='max_likelihood2'), None), # ('TKDEy-MLike2', KDEyMLauto(newLR(), optim='max_likelihood2'), None),
#('TKDEy-ML3', KDEyMLauto(newLR()), None), #('TKDEy-ML3', KDEyMLauto(newLR()), None),

@ -1 +1 @@
Subproject commit c223c9f1fe3c9708e8c5a5c56e438cdaaa857be4 Subproject commit 52547b253e906b8ae8d5ae3df77dafe72fac6902