cleaning last experiments for report

2024-10-17 12:28:24 +02:00 · 2024-10-17 12:28:24 +02:00 · 10595246a9
parent 9aecdad66f
commit 10595246a9
6 changed files with 203 additions and 39 deletions
--- a/KDEy/_experiments_depr.py
+++ b/KDEy/_experiments_depr.py
--- a/KDEy/quantification_evaluation_debug.py
+++ b/KDEy/quantification_evaluation_debug.py
@ -74,12 +74,11 @@ def plot(xaxis, metrics_measurements, metrics_names, suffix):
    plt.close()
-def plot_stack(xaxis, metrics_measurements, metrics_names, suffix):
+def plot_stack(xaxis, metrics_measurements, metrics_names, figname):
-    # Crear la figura y los ejes (4 bloques verticales)
+    n_measures = len(metrics_measurements)//2
    fig, axs = plt.subplots(4, 1, figsize=(8, 12))
-    x = xaxis
+    fig, axs = plt.subplots(n_measures, 1, figsize=(8, 3*n_measures))
    indexes = np.arange(len(metrics_measurements))
    axs_idx = 0
@ -105,6 +104,9 @@ def plot_stack(xaxis, metrics_measurements, metrics_names, suffix):
        # axs[axs_idx].set_title(f'{metric_te_name} and {metric_tr_name}')
        axs[axs_idx].legend(loc='lower right')
        axs[axs_idx].set_xscale('log')
        if axs_idx==0:
            axs[axs_idx].set_title(dataset)
        if axs_idx < len(indexes)//2 -1:
            axs[axs_idx].set_xticks([])
@ -120,7 +122,7 @@ def plot_stack(xaxis, metrics_measurements, metrics_names, suffix):
    # plt.show()
    os.makedirs('./plots/likelihood/', exist_ok=True)
-    plt.savefig(f'./plots/likelihood/{dataset}-fig{suffix}.png')
+    plt.savefig(f'./plots/likelihood/{figname}.png')
    plt.close()
@ -199,7 +201,7 @@ qp.environ['SAMPLE_SIZE'] = SAMPLE_SIZE
 show_ae = True
 show_rae = True
 show_mse = False
-show_kld = True
+show_kld = False
 normalize = True
 epsilon = 1e-10
@ -259,7 +261,7 @@ for i, dataset in enumerate(tqdm(DATASETS, desc='processing datasets', total=len
    # measurement_names.append('NLL(te)')
    # measurement_names.append('NLL(tr)')
    # plot(xaxis, measurements, measurement_names, suffix='AVEtr')
-    plot_stack(xaxis, measurements, measurement_names, suffix='AVEtr')
+    plot_stack(xaxis, measurements, measurement_names, figname=f'{i}')  # bare stem: plot_stack itself appends '.png' when saving, so passing '{i}.png' produced '{i}.png.png'
--- a/KDEy/gen_tables.py
+++ b/KDEy/gen_tables.py
@ -0,0 +1,163 @@
 import pickle
 import os
 from time import time
 from collections import defaultdict
 import numpy as np
 from sklearn.linear_model import LogisticRegression
 import quapy as qp
 from KDEy.kdey_devel import KDEyMLauto, KDEyMLauto2
 from quapy.method.aggregative import PACC, EMQ, KDEyML
 from quapy.model_selection import GridSearchQ
 from quapy.protocol import UPP
 from pathlib import Path
 from result_table.src.table import Table
 SEED = 1
 def newLR():
    """Return a new LogisticRegression classifier created with max_iter=3000."""
    classifier = LogisticRegression(max_iter=3000)
    return classifier
 # typical hyperparameters explored for Logistic Regression
 # (wrap_hyper() adds the 'classifier__' prefix to these keys before the grid is
 # attached to the entries of METHODS)
 logreg_grid = {
    'C': np.logspace(-4,4,9),
    'class_weight': [None, 'balanced']
 }
 def wrap_hyper(classifier_hyper_grid: dict):
    """Return a copy of the grid in which every key is prefixed with 'classifier__'.

    :param classifier_hyper_grid: dict mapping a hyperparameter name to its candidate values
    :return: a new dict with the same values and the prefixed keys
    """
    prefixed_grid = {}
    for hyper_name, candidates in classifier_hyper_grid.items():
        prefixed_grid['classifier__' + hyper_name] = candidates
    return prefixed_grid
 # Each entry is a (method name, quantifier instance, hyperparameter grid) tuple.
 # In this script only the method name is read (see the loop in the __main__ section);
 # the name is also the key translated by `method_replace` in show_results().
 METHODS = [
    # ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)),
    # ('EMQ', EMQ(newLR()), wrap_hyper(logreg_grid)),
    ('KDEy',  KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.logspace(-4, np.log10(0.2), 20)}}),
    # ('KDEy-MLred',  KDEyMLred(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.logspace(-4, np.log10(0.2), 20)}}),
    ('KDEy-scott', KDEyML(newLR(), bandwidth='scott'), wrap_hyper(logreg_grid)),
    ('KDEy-silver', KDEyML(newLR(), bandwidth='silverman'), wrap_hyper(logreg_grid)),
    ('KDEy-NLL',  KDEyMLauto2(newLR(), bandwidth='auto', target='likelihood', search='grid'), wrap_hyper(logreg_grid)),
    ('KDEy-NLL+',  KDEyMLauto2(newLR(), bandwidth='auto', target='likelihood', search='optim'), wrap_hyper(logreg_grid)),
    ('KDEy-AE',  KDEyMLauto2(newLR(), bandwidth='auto', target='mae', search='grid'), wrap_hyper(logreg_grid)),
    ('KDEy-AE+',  KDEyMLauto2(newLR(), bandwidth='auto', target='mae', search='optim'), wrap_hyper(logreg_grid)),
    ('KDEy-RAE',  KDEyMLauto2(newLR(), bandwidth='auto', target='mrae', search='grid'), wrap_hyper(logreg_grid)),
    ('KDEy-RAE+',  KDEyMLauto2(newLR(), bandwidth='auto', target='mrae', search='optim'), wrap_hyper(logreg_grid)),
 ]
 # English translation of the (Spanish) notes kept in the string literal below:
 #   TKDEyML  optimized the bandwidth first (init 0.05) and then the prevalence (init uniform)
 #   TKDEyML2 optimized the prevalence first (init uniform) and then the bandwidth (init 0.05)
 #   TKDEyML3 optimized the prevalence first (init uniform) and then the bandwidth (init 0.1)
 #   TKDEyML4 is like ML2 but with at most 5 iterations per optimization
 """
 TKDEyML era primero bandwidth (init 0.05) y luego prevalence (init uniform)
 TKDEyML2 era primero prevalence (init uniform) y luego bandwidth (init 0.05)
 TKDEyML3 era primero prevalence (init uniform) y luego bandwidth (init 0.1)
 TKDEyML4 es como ML2 pero max 5 iteraciones por optimización 
 """
 # Same (name, quantifier, grid) layout as METHODS; every entry is currently
 # commented out, so this list is empty.
 TRANSDUCTIVE_METHODS = [
    #('TKDEy-ML',  KDEyMLauto(newLR()), None),
    # ('TKDEy-both',  KDEyMLauto(newLR(), optim='both'), None),
    # ('TKDEy-bothfine',  KDEyMLauto(newLR(), optim='both_fine'), None),
    # ('TKDEy-two',  KDEyMLauto(newLR(), optim='two_steps'), None),
    # ('TKDEy-MLike',  KDEyMLauto(newLR(), optim='max_likelihood'), None),
    # ('TKDEy-MLike2',  KDEyMLauto(newLR(), optim='max_likelihood2'), None),
    #('TKDEy-ML3',  KDEyMLauto(newLR()), None),
    #('TKDEy-ML4',  KDEyMLauto(newLR()), None),
 ]
 def show_results(result_path, tables, tables_path='./tables/main.pdf'):
    """Print one pivot table per reported column and pass the tables to Table.LatexPDF.

    :param result_path: path, without the '.csv' extension, of the tab-separated results file
    :param tables: dict whose values are the Table objects handed to Table.LatexPDF
    :param tables_path: output path given to Table.LatexPDF; its parent directory is
        created when missing
    """
    import pandas as pd
    df = pd.read_csv(result_path + '.csv', sep='\t')
    # (option, value) pairs: no truncation of columns/rows and a wide maximum display width
    display_options = (
        ('display.max_columns', None),
        ('display.max_rows', None),
        ('display.width', 1000),
    )
    for option, value in display_options:
        pd.set_option(option, value)
    # one Dataset x Method pivot (with margins) per column of the results file
    for column in ("MAE", "MRAE", "KLD", "TR-TIME", "TE-TIME"):
        pivot = df.pivot_table(index='Dataset', columns="Method", values=[column], margins=True)
        print(pivot)
    output_dir = Path(tables_path).parent
    os.makedirs(output_dir, exist_ok=True)
    # names used in the rendered tables instead of the internal method names
    method_replace = {
        'KDEy': 'KDEy(orig)',
        'KDEy-scott': 'Scott',
        'KDEy-silver': 'Silver',
        'KDEy-NLL': 'NLL(grid)',
        'KDEy-NLL+': 'NLL(search)',
        'KDEy-AE': 'AE(grid)',
        'KDEy-AE+': 'AE(search)',
        'KDEy-RAE': 'RAE(grid)',
        'KDEy-RAE+': 'RAE(search)',
    }
    table_list = list(tables.values())
    Table.LatexPDF(tables_path, table_list, method_replace=method_replace, verbose=True, clean=False)
 def collect_results(method_name, tables):
    """Load the stored reports of one method and feed them to the tables and the global CSV.

    For every dataset in qp.datasets.UCI_MULTICLASS_DATASETS, the report file
    '<method_name>_<dataset>.dataframe' is looked up in the parent directory of the
    module-level global_result_path; datasets without such a file are skipped.
    One row with the mean values of the report is appended to the global CSV per dataset.

    :param method_name: name of the method whose reports are collected
    :param tables: dict mapping a metric name (a column of the report) to the Table
        that receives the values of that metric
    """
    print('Init method', method_name)
    timing_metrics = ['tr_time', 'te_time']
    with open(global_result_path + '.csv', 'at') as csv:
        for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
            print('init', dataset)
            report_filename = method_name + '_' + dataset + '.dataframe'
            local_result_path = os.path.join(Path(global_result_path).parent, report_filename)
            if not os.path.exists(local_result_path):
                continue
            print(f'result file {local_result_path} already exist; skipping')
            report = qp.util.load_report(local_result_path)
            for metric in tables:
                if metric in timing_metrics:
                    # timing tables take every method
                    add_column = True
                elif metric == 'mrae':
                    # methods whose name contains '-AE' are left out of the MRAE table
                    add_column = '-AE' not in method_name
                elif metric == 'mae':
                    # methods whose name contains '-RAE' are left out of the MAE table
                    add_column = '-RAE' not in method_name
                else:
                    add_column = False
                if add_column:
                    tables[metric].add(benchmark=dataset, method=method_name, v=report[metric])
            means = report.mean(numeric_only=True)
            row = f'{method_name}\t{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\t{means["tr_time"]:.3f}\t{means["te_time"]:.3f}\n'
            csv.write(row)
            csv.flush()
 if __name__ == '__main__':
    # Script entry point: rebuild the global CSV from the stored per-method reports,
    # then print the pivots and generate the tables.
    qp.environ['SAMPLE_SIZE'] = 500
    qp.environ['N_JOBS'] = -1
    n_bags_val = 100
    n_bags_test = 500
    result_dir = 'results_quantification/ucimulti'
    # read by collect_results() as a module-level global
    global_result_path = f'{result_dir}/allmethods'
    os.makedirs(result_dir, exist_ok=True)
    # one table per reported measure (the test-time table is currently disabled)
    table_names = (
        ('mae', 'inductive-mae'),
        ('mrae', 'inductive-mrae'),
        ('tr_time', 'inductive-tr-time'),
        # ('te_time', 'inductive-te-time'),
    )
    tables = {metric: Table(table_name) for metric, table_name in table_names}
    tables['tr_time'].format.show_std = False
    # tables['te_time'].format.show_std = False
    # start the global CSV from scratch with only the header row;
    # collect_results() appends the data rows
    header = 'Method\tDataset\tMAE\tMRAE\tKLD\tTR-TIME\tTE-TIME\n'
    with open(global_result_path + '.csv', 'wt') as csv:
        csv.write(header)
    for method_name, _quantifier, _param_grid in METHODS + TRANSDUCTIVE_METHODS:
        collect_results(method_name, tables)
    show_results(global_result_path, tables)
--- a/KDEy/kdey_devel.py
+++ b/KDEy/kdey_devel.py
@ -40,7 +40,7 @@ class KDEyMLauto(KDEyML):
        current_bandwidth = 0.05
        if self.optim == 'both_fine':
            current_bandwidth = np.full(fill_value=current_bandwidth, shape=(n_classes,))
-        current_prevalence = np.full(fill_value=1 / n_classes, shape=(n_classes,))
+        current_prevalence = F.uniform_prevalence(n_classes=n_classes)
        if self.optim == 'max_likelihood':
            current_prevalence, current_bandwidth = self.optim_minimize_like(tr_posteriors, tr_y, te_posteriors, classes, grid=True)
@ -107,9 +107,9 @@ class KDEyMLauto(KDEyML):
        # bounds = [(0.00001, 0.2)]
        # r = optimize.minimize(neg_loglikelihood_bandwidth, x0=[current_bandwidth], method='SLSQP', bounds=bounds)
-        r = optimize.minimize_scalar(neg_loglikelihood_bandwidth, bounds=(0.00001, 0.2))
+        r = optimize.minimize_scalar(neg_loglikelihood_bandwidth, bounds=(0.0001, 0.2), options={'xatol': 0.005})
        # print(f'iterations-bandwidth={r.nit}')
-        assert r.success, f'Process did not converge! {r.message}'
+        # assert r.success, f'Process did not converge! {r.message}'
        return r.x
    def optim_minimize_both(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes):
@ -128,7 +128,7 @@ class KDEyMLauto(KDEyML):
        prevalence_bandwidth = np.append(current_prev, current_bandwidth)
        r = optimize.minimize(neg_loglikelihood_bandwidth, x0=prevalence_bandwidth, method='SLSQP', bounds=bounds, constraints=constraints)
        print(f'iterations-both={r.nit}')
-        assert r.success, 'Process did not converge!'
+        # assert r.success, 'Process did not converge!'
        prev_band = r.x
        current_prevalence = prev_band[:-1]
        current_bandwidth = prev_band[-1]
@ -145,12 +145,12 @@ class KDEyMLauto(KDEyML):
            test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
            return -np.sum(test_loglikelihood)
-        bounds = [(0, 1) for _ in range(n_classes)] + [(0.00001, 1) for _ in range(n_classes)]
+        bounds = [(0, 1) for _ in range(n_classes)] + [(0.0001, 0.2) for _ in range(n_classes)]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x[:n_classes])})
        prevalence_bandwidth = np.concatenate((current_prev, current_bandwidth))
        r = optimize.minimize(neg_loglikelihood_bandwidth, x0=prevalence_bandwidth, method='SLSQP', bounds=bounds, constraints=constraints)
        print(f'iterations-both-fine={r.nit}')
-        assert r.success, 'Process did not converge!'
+        # assert r.success, 'Process did not converge!'
        prev_band = r.x
        current_prevalence = prev_band[:n_classes]
        current_bandwidth = prev_band[n_classes:]
@ -198,7 +198,7 @@ class KDEyMLauto(KDEyML):
        best_like = None
        best_prev = None
        init_prev = np.full(fill_value=1 / n_classes, shape=(n_classes,))
-        for bandwidth in np.logspace(-4, 0.5, 50):
+        for bandwidth in np.logspace(-4, np.log10(0.2), 50):
            mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth)
            test_densities = [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities]
@ -239,7 +239,7 @@ class KDEyMLauto(KDEyML):
        r = optimize.minimize(neglikelihood_band, x0=[0.001], method='SLSQP', bounds=bounds)
        best_band = r.x[0]
-        assert r.success, 'Process did not converge!'
+        # assert r.success, 'Process did not converge!'
        print(f'solved in nit={r.nit}')
        return best_band
@ -333,11 +333,10 @@ class KDEyMLauto2(KDEyML):
            return loss_accum
        if self.search == 'optim':
-            r = optimize.minimize_scalar(eval_bandwidth, bounds=(0.001, 0.2), options={'xatol': 0.005})
+            r = optimize.minimize_scalar(eval_bandwidth, bounds=(0.0001, 0.2), options={'xatol': 0.005})
            best_band = r.x
            best_loss_value = r.fun
            nit = r.nit
            # assert r.success, 'Process did not converge!'
        elif self.search=='grid':
            nit=20
@ -348,20 +347,20 @@ class KDEyMLauto2(KDEyML):
        self.bandwidth_ = best_band
-class KDEyMLred(KDEyML):
+# class KDEyMLred(KDEyML):
-    def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, random_state=None, reduction=100, max_reduced=500):
+#     def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, random_state=None, reduction=100, max_reduced=500):
-        self.classifier = qp._get_classifier(classifier)
+#         self.classifier = qp._get_classifier(classifier)
-        self.val_split = val_split
+#         self.val_split = val_split
-        self.bandwidth = KDEBase._check_bandwidth(bandwidth)
+#         self.bandwidth = KDEBase._check_bandwidth(bandwidth)
-        self.reduction = reduction
+#         self.reduction = reduction
-        self.max_reduced = max_reduced
+#         self.max_reduced = max_reduced
-        self.random_state = random_state
+#         self.random_state = random_state
-
+#
-    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+#     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
-        n_classes = classif_predictions.n_classes
+#         n_classes = classif_predictions.n_classes
-        tr_length = min(self.reduction * n_classes, self.max_reduced)
+#         tr_length = min(self.reduction * n_classes, self.max_reduced)
-        if len(classif_predictions) > tr_length:
+#         if len(classif_predictions) > tr_length:
-            classif_predictions = classif_predictions.sampling(tr_length)
+#             classif_predictions = classif_predictions.sampling(tr_length)
-        self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)
+#         self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)
-        return self
+#         return self
--- a/KDEy/quantification_evaluation.py
+++ b/KDEy/quantification_evaluation.py
@ -7,7 +7,7 @@ import numpy as np
 from sklearn.linear_model import LogisticRegression
 import quapy as qp
-from KDEy.kdey_devel import KDEyMLauto, KDEyMLauto2, KDEyMLred
+from KDEy.kdey_devel import KDEyMLauto, KDEyMLauto2
 from quapy.method.aggregative import PACC, EMQ, KDEyML
 from quapy.model_selection import GridSearchQ
 from quapy.protocol import UPP
@ -32,7 +32,7 @@ def wrap_hyper(classifier_hyper_grid: dict):
 METHODS = [
-    ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)),
+    # ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)),
    ('EMQ', EMQ(newLR()), wrap_hyper(logreg_grid)),
    ('KDEy',  KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.logspace(-4, np.log10(0.2), 20)}}),
    # ('KDEy-MLred',  KDEyMLred(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.logspace(-4, np.log10(0.2), 20)}}),
@ -55,9 +55,9 @@ TKDEyML4 es como ML2 pero max 5 iteraciones por optimización
 """
 TRANSDUCTIVE_METHODS = [
    #('TKDEy-ML',  KDEyMLauto(newLR()), None),
-    # ('TKDEy-MLboth',  KDEyMLauto(newLR(), optim='both'), None),
+    ('TKDEy-both',  KDEyMLauto(newLR(), optim='both'), None),
-    # ('TKDEy-MLbothfine',  KDEyMLauto(newLR(), optim='both_fine'), None),
+    ('TKDEy-bothfine',  KDEyMLauto(newLR(), optim='both_fine'), None),
-    # ('TKDEy-ML2',  KDEyMLauto(newLR(), optim='two_steps'), None),
+    ('TKDEy-two',  KDEyMLauto(newLR(), optim='two_steps'), None),
    # ('TKDEy-MLike',  KDEyMLauto(newLR(), optim='max_likelihood'), None),
    # ('TKDEy-MLike2',  KDEyMLauto(newLR(), optim='max_likelihood2'), None),
    #('TKDEy-ML3',  KDEyMLauto(newLR()), None),
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit c223c9f1fe3c9708e8c5a5c56e438cdaaa857be4
+Subproject commit 52547b253e906b8ae8d5ae3df77dafe72fac6902
		`@ -1 +1 @@`
			`Subproject commit c223c9f1fe3c9708e8c5a5c56e438cdaaa857be4`				`Subproject commit 52547b253e906b8ae8d5ae3df77dafe72fac6902`