cleaning last experiments for report

2024-10-17 12:28:24 +02:00 · 2024-10-17 12:28:24 +02:00 · 10595246a9
parent 9aecdad66f
commit 10595246a9
6 changed files with 203 additions and 39 deletions
--- a/KDEy/_experiments_depr.py
+++ b/KDEy/_experiments_depr.py
--- a/KDEy/quantification_evaluation_debug.py
+++ b/KDEy/quantification_evaluation_debug.py
@ -74,12 +74,11 @@ def plot(xaxis, metrics_measurements, metrics_names, suffix):
    plt.close()


-def plot_stack(xaxis, metrics_measurements, metrics_names, suffix):
+def plot_stack(xaxis, metrics_measurements, metrics_names, figname):

-    # Crear la figura y los ejes (4 bloques verticales)
-    fig, axs = plt.subplots(4, 1, figsize=(8, 12))
+    n_measures = len(metrics_measurements)//2

-    x = xaxis
+    fig, axs = plt.subplots(n_measures, 1, figsize=(8, 3*n_measures))

    indexes = np.arange(len(metrics_measurements))
    axs_idx = 0
@ -105,6 +104,9 @@ def plot_stack(xaxis, metrics_measurements, metrics_names, suffix):

        # axs[axs_idx].set_title(f'{metric_te_name} and {metric_tr_name}')
        axs[axs_idx].legend(loc='lower right')
+        axs[axs_idx].set_xscale('log')
+        if axs_idx==0:
+            axs[axs_idx].set_title(dataset)
        if axs_idx < len(indexes)//2 -1:
            axs[axs_idx].set_xticks([])

@ -120,7 +122,7 @@ def plot_stack(xaxis, metrics_measurements, metrics_names, suffix):
    # plt.show()
    os.makedirs('./plots/likelihood/', exist_ok=True)

-    plt.savefig(f'./plots/likelihood/{dataset}-fig{suffix}.png')
+    plt.savefig(f'./plots/likelihood/{figname}.png')
    plt.close()


@ -199,7 +201,7 @@ qp.environ['SAMPLE_SIZE'] = SAMPLE_SIZE
 show_ae = True
 show_rae = True
 show_mse = False
-show_kld = True
+show_kld = False
 normalize = True

 epsilon = 1e-10
@ -259,7 +261,7 @@ for i, dataset in enumerate(tqdm(DATASETS, desc='processing datasets', total=len
    # measurement_names.append('NLL(te)')
    # measurement_names.append('NLL(tr)')
    # plot(xaxis, measurements, measurement_names, suffix='AVEtr')
-    plot_stack(xaxis, measurements, measurement_names, suffix='AVEtr')
+    # plot_stack() appends the '.png' extension itself, so pass the bare figure name
+    plot_stack(xaxis, measurements, measurement_names, figname=f'{i}')



--- a/KDEy/gen_tables.py
+++ b/KDEy/gen_tables.py
@ -0,0 +1,163 @@
+import pickle
+import os
+from time import time
+from collections import defaultdict
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+import quapy as qp
+from KDEy.kdey_devel import KDEyMLauto, KDEyMLauto2
+from quapy.method.aggregative import PACC, EMQ, KDEyML
+from quapy.model_selection import GridSearchQ
+from quapy.protocol import UPP
+from pathlib import Path
+
+from result_table.src.table import Table
+
+SEED = 1
+
+
+def newLR():
+    """Return a new, unfitted ``LogisticRegression`` with ``max_iter=3000``."""
+    classifier = LogisticRegression(max_iter=3000)
+    return classifier
+
+
+# Typical hyperparameters explored for Logistic Regression: nine values of C,
+# log-spaced from 1e-4 to 1e4, and two class-weight settings (None, 'balanced').
+logreg_grid = {
+    'C': np.logspace(-4,4,9),
+    'class_weight': [None, 'balanced']
+}
+
+
+def wrap_hyper(classifier_hyper_grid: dict):
+    """Return a copy of the grid whose keys are prefixed with ``classifier__``.
+
+    The candidate values are passed through untouched; only the keys change.
+    Presumably the prefix lets a quantifier-level model selection route the
+    hyperparameters to the wrapped classifier -- TODO confirm against quapy.
+
+    :param classifier_hyper_grid: dict mapping a classifier hyperparameter name to its candidate values
+    :return: a new dict with the same values under ``'classifier__' + name`` keys
+    """
+    prefixed = {}
+    for name, candidates in classifier_hyper_grid.items():
+        prefixed['classifier__' + name] = candidates
+    return prefixed
+
+
+# Each entry is a (name, quantifier, hyperparameter grid) triple. This script only
+# reads the name, which is used to locate the result files of the method.
+METHODS = [
+    # ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)),
+    # ('EMQ', EMQ(newLR()), wrap_hyper(logreg_grid)),
+    ('KDEy',  KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.logspace(-4, np.log10(0.2), 20)}}),
+    # ('KDEy-MLred',  KDEyMLred(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.logspace(-4, np.log10(0.2), 20)}}),
+    ('KDEy-scott', KDEyML(newLR(), bandwidth='scott'), wrap_hyper(logreg_grid)),
+    ('KDEy-silver', KDEyML(newLR(), bandwidth='silverman'), wrap_hyper(logreg_grid)),
+    ('KDEy-NLL',  KDEyMLauto2(newLR(), bandwidth='auto', target='likelihood', search='grid'), wrap_hyper(logreg_grid)),
+    ('KDEy-NLL+',  KDEyMLauto2(newLR(), bandwidth='auto', target='likelihood', search='optim'), wrap_hyper(logreg_grid)),
+    ('KDEy-AE',  KDEyMLauto2(newLR(), bandwidth='auto', target='mae', search='grid'), wrap_hyper(logreg_grid)),
+    ('KDEy-AE+',  KDEyMLauto2(newLR(), bandwidth='auto', target='mae', search='optim'), wrap_hyper(logreg_grid)),
+    ('KDEy-RAE',  KDEyMLauto2(newLR(), bandwidth='auto', target='mrae', search='grid'), wrap_hyper(logreg_grid)),
+    ('KDEy-RAE+',  KDEyMLauto2(newLR(), bandwidth='auto', target='mrae', search='optim'), wrap_hyper(logreg_grid)),
+]
+
+
+"""
+TKDEyML was: first bandwidth (init 0.05), then prevalence (init uniform)
+TKDEyML2 was: first prevalence (init uniform), then bandwidth (init 0.05)
+TKDEyML3 was: first prevalence (init uniform), then bandwidth (init 0.1)
+TKDEyML4 is like ML2 but with at most 5 iterations per optimization
+"""
+# Same (name, quantifier, hyperparameter grid) layout as METHODS. Every entry is
+# currently commented out, so this list is empty and adds nothing to the tables.
+TRANSDUCTIVE_METHODS = [
+    #('TKDEy-ML',  KDEyMLauto(newLR()), None),
+    # ('TKDEy-both',  KDEyMLauto(newLR(), optim='both'), None),
+    # ('TKDEy-bothfine',  KDEyMLauto(newLR(), optim='both_fine'), None),
+    # ('TKDEy-two',  KDEyMLauto(newLR(), optim='two_steps'), None),
+    # ('TKDEy-MLike',  KDEyMLauto(newLR(), optim='max_likelihood'), None),
+    # ('TKDEy-MLike2',  KDEyMLauto(newLR(), optim='max_likelihood2'), None),
+    #('TKDEy-ML3',  KDEyMLauto(newLR()), None),
+    #('TKDEy-ML4',  KDEyMLauto(newLR()), None),
+]
+
+def show_results(result_path, tables, tables_path='./tables/main.pdf'):
+    """Print Dataset x Method summaries of the results CSV and export the tables.
+
+    :param result_path: path, without the ``.csv`` extension, of the tab-separated results file
+    :param tables: dict whose values are the ``Table`` objects handed to ``Table.LatexPDF``
+    :param tables_path: output path passed to ``Table.LatexPDF``; its parent directory is
+        created if it does not exist
+    """
+    import pandas as pd
+
+    results = pd.read_csv(result_path + '.csv', sep='\t')
+
+    # lift pandas' display limits so that the pivots are printed in full
+    display_options = (
+        ('display.max_columns', None),
+        ('display.max_rows', None),
+        ('display.width', 1000),  # adjust the maximum width
+    )
+    for option, value in display_options:
+        pd.set_option(option, value)
+
+    # one pivot per reported column; margins=True adds the 'All' row and column
+    for column in ('MAE', 'MRAE', 'KLD', 'TR-TIME', 'TE-TIME'):
+        print(results.pivot_table(index='Dataset', columns="Method", values=[column], margins=True))
+
+    os.makedirs(Path(tables_path).parent, exist_ok=True)
+
+    # names shown in the exported tables in place of the internal method names
+    method_replace = {
+        'KDEy': 'KDEy(orig)',
+        'KDEy-scott': 'Scott',
+        'KDEy-silver': 'Silver',
+        'KDEy-NLL': 'NLL(grid)',
+        'KDEy-NLL+': 'NLL(search)',
+        'KDEy-AE': 'AE(grid)',
+        'KDEy-AE+': 'AE(search)',
+        'KDEy-RAE': 'RAE(grid)',
+        'KDEy-RAE+': 'RAE(search)',
+    }
+
+    Table.LatexPDF(tables_path, list(tables.values()), method_replace=method_replace, verbose=True, clean=False)
+
+
+def collect_results(method_name, tables, result_path=None):
+    """Load the stored reports of one method into the tables and the global CSV.
+
+    For every dataset in ``qp.datasets.UCI_MULTICLASS_DATASETS`` the report stored as
+    ``<method_name>_<dataset>.dataframe`` (in the directory of the global results
+    file) is loaded, its values are added to the relevant tables, and one line with
+    the column means is appended to ``<result_path>.csv``. Datasets for which no
+    report exists are reported and skipped.
+
+    :param method_name: name of the method, as used in the report file names
+    :param tables: dict mapping a report column (e.g. ``'mae'``) to the table collecting it
+    :param result_path: path, without extension, of the global results file; when omitted,
+        the module-level ``global_result_path`` set by the script entry point is used
+    """
+    if result_path is None:
+        # backward compatible: fall back to the global defined under __main__
+        result_path = global_result_path
+
+    print('Init method', method_name)
+
+    reports_dir = Path(result_path).parent
+
+    with open(result_path + '.csv', 'at') as csv_file:
+        for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
+            print('init', dataset)
+
+            local_result_path = os.path.join(reports_dir, method_name + '_' + dataset + '.dataframe')
+
+            if not os.path.exists(local_result_path):
+                # nothing was computed for this (method, dataset) pair
+                print(f'result file {local_result_path} not found; skipping')
+                continue
+
+            print(f'loading result file {local_result_path}')
+            report = qp.util.load_report(local_result_path)
+
+            for metric, table in tables.items():
+                # Timing tables take every method. The MRAE table leaves out the
+                # methods named '-AE' and the MAE table those named '-RAE', i.e.,
+                # the ones whose bandwidth targets the other error measure.
+                if metric in ('tr_time', 'te_time'):
+                    add_column = True
+                elif metric == 'mrae':
+                    add_column = '-AE' not in method_name
+                elif metric == 'mae':
+                    add_column = '-RAE' not in method_name
+                else:
+                    add_column = False
+
+                if add_column:
+                    table.add(benchmark=dataset, method=method_name, v=report[metric])
+
+            means = report.mean(numeric_only=True)
+            csv_file.write(f'{method_name}\t{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\t{means["tr_time"]:.3f}\t{means["te_time"]:.3f}\n')
+            # flush per dataset so partial results survive an interrupted run
+            csv_file.flush()
+
+
+
+if __name__ == '__main__':
+    # Rebuilds the global results CSV and the tables from the per-method,
+    # per-dataset reports already stored in the results directory.
+
+    qp.environ['SAMPLE_SIZE'] = 500
+    qp.environ['N_JOBS'] = -1
+    n_bags_val = 100
+    n_bags_test = 500
+
+    result_dir = 'results_quantification/ucimulti'
+    os.makedirs(result_dir, exist_ok=True)
+
+    # one table per reported column; the test-time table ('te_time') is disabled
+    tables = {
+        metric: Table('inductive-' + metric.replace('_', '-'))
+        for metric in ('mae', 'mrae', 'tr_time')
+    }
+
+    tables['tr_time'].format.show_std = False
+    # tables['te_time'].format.show_std = False
+
+    # start from a fresh CSV holding only the header; collect_results() appends to it
+    global_result_path = result_dir + '/allmethods'
+    with open(global_result_path + '.csv', 'wt') as csv:
+        csv.write('Method\tDataset\tMAE\tMRAE\tKLD\tTR-TIME\tTE-TIME\n')
+
+    # only the method names are needed here; quantifiers and grids are ignored
+    for method_name, _, _ in METHODS + TRANSDUCTIVE_METHODS:
+        collect_results(method_name, tables)
+
+    show_results(global_result_path, tables)
--- a/KDEy/kdey_devel.py
+++ b/KDEy/kdey_devel.py
@ -40,7 +40,7 @@ class KDEyMLauto(KDEyML):
        current_bandwidth = 0.05
        if self.optim == 'both_fine':
            current_bandwidth = np.full(fill_value=current_bandwidth, shape=(n_classes,))
-        current_prevalence = np.full(fill_value=1 / n_classes, shape=(n_classes,))
+        current_prevalence = F.uniform_prevalence(n_classes=n_classes)

        if self.optim == 'max_likelihood':
            current_prevalence, current_bandwidth = self.optim_minimize_like(tr_posteriors, tr_y, te_posteriors, classes, grid=True)
@ -107,9 +107,9 @@ class KDEyMLauto(KDEyML):

        # bounds = [(0.00001, 0.2)]
        # r = optimize.minimize(neg_loglikelihood_bandwidth, x0=[current_bandwidth], method='SLSQP', bounds=bounds)
-        r = optimize.minimize_scalar(neg_loglikelihood_bandwidth, bounds=(0.00001, 0.2))
+        r = optimize.minimize_scalar(neg_loglikelihood_bandwidth, bounds=(0.0001, 0.2), options={'xatol': 0.005})
        # print(f'iterations-bandwidth={r.nit}')
-        assert r.success, f'Process did not converge! {r.message}'
+        # assert r.success, f'Process did not converge! {r.message}'
        return r.x

    def optim_minimize_both(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes):
@ -128,7 +128,7 @@ class KDEyMLauto(KDEyML):
        prevalence_bandwidth = np.append(current_prev, current_bandwidth)
        r = optimize.minimize(neg_loglikelihood_bandwidth, x0=prevalence_bandwidth, method='SLSQP', bounds=bounds, constraints=constraints)
        print(f'iterations-both={r.nit}')
-        assert r.success, 'Process did not converge!'
+        # assert r.success, 'Process did not converge!'
        prev_band = r.x
        current_prevalence = prev_band[:-1]
        current_bandwidth = prev_band[-1]
@ -145,12 +145,12 @@ class KDEyMLauto(KDEyML):
            test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
            return -np.sum(test_loglikelihood)

-        bounds = [(0, 1) for _ in range(n_classes)] + [(0.00001, 1) for _ in range(n_classes)]
+        bounds = [(0, 1) for _ in range(n_classes)] + [(0.0001, 0.2) for _ in range(n_classes)]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x[:n_classes])})
        prevalence_bandwidth = np.concatenate((current_prev, current_bandwidth))
        r = optimize.minimize(neg_loglikelihood_bandwidth, x0=prevalence_bandwidth, method='SLSQP', bounds=bounds, constraints=constraints)
        print(f'iterations-both-fine={r.nit}')
-        assert r.success, 'Process did not converge!'
+        # assert r.success, 'Process did not converge!'
        prev_band = r.x
        current_prevalence = prev_band[:n_classes]
        current_bandwidth = prev_band[n_classes:]
@ -198,7 +198,7 @@ class KDEyMLauto(KDEyML):
        best_like = None
        best_prev = None
        init_prev = np.full(fill_value=1 / n_classes, shape=(n_classes,))
-        for bandwidth in np.logspace(-4, 0.5, 50):
+        for bandwidth in np.logspace(-4, np.log10(0.2), 50):
            mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth)
            test_densities = [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities]

@ -239,7 +239,7 @@ class KDEyMLauto(KDEyML):
        r = optimize.minimize(neglikelihood_band, x0=[0.001], method='SLSQP', bounds=bounds)

        best_band = r.x[0]
-        assert r.success, 'Process did not converge!'
+        # assert r.success, 'Process did not converge!'
        print(f'solved in nit={r.nit}')
        return best_band

@ -333,11 +333,10 @@ class KDEyMLauto2(KDEyML):
            return loss_accum

        if self.search == 'optim':
-            r = optimize.minimize_scalar(eval_bandwidth, bounds=(0.001, 0.2), options={'xatol': 0.005})
+            r = optimize.minimize_scalar(eval_bandwidth, bounds=(0.0001, 0.2), options={'xatol': 0.005})
            best_band = r.x
            best_loss_value = r.fun
            nit = r.nit
-            # assert r.success, 'Process did not converge!'

        elif self.search=='grid':
            nit=20
@ -348,20 +347,20 @@ class KDEyMLauto2(KDEyML):
        self.bandwidth_ = best_band


-class KDEyMLred(KDEyML):
-    def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, random_state=None, reduction=100, max_reduced=500):
-        self.classifier = qp._get_classifier(classifier)
-        self.val_split = val_split
-        self.bandwidth = KDEBase._check_bandwidth(bandwidth)
-        self.reduction = reduction
-        self.max_reduced = max_reduced
-        self.random_state = random_state
-
-    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
-        n_classes = classif_predictions.n_classes
-        tr_length = min(self.reduction * n_classes, self.max_reduced)
-        if len(classif_predictions) > tr_length:
-            classif_predictions = classif_predictions.sampling(tr_length)
-        self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)
-        return self
+# class KDEyMLred(KDEyML):
+#     def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, random_state=None, reduction=100, max_reduced=500):
+#         self.classifier = qp._get_classifier(classifier)
+#         self.val_split = val_split
+#         self.bandwidth = KDEBase._check_bandwidth(bandwidth)
+#         self.reduction = reduction
+#         self.max_reduced = max_reduced
+#         self.random_state = random_state
+#
+#     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
+#         n_classes = classif_predictions.n_classes
+#         tr_length = min(self.reduction * n_classes, self.max_reduced)
+#         if len(classif_predictions) > tr_length:
+#             classif_predictions = classif_predictions.sampling(tr_length)
+#         self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)
+#         return self

--- a/KDEy/quantification_evaluation.py
+++ b/KDEy/quantification_evaluation.py
@ -7,7 +7,7 @@ import numpy as np
 from sklearn.linear_model import LogisticRegression

 import quapy as qp
-from KDEy.kdey_devel import KDEyMLauto, KDEyMLauto2, KDEyMLred
+from KDEy.kdey_devel import KDEyMLauto, KDEyMLauto2
 from quapy.method.aggregative import PACC, EMQ, KDEyML
 from quapy.model_selection import GridSearchQ
 from quapy.protocol import UPP
@ -32,7 +32,7 @@ def wrap_hyper(classifier_hyper_grid: dict):


 METHODS = [
-    ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)),
+    # ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)),
    ('EMQ', EMQ(newLR()), wrap_hyper(logreg_grid)),
    ('KDEy',  KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.logspace(-4, np.log10(0.2), 20)}}),
    # ('KDEy-MLred',  KDEyMLred(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.logspace(-4, np.log10(0.2), 20)}}),
@ -55,9 +55,9 @@ TKDEyML4 es como ML2 pero max 5 iteraciones por optimización
 """
 TRANSDUCTIVE_METHODS = [
    #('TKDEy-ML',  KDEyMLauto(newLR()), None),
-    # ('TKDEy-MLboth',  KDEyMLauto(newLR(), optim='both'), None),
-    # ('TKDEy-MLbothfine',  KDEyMLauto(newLR(), optim='both_fine'), None),
-    # ('TKDEy-ML2',  KDEyMLauto(newLR(), optim='two_steps'), None),
+    ('TKDEy-both',  KDEyMLauto(newLR(), optim='both'), None),
+    ('TKDEy-bothfine',  KDEyMLauto(newLR(), optim='both_fine'), None),
+    ('TKDEy-two',  KDEyMLauto(newLR(), optim='two_steps'), None),
    # ('TKDEy-MLike',  KDEyMLauto(newLR(), optim='max_likelihood'), None),
    # ('TKDEy-MLike2',  KDEyMLauto(newLR(), optim='max_likelihood2'), None),
    #('TKDEy-ML3',  KDEyMLauto(newLR()), None),
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit c223c9f1fe3c9708e8c5a5c56e438cdaaa857be4
+Subproject commit 52547b253e906b8ae8d5ae3df77dafe72fac6902