code used to generate plots
This commit is contained in: parent 2e992a0b9a, commit bb0950fad5
@@ -0,0 +1,73 @@
import itertools

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


palette = itertools.cycle(sns.color_palette())


def setframe():
    # hide the top/left/right spines and the y ticks of the current axes (the global `fig`)
    fig.spines['top'].set_visible(False)
    fig.spines['left'].set_visible(False)
    fig.get_yaxis().set_ticks([])
    fig.spines['right'].set_visible(False)
    # fig.axis('off')


nbins = 50
figsize = (5, 2)
ymax = 0.2

# class-conditional scores for the negative class: a normal truncated to [0, 1]
negatives = np.random.normal(loc=0.3, scale=0.2, size=20000)
negatives = np.asarray([x for x in negatives if 0 <= x <= 1])

plt.figure(figsize=figsize)
plt.xlim(0, 1)
plt.ylim(0, ymax)
fig = sns.histplot(data=negatives, binrange=(0, 1), bins=nbins, stat='probability', color=next(palette))
plt.title('Negative distribution')
fig.set(yticklabels=[])
fig.set(ylabel=None)
setframe()
# fig.get_figure().savefig('plots_cacm/negatives.pdf')
# plt.clf()

# -------------------------------------------------------------

# class-conditional scores for the positive class
positives1 = np.random.normal(loc=0.75, scale=0.06, size=20000)
positives2 = np.random.normal(loc=0.65, scale=0.1, size=1)
positives = np.concatenate([positives1, positives2])
np.random.shuffle(positives)
positives = np.asarray([x for x in positives if 0 <= x <= 1])

# plt.figure(figsize=figsize)
plt.xlim(0, 1)
plt.ylim(0, ymax)
fig = sns.histplot(data=positives, binrange=(0, 1), bins=nbins, stat='probability', color=next(palette))
plt.title('')
fig.set(yticklabels=[])
fig.set(ylabel=None)
setframe()
fig.get_figure().savefig('plots_cacm/training.pdf')

# -------------------------------------------------------------

# test distribution: a mixture with positive prevalence `prev`
prev = 0.2
test = np.concatenate([
    negatives[:int(len(negatives) * (1 - prev))],
    positives[:int(len(positives) * prev)],
])

plt.figure(figsize=figsize)
plt.xlim(0, 1)
plt.ylim(0, ymax)
fig = sns.histplot(data=test, binrange=(0, 1), bins=nbins, stat='probability', color=next(palette))
plt.title('')
fig.set(yticklabels=[])
fig.set(ylabel=None)
setframe()
fig.get_figure().savefig('plots_cacm/test.pdf')
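The `test` sample above is a prevalence-controlled mixture of the two class-conditional samples. A minimal sketch of the same construction as a reusable helper (the function name is illustrative, not part of the script):

import numpy as np

def mix_at_prevalence(negatives, positives, prev):
    # keep a (1 - prev) fraction of the negatives and a prev fraction of the positives;
    # the resulting positive prevalence is approximately prev (exactly prev when both pools have equal size)
    n_neg = int(len(negatives) * (1 - prev))
    n_pos = int(len(positives) * prev)
    return np.concatenate([negatives[:n_neg], positives[:n_pos]])

# e.g., mix_at_prevalence(negatives, positives, prev=0.2) reproduces `test` above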
@@ -0,0 +1,86 @@
from copy import deepcopy

import numpy as np
from sklearn.linear_model import LogisticRegression

import quapy as qp
from method.non_aggregative import DMx
from protocol import APP
from quapy.method.aggregative import CC, ACC, DMy
from sklearn.svm import LinearSVC


qp.environ['SAMPLE_SIZE'] = 100
DATASETS = qp.datasets.UCI_DATASETS[10:]


def fit_eval_task(args):
    model_name, model, train, test = args
    with qp.util.temp_seed(0):
        model = deepcopy(model)
        model.fit(train)
        true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0))
    return model_name, true_prev, estim_prev


def gen_data():

    def base_classifier():
        return LogisticRegression()
        # return LinearSVC(class_weight='balanced')

    def models():
        yield 'CC', CC(base_classifier())
        yield 'ACC', ACC(base_classifier())
        yield 'HDy', DMy(base_classifier(), val_split=10, nbins=10, n_jobs=-1)
        yield 'HDx', DMx(nbins=10, n_jobs=-1)

    # train, test = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=10).train_test
    method_names, true_prevs, estim_prevs, tr_prevs = [], [], [], []

    for dataset_name in DATASETS:
        train, test = qp.datasets.fetch_UCIDataset(dataset_name).train_test
        print(dataset_name, train.X.shape)

        # fit and evaluate the four quantifiers in parallel
        outs = qp.util.parallel(
            fit_eval_task,
            ((method_name, model, train, test) for method_name, model in models()),
            seed=0,
            n_jobs=-1
        )

        for method_name, true_prev, estim_prev in outs:
            method_names.append(method_name)
            true_prevs.append(true_prev)
            estim_prevs.append(estim_prev)
            tr_prevs.append(train.prevalence())

    return method_names, true_prevs, estim_prevs, tr_prevs


method_names, true_prevs, estim_prevs, tr_prevs = qp.util.pickled_resource('../quick_experiment/pickled_plot_data.pkl', gen_data)


def remove_dataset(dataset_order, num_methods=4):
    # results are stored flat, with num_methods consecutive entries per dataset,
    # so entry i belongs to dataset i // num_methods
    sel_names, sel_true, sel_estim, sel_tr = [], [], [], []
    for i, (name, true, estim, tr) in enumerate(zip(method_names, true_prevs, estim_prevs, tr_prevs)):
        dataset_pos = i // num_methods
        if dataset_pos not in dataset_order:
            sel_names.append(name)
            sel_true.append(true)
            sel_estim.append(estim)
            sel_tr.append(tr)
    return np.asarray(sel_names), np.asarray(sel_true), np.asarray(sel_estim), np.asarray(sel_tr)


print(DATASETS)
selected = 10
for i in [selected]:
    print(i, DATASETS[i])
    all_ = set(range(len(DATASETS)))
    remove_index = sorted(all_ - {i})
    sel_names, sel_true, sel_estim, sel_tr = remove_dataset(dataset_order=remove_index, num_methods=4)

    p = sel_tr[0][1]
    sel_names = ['CC$_{' + str(p) + '}$' if x == 'CC' else x for x in sel_names]

    # qp.plot.binary_diagonal(sel_names, sel_true, sel_estim, train_prev=sel_tr[0], show_std=False, savepath=f'./plots/bin_diag_{i}.png')
    qp.plot.error_by_drift(sel_names, sel_true, sel_estim, sel_tr, n_bins=10, savepath=f'./plots/err_drift_{i}.png', show_std=True, show_density=False, title="")
    # qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, savepath='./plots/bin_bias.png')
    # qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, nbins=3, savepath='./plots/bin_bias_bin.png')
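`qp.util.pickled_resource` caches the expensive `gen_data()` run so the plots can be re-tuned without re-fitting every model. A minimal sketch of the load-or-generate pattern it implements (an assumption about its behavior, not QuaPy's actual code):

import os
import pickle

def load_or_generate(path, generator, *args):
    # return the pickled result if it exists; otherwise compute it, cache it, and return it
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    result = generator(*args)
    with open(path, 'wb') as f:
        pickle.dump(result, f)
    return result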
@@ -0,0 +1,62 @@

import math
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.neighbors import KernelDensity
import matplotlib.pyplot as plt

from data import LabelledCollection

scale = 100

import quapy as qp

negatives = np.random.normal(loc=0.2, scale=0.2, size=20000)
negatives = np.asarray([x for x in negatives if 0 <= x <= 1])

positives = np.random.normal(loc=0.75, scale=0.05, size=20000)
positives = np.asarray([x for x in positives if 0 <= x <= 1])

prev = 0.1
test = np.concatenate([
    negatives[:int(len(negatives) * (1 - prev))],
    positives[:int(len(positives) * prev)],
])

nbins = 30

plt.rcParams.update({'font.size': 7})

fig = plt.figure()
positions = np.asarray([2, 1, 0])
colors = ['r', 'g', 'b']

ax = fig.add_subplot(111, projection='3d')
ax.set_box_aspect((3, 1, 0.8))

for post, c, z in zip([test, positives, negatives], colors, positions):
    # histogram of each sample, drawn as a 2D bar chart placed at depth z along the y axis
    hist, bins = np.histogram(post, bins=np.linspace(0, 1, nbins + 1), density=True)
    xs = (bins[:-1] + bins[1:]) / 2  # bin centers

    ax.bar(xs, hist, width=1 / nbins, zs=z, zdir='y', color=c, ec=c, alpha=0.6)

ax.yaxis.set_ticks(positions)
ax.yaxis.set_ticklabels([' ' * 20 + 'Test distribution', ' ' * 20 + 'Positive distribution', ' ' * 20 + 'Negative distribution'])
# ax.xaxis.set_ticks([])
# ax.xaxis.set_ticklabels([], minor=True)
ax.zaxis.set_ticks([])
ax.zaxis.set_ticklabels([], minor=True)

# plt.figure(figsize=(10,6))
# plt.show()
plt.savefig('./histograms3d_CACM2023.pdf')
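Note that `np.histogram(..., density=True)` normalizes heights so the total area is 1, whereas the seaborn scripts above use `stat='probability'` (heights sum to 1). To convert a density into per-bin probabilities, multiply by the bin width; a one-line sketch:

hist_probability = hist * (1 / nbins)  # density x bin width = probability mass per bin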
@@ -0,0 +1,59 @@
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV

import quapy as qp
from data import LabelledCollection
from method.non_aggregative import DMx
from protocol import APP
from quapy.method.aggregative import CC, DMy, ACC
from sklearn.svm import LinearSVC
import numpy as np
from tqdm import tqdm

qp.environ['SAMPLE_SIZE'] = 500


def cls():
    return LogisticRegressionCV(n_jobs=-1, Cs=10)


def gen_methods():
    yield CC(cls()), 'CC$_{10' + '\\%}$'
    yield ACC(cls()), 'ACC'
    yield DMy(cls(), val_split=10, nbins=10, n_jobs=-1), 'HDy'
    yield DMx(nbins=10, n_jobs=-1), 'HDx'


def gen_data():

    train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test

    method_data = []
    training_prevalence = 0.1
    training_size = 5000
    # since the problem is binary, it suffices to specify the negative prevalence (the positive one is constrained to be its complement)
    train_sample = train.sampling(training_size, 1 - training_prevalence, random_state=0)

    for model, method_name in tqdm(gen_methods(), total=4):
        with qp.util.temp_seed(1):
            if method_name == 'HDx':
                # HDx matches histograms of raw features, so it needs a dense low-dimensional
                # representation; reduce the sparse tf-idf matrix with a truncated SVD first
                X, y = train_sample.Xy
                svd = TruncatedSVD(n_components=5, random_state=0)
                Xred = svd.fit_transform(X)
                train_sample_dense = LabelledCollection(Xred, y)

                X, y = test.Xy
                test_dense = LabelledCollection(svd.transform(X), y)

                model.fit(train_sample_dense)
                true_prev, estim_prev = qp.evaluation.prediction(model, APP(test_dense, repeats=100, random_state=0))
            else:
                model.fit(train_sample)
                true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0))
        method_data.append((method_name, true_prev, estim_prev, train_sample.prevalence()))

    return zip(*method_data)


method_names, true_prevs, estim_prevs, tr_prevs = gen_data()

qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, savepath='./plots_cacm/bin_diag_4methods.pdf')
qp.plot.error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=10, savepath='./plots_cacm/err_drift_4methods.pdf', title='', show_density=False, show_std=True)
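HDy and HDx are distribution-matching quantifiers: they search for the mixture of class-conditional histograms (of classifier posteriors for HDy, of features for HDx) that is closest to the test histogram under the Hellinger distance. The distance itself is standard; a minimal sketch (not QuaPy's implementation):

import numpy as np

def hellinger(p, q):
    # Hellinger distance between two discrete distributions given as normalized histograms
    return np.sqrt(0.5 * np.sum((np.sqrt(p) - np.sqrt(q)) ** 2))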
@@ -0,0 +1,40 @@
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV

import quapy as qp
from protocol import APP
from quapy.method.aggregative import CC
from sklearn.svm import LinearSVC
import numpy as np
from tqdm import tqdm

qp.environ['SAMPLE_SIZE'] = 500


def gen_data():

    train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test

    method_data = []
    for training_prevalence in tqdm(np.linspace(0.1, 0.9, 9), total=9):
        training_size = 5000
        # since the problem is binary, it suffices to specify the negative prevalence (the positive one is constrained to be its complement)
        train_sample = train.sampling(training_size, 1 - training_prevalence)

        # cls = GridSearchCV(LinearSVC(), param_grid={'C': np.logspace(-2,2,5), 'class_weight':[None, 'balanced']}, n_jobs=-1)
        # cls = GridSearchCV(LogisticRegression(), param_grid={'C': np.logspace(-2, 2, 5), 'class_weight': [None, 'balanced']}, n_jobs=-1)
        # cls.fit(*train_sample.Xy)

        model = CC(LogisticRegressionCV(n_jobs=-1, Cs=10))

        model.fit(train_sample)
        true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0))
        method_name = 'CC$_{' + f'{int(100 * training_prevalence)}' + '\\%}$'
        method_data.append((method_name, true_prev, estim_prev, train_sample.prevalence()))

    return zip(*method_data)


method_names, true_prevs, estim_prevs, tr_prevs = gen_data()

qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, savepath='./plots_cacm/bin_diag_cc.pdf')
# qp.plot.error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=10, savepath='./plots_cacm/err_drift_cc.pdf', title='', show_density=False)
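CC (classify and count), trained above at nine different training prevalences, is the simplest quantifier: classify every test item and report the fraction assigned to each class, which is why its estimates drift toward the training prevalence under shift. A minimal sketch of the idea (not QuaPy's implementation; assumes integer class labels):

import numpy as np

def classify_and_count(classifier, X):
    # estimated class prevalences = fractions of items assigned to each class
    predictions = classifier.predict(X)
    return np.bincount(predictions, minlength=2) / len(predictions)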
@@ -72,7 +72,7 @@ def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=No
         train_prev = train_prev[pos_class]
         ax.scatter(train_prev, train_prev, c='c', label='tr-prev', linewidth=2, edgecolor='k', s=100, zorder=3)
 
-    ax.set(xlabel='true prevalence', ylabel='estimated prevalence', title=title)
+    ax.set(xlabel='true frequency', ylabel='estimated frequency', title=title)
     ax.set_ylim(0, 1)
     ax.set_xlim(0, 1)
@@ -219,7 +219,7 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
                    title=f'Quantification error as a function of label shift',
                    vlines=None,
                    method_order=None,
-                   fontsize=12,
+                   fontsize=18,
                    savepath=None):
     """
     Plots the error (along the x-axis, as measured in terms of `error_name`) as a function of the train-test shift
@@ -294,8 +294,8 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
         ys = np.asarray(ys)
         ystds = np.asarray(ystds)
 
-        if ys[-1]<ys[-2]:
-            ys[-1] = ys[-2]+(abs(ys[-2]-ys[-3]))/2
+        # if ys[-1]<ys[-2]:
+        #     ys[-1] = ys[-2]+(abs(ys[-2]-ys[-3]))/2
 
         min_x_method, max_x_method, min_y_method, max_y_method = xs.min(), xs.max(), ys.min(), ys.max()
         min_x = min_x_method if min_x is None or min_x_method < min_x else min_x
@@ -308,7 +308,7 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
         ax.errorbar(xs, ys, fmt='-', marker='o', label=method, markersize=6, linewidth=2, zorder=2)
 
         if show_std:
-            ax.fill_between(xs, ys-ystds/3, ys+ystds/3, alpha=0.25)
+            ax.fill_between(xs, ys-ystds, ys+ystds, alpha=0.25)
 
         if show_density:
             ax2 = ax.twinx()
@@ -335,7 +335,7 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
 
 
     if show_legend:
-        ax.legend(loc='center right', bbox_to_anchor=(1.2, 0.5))
+        ax.legend(loc='center right', bbox_to_anchor=(1.31, 0.5))
         # fig.legend(loc='lower center',
         #            bbox_to_anchor=(1, 0.5),
         #            ncol=(len(method_names)+1)//2)