diff --git a/CACM2023_plots/plots_CACM2023_3histograms.py b/CACM2023_plots/plots_CACM2023_3histograms.py
new file mode 100644
index 0000000..118f43d
--- /dev/null
+++ b/CACM2023_plots/plots_CACM2023_3histograms.py
@@ -0,0 +1,73 @@
+import itertools
+
+import seaborn as sns
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+
+palette = itertools.cycle(sns.color_palette())
+
+def setframe():
+    fig.spines['top'].set_visible(False)
+    fig.spines['left'].set_visible(False)
+    fig.get_yaxis().set_ticks([])
+    fig.spines['right'].set_visible(False)
+    # fig.axis('off')
+
+nbins = 50
+figsize = (5, 2)
+ymax = 0.2
+
+negatives = np.random.normal(loc=0.3, scale=0.2, size=20000)
+negatives = np.asarray([x for x in negatives if 0 <= x <= 1])
+
+plt.figure(figsize=figsize)
+plt.xlim(0, 1)
+plt.ylim(0, ymax)
+fig = sns.histplot(data=negatives, binrange=(0, 1), bins=nbins, stat='probability', color=next(palette))
+plt.title('Negative distribution')
+fig.set(yticklabels=[])
+fig.set(ylabel=None)
+setframe()
+# fig.get_figure().savefig('plots_cacm/negatives.pdf')
+# plt.clf()
+
+# -------------------------------------------------------------
+
+positives1 = np.random.normal(loc=0.75, scale=0.06, size=20000)
+positives2 = np.random.normal(loc=0.65, scale=0.1, size=1)
+positives = np.concatenate([positives1, positives2])
+np.random.shuffle(positives)
+positives = np.asarray([x for x in positives if 0 <= x <= 1])
+
+# plt.figure(figsize=figsize)
+plt.xlim(0, 1)
+plt.ylim(0, ymax)
+fig = sns.histplot(data=positives, binrange=(0, 1), bins=nbins, stat='probability', color=next(palette))
+plt.title('')
+fig.set(yticklabels=[])
+fig.set(ylabel=None)
+setframe()
+fig.get_figure().savefig('plots_cacm/training.pdf')
+
+# -------------------------------------------------------------
+
+prev = 0.2
+test = np.concatenate([
+    negatives[:int(len(negatives)*(1-prev))],
+    positives[:int(len(positives)*(prev))],
+])
+
+
+plt.figure(figsize=figsize)
+plt.xlim(0, 1)
+plt.ylim(0, ymax)
+fig = sns.histplot(data=test, binrange=(0, 1), bins=nbins, stat='probability', color=next(palette))
+plt.title('')
+fig.set(yticklabels=[])
+fig.set(ylabel=None)
+setframe()
+fig.get_figure().savefig('plots_cacm/test.pdf')
+
+
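# --- Editor's sketch (not part of the committed diff) --------------------------
# The three plotting stanzas above differ only in the data, the title, and the
# output path, and setframe() reads the global `fig` (which actually holds the
# Axes returned by sns.histplot). A hedged refactoring sketch; the helper names
# `truncated_normal` and `plot_score_hist` are invented for illustration:
import itertools

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def truncated_normal(loc, scale, size, low=0.0, high=1.0, seed=None):
    # draw from N(loc, scale) and keep only the draws inside [low, high],
    # i.e. the same rejection filter the script applies via list comprehension
    rng = np.random.default_rng(seed)
    xs = rng.normal(loc=loc, scale=scale, size=size)
    return xs[(xs >= low) & (xs <= high)]

def plot_score_hist(scores, color, title='', nbins=50, ymax=0.2,
                    figsize=(5, 2), save_path=None):
    plt.figure(figsize=figsize)
    plt.xlim(0, 1)
    plt.ylim(0, ymax)
    ax = sns.histplot(data=scores, binrange=(0, 1), bins=nbins,
                      stat='probability', color=color)
    ax.set(title=title, ylabel=None)
    ax.get_yaxis().set_ticks([])  # removing the ticks also removes the labels
    for side in ('top', 'left', 'right'):
        ax.spines[side].set_visible(False)
    if save_path is not None:
        ax.get_figure().savefig(save_path)

# usage mirroring the first stanza above:
#   palette = itertools.cycle(sns.color_palette())
#   negatives = truncated_normal(0.3, 0.2, 20000, seed=0)
#   plot_score_hist(negatives, next(palette), title='Negative distribution')
# -------------------------------------------------------------------------------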
diff --git a/CACM2023_plots/plots_CACM2023_errdrift_deprecated.py b/CACM2023_plots/plots_CACM2023_errdrift_deprecated.py
new file mode 100644
index 0000000..3d3bf8a
--- /dev/null
+++ b/CACM2023_plots/plots_CACM2023_errdrift_deprecated.py
@@ -0,0 +1,86 @@
+from copy import deepcopy
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+import quapy as qp
+from method.non_aggregative import DMx
+from protocol import APP
+from quapy.method.aggregative import CC, ACC, DMy
+from sklearn.svm import LinearSVC
+
+qp.environ['SAMPLE_SIZE'] = 100
+DATASETS = qp.datasets.UCI_DATASETS[10:]
+
+def fit_eval_task(args):
+    model_name, model, train, test = args
+    with qp.util.temp_seed(0):
+        model = deepcopy(model)
+        model.fit(train)
+        true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0))
+    return model_name, true_prev, estim_prev
+
+
+def gen_data():
+
+    def base_classifier():
+        return LogisticRegression()
+        # return LinearSVC(class_weight='balanced')
+
+
+    def models():
+        yield 'CC', CC(base_classifier())
+        yield 'ACC', ACC(base_classifier())
+        yield 'HDy', DMy(base_classifier(), val_split=10, nbins=10, n_jobs=-1)
+        yield 'HDx', DMx(nbins=10, n_jobs=-1)
+
+    # train, test = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=10).train_test
+    method_names, true_prevs, estim_prevs, tr_prevs = [], [], [], []
+
+    for dataset_name in DATASETS:
+        train, test = qp.datasets.fetch_UCIDataset(dataset_name).train_test
+        print(dataset_name, train.X.shape)
+
+        outs = qp.util.parallel(
+            fit_eval_task,
+            ((method_name, model, train, test) for method_name, model in models()),
+            seed=0,
+            n_jobs=-1
+        )
+
+        for method_name, true_prev, estim_prev in outs:
+            method_names.append(method_name)
+            true_prevs.append(true_prev)
+            estim_prevs.append(estim_prev)
+            tr_prevs.append(train.prevalence())
+
+    return method_names, true_prevs, estim_prevs, tr_prevs
+
+method_names, true_prevs, estim_prevs, tr_prevs = qp.util.pickled_resource('../quick_experiment/pickled_plot_data.pkl', gen_data)
+
+def remove_dataset(dataset_order, num_methods=4):
+    sel_names, sel_true, sel_estim, sel_tr = [], [], [], []
+    for i, (name, true, estim, tr) in enumerate(zip(method_names, true_prevs, estim_prevs, tr_prevs)):
+        dataset_pos = i//num_methods
+        if dataset_pos not in dataset_order:
+            sel_names.append(name)
+            sel_true.append(true)
+            sel_estim.append(estim)
+            sel_tr.append(tr)
+    return np.asarray(sel_names), np.asarray(sel_true), np.asarray(sel_estim), np.asarray(sel_tr)
+
+print(DATASETS)
+selected = 10
+for i in [selected]:
+    print(i, DATASETS[i])
+    all_ = set(range(len(DATASETS)))
+    remove_index = sorted(all_ - {i})
+    sel_names, sel_true, sel_estim, sel_tr = remove_dataset(dataset_order=remove_index, num_methods=4)
+
+    p = sel_tr[0][1]
+    sel_names = ['CC$_{'+str(p)+'}$' if x == 'CC' else x for x in sel_names]
+
+    # qp.plot.binary_diagonal(sel_names, sel_true, sel_estim, train_prev=sel_tr[0], show_std=False, savepath=f'./plots/bin_diag_{i}.png')
+    qp.plot.error_by_drift(sel_names, sel_true, sel_estim, sel_tr, n_bins=10, savepath=f'./plots/err_drift_{i}.png', show_std=True, show_density=False, title="")
+    # qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, savepath='./plots/bin_bias.png')
+    # qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, nbins=3, savepath='./plots/bin_bias_bin.png')
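# --- Editor's sketch (not part of the committed diff) --------------------------
# remove_dataset() above selects by double negation: it receives the indices of
# every dataset *except* the one of interest and keeps the entries whose dataset
# position is not listed. Results are stored flat, num_methods entries per
# dataset, so the selection can also be written directly; `keep_dataset` is an
# invented name, shown only to document the indexing convention:
import numpy as np

def keep_dataset(dataset_index, names, trues, estims, trs, num_methods=4):
    # entry i belongs to dataset i // num_methods
    mask = [i // num_methods == dataset_index for i in range(len(names))]
    def pick(seq):
        return np.asarray([x for x, keep in zip(seq, mask) if keep])
    return pick(names), pick(trues), pick(estims), pick(trs)

# keep_dataset(10, method_names, true_prevs, estim_prevs, tr_prevs) should then
# be equivalent to remove_dataset(dataset_order=sorted(all_ - {10})).
# -------------------------------------------------------------------------------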
diff --git a/CACM2023_plots/plots_CACM2023_histogram3D.py b/CACM2023_plots/plots_CACM2023_histogram3D.py
new file mode 100644
index 0000000..08dcdba
--- /dev/null
+++ b/CACM2023_plots/plots_CACM2023_histogram3D.py
@@ -0,0 +1,61 @@
+
+import math
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split, cross_val_predict
+from sklearn.neighbors import KernelDensity
+import matplotlib.pyplot as plt
+
+from data import LabelledCollection
+
+scale = 100
+
+
+import quapy as qp
+
+negatives = np.random.normal(loc=0.2, scale=0.2, size=20000)
+negatives = np.asarray([x for x in negatives if 0 <= x <= 1])
+
+positives = np.random.normal(loc=0.75, scale=0.05, size=20000)
+positives = np.asarray([x for x in positives if 0 <= x <= 1])
+
+prev = 0.1
+test = np.concatenate([
+    negatives[:int(len(negatives)*(1-prev))],
+    positives[:int(len(positives)*(prev))],
+])
+
+
+nbins = 30
+
+plt.rcParams.update({'font.size': 7})
+
+fig = plt.figure()
+positions = np.asarray([2, 1, 0])
+colors = ['r', 'g', 'b']
+
+
+ax = fig.add_subplot(111, projection='3d')
+ax.set_box_aspect((3, 1, 0.8))
+
+for post, c, z in zip([test, positives, negatives], colors, positions):
+
+    hist, bins = np.histogram(post, bins=np.linspace(0, 1, nbins+1), density=True)
+    xs = (bins[:-1] + bins[1:])/2
+
+    ax.bar(xs, hist, width=1 / nbins, zs=z, zdir='y', color=c, ec=c, alpha=0.6)
+
+
+ax.yaxis.set_ticks(positions)
+ax.yaxis.set_ticklabels([' '*20+'Test distribution', ' '*20+'Positive distribution', ' '*20+'Negative distribution'])
+# ax.xaxis.set_ticks([])
+# ax.xaxis.set_ticklabels([], minor=True)
+ax.zaxis.set_ticks([])
+ax.zaxis.set_ticklabels([], minor=True)
+
+
+# plt.figure(figsize=(10,6))
+# plt.show()
+plt.savefig('./histograms3d_CACM2023.pdf')
+
+
diff --git a/CACM2023_plots/plotting_diagonal_4methods.py b/CACM2023_plots/plotting_diagonal_4methods.py
new file mode 100644
index 0000000..488e700
--- /dev/null
+++ b/CACM2023_plots/plotting_diagonal_4methods.py
@@ -0,0 +1,59 @@
+from sklearn.decomposition import TruncatedSVD
+from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
+from sklearn.model_selection import GridSearchCV
+
+import quapy as qp
+from data import LabelledCollection
+from method.non_aggregative import DMx
+from protocol import APP
+from quapy.method.aggregative import CC, DMy, ACC
+from sklearn.svm import LinearSVC
+import numpy as np
+from tqdm import tqdm
+
+qp.environ['SAMPLE_SIZE'] = 500
+
+def cls():
+    return LogisticRegressionCV(n_jobs=-1, Cs=10)
+
+def gen_methods():
+    yield CC(cls()), 'CC$_{10' + '\\%}$'
+    yield ACC(cls()), 'ACC'
+    yield DMy(cls(), val_split=10, nbins=10, n_jobs=-1), 'HDy'
+    yield DMx(nbins=10, n_jobs=-1), 'HDx'
+
+def gen_data():
+
+    train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
+
+    method_data = []
+    training_prevalence = 0.1
+    training_size = 5000
+    # since the problem is binary, it suffices to specify the negative prevalence; the positive one is then constrained
+    train_sample = train.sampling(training_size, 1-training_prevalence, random_state=0)
+
+    for model, method_name in tqdm(gen_methods(), total=4):
+        with qp.util.temp_seed(1):
+            if method_name == 'HDx':
+                X, y = train_sample.Xy
+                svd = TruncatedSVD(n_components=5, random_state=0)
+                Xred = svd.fit_transform(X)
+                train_sample_dense = LabelledCollection(Xred, y)
+
+                X, y = test.Xy
+                test_dense = LabelledCollection(svd.transform(X), y)
+
+                model.fit(train_sample_dense)
+                true_prev, estim_prev = qp.evaluation.prediction(model, APP(test_dense, repeats=100, random_state=0))
+            else:
+                model.fit(train_sample)
+                true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0))
+        method_data.append((method_name, true_prev, estim_prev, train_sample.prevalence()))
+
+    return zip(*method_data)
+
+
+method_names, true_prevs, estim_prevs, tr_prevs = gen_data()
+
+qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, savepath='./plots_cacm/bin_diag_4methods.pdf')
+qp.plot.error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=10, savepath='./plots_cacm/err_drift_4methods.pdf', title='', show_density=False, show_std=True)
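# --- Editor's sketch (not part of the committed diff) --------------------------
# Why HDx gets special treatment above: HDx builds its histograms over the input
# covariates themselves, so the sparse tf-idf matrices are first projected to a
# small dense space. Note that the SVD is fitted on the training sample only and
# merely applied to the test set; fitting it on the test set as well would leak
# test information into the representation. A reusable helper under those
# assumptions (the name `densify_pair` is invented for illustration):
from sklearn.decomposition import TruncatedSVD
from data import LabelledCollection  # same import path the scripts above use

def densify_pair(train_lc, test_lc, n_components=5, random_state=0):
    Xtr, ytr = train_lc.Xy
    Xte, yte = test_lc.Xy
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    Xtr_dense = svd.fit_transform(Xtr)   # fit the projection on training data only
    Xte_dense = svd.transform(Xte)       # reuse the fitted projection on test data
    return LabelledCollection(Xtr_dense, ytr), LabelledCollection(Xte_dense, yte)
# -------------------------------------------------------------------------------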
diff --git a/CACM2023_plots/plotting_diagonal_CCvariants.py b/CACM2023_plots/plotting_diagonal_CCvariants.py
new file mode 100644
index 0000000..e6dd290
--- /dev/null
+++ b/CACM2023_plots/plotting_diagonal_CCvariants.py
@@ -0,0 +1,40 @@
+from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
+from sklearn.model_selection import GridSearchCV
+
+import quapy as qp
+from protocol import APP
+from quapy.method.aggregative import CC
+from sklearn.svm import LinearSVC
+import numpy as np
+from tqdm import tqdm
+
+qp.environ['SAMPLE_SIZE'] = 500
+
+def gen_data():
+
+    train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
+
+    method_data = []
+    for training_prevalence in tqdm(np.linspace(0.1, 0.9, 9), total=9):
+        training_size = 5000
+        # since the problem is binary, it suffices to specify the negative prevalence; the positive one is then constrained
+        train_sample = train.sampling(training_size, 1-training_prevalence)
+
+        # cls = GridSearchCV(LinearSVC(), param_grid={'C': np.logspace(-2,2,5), 'class_weight': [None, 'balanced']}, n_jobs=-1)
+        # cls = GridSearchCV(LogisticRegression(), param_grid={'C': np.logspace(-2, 2, 5), 'class_weight': [None, 'balanced']}, n_jobs=-1)
+        # cls.fit(*train_sample.Xy)
+
+        model = CC(LogisticRegressionCV(n_jobs=-1, Cs=10))
+
+        model.fit(train_sample)
+        true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0))
+        method_name = 'CC$_{'+f'{int(100*training_prevalence)}'+'\\%}$'
+        method_data.append((method_name, true_prev, estim_prev, train_sample.prevalence()))
+
+    return zip(*method_data)
+
+
+method_names, true_prevs, estim_prevs, tr_prevs = gen_data()
+
+qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, savepath='./plots_cacm/bin_diag_cc.pdf')
+# qp.plot.error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=10, savepath='./plots_cacm/err_drift_cc.pdf', title='', show_density=False)
diff --git a/quapy/plot.py b/quapy/plot.py
index 606a07a..7807f26 100644
--- a/quapy/plot.py
+++ b/quapy/plot.py
@@ -72,7 +72,7 @@ def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=No
         train_prev = train_prev[pos_class]
         ax.scatter(train_prev, train_prev, c='c', label='tr-prev', linewidth=2, edgecolor='k', s=100, zorder=3)
 
-    ax.set(xlabel='true prevalence', ylabel='estimated prevalence', title=title)
+    ax.set(xlabel='true frequency', ylabel='estimated frequency', title=title)
 
     ax.set_ylim(0, 1)
     ax.set_xlim(0, 1)
@@ -219,7 +219,7 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
                    title=f'Quantification error as a function of label shift',
                    vlines=None,
                    method_order=None,
-                   fontsize=12,
+                   fontsize=18,
                    savepath=None):
     """
     Plots the error (along the x-axis, as measured in terms of `error_name`) as a function of the train-test shift
@@ -294,8 +294,8 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
         ys = np.asarray(ys)
         ystds = np.asarray(ystds)
 
-        if ys[-1]