Compare commits
2 Commits
Author | SHA1 | Date |
---|---|---|
Alejandro Moreo Fernandez | bb0950fad5 | |
Alejandro Moreo Fernandez | 2e992a0b9a |
|
@ -0,0 +1,73 @@
|
|||
import itertools
|
||||
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
|
||||
|
||||
palette = itertools.cycle(sns.color_palette())
|
||||
|
||||
def setframe():
|
||||
fig.spines['top'].set_visible(False)
|
||||
fig.spines['left'].set_visible(False)
|
||||
fig.get_yaxis().set_ticks([])
|
||||
fig.spines['right'].set_visible(False)
|
||||
# fig.axis('off')
|
||||
|
||||
nbins = 50
|
||||
figsize = (5, 2)
|
||||
ymax = 0.2
|
||||
|
||||
negatives = np.random.normal(loc = 0.3, scale=0.2, size=20000)
|
||||
negatives = np.asarray([x for x in negatives if 0 <= x <= 1])
|
||||
|
||||
plt.figure(figsize=figsize)
|
||||
plt.xlim(0, 1)
|
||||
plt.ylim(0, ymax)
|
||||
fig = sns.histplot(data=negatives, binrange=(0,1), bins=nbins, stat='probability', color=next(palette))
|
||||
plt.title('Negative distribution')
|
||||
fig.set(yticklabels=[])
|
||||
fig.set(ylabel=None)
|
||||
setframe()
|
||||
# fig.get_figure().savefig('plots_cacm/negatives.pdf')
|
||||
# plt.clf()
|
||||
|
||||
# -------------------------------------------------------------
|
||||
|
||||
positives1 = np.random.normal(loc = 0.75, scale=0.06, size=20000)
|
||||
positives2 = np.random.normal(loc = 0.65, scale=0.1, size=1)
|
||||
positives = np.concatenate([positives1, positives2])
|
||||
np.random.shuffle(positives)
|
||||
positives = np.asarray([x for x in positives if 0 <= x <= 1])
|
||||
|
||||
# plt.figure(figsize=figsize)
|
||||
plt.xlim(0, 1)
|
||||
plt.ylim(0, ymax)
|
||||
fig = sns.histplot(data=positives, binrange=(0,1), bins=nbins, stat='probability', color=next(palette))
|
||||
plt.title('')
|
||||
fig.set(yticklabels=[])
|
||||
fig.set(ylabel=None)
|
||||
setframe()
|
||||
fig.get_figure().savefig('plots_cacm/training.pdf')
|
||||
|
||||
# -------------------------------------------------------------
|
||||
|
||||
prev = 0.2
|
||||
test = np.concatenate([
|
||||
negatives[:int(len(negatives)*(1-prev))],
|
||||
positives[:int(len(positives)*(prev))],
|
||||
])
|
||||
|
||||
|
||||
plt.figure(figsize=figsize)
|
||||
plt.xlim(0, 1)
|
||||
plt.ylim(0, ymax)
|
||||
fig = sns.histplot(data=test, binrange=(0,1), bins=nbins, stat='probability', color=next(palette))
|
||||
plt.title('')
|
||||
fig.set(yticklabels=[])
|
||||
fig.set(ylabel=None)
|
||||
setframe()
|
||||
fig.get_figure().savefig('plots_cacm/test.pdf')
|
||||
|
||||
|
|
@ -0,0 +1,86 @@
|
|||
from copy import deepcopy
|
||||
|
||||
import numpy as np
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
|
||||
import quapy as qp
|
||||
from method.non_aggregative import DMx
|
||||
from protocol import APP
|
||||
from quapy.method.aggregative import CC, ACC, DMy
|
||||
from sklearn.svm import LinearSVC
|
||||
|
||||
qp.environ['SAMPLE_SIZE'] = 100
|
||||
DATASETS = qp.datasets.UCI_DATASETS[10:]
|
||||
|
||||
def fit_eval_task(args):
|
||||
model_name, model, train, test = args
|
||||
with qp.util.temp_seed(0):
|
||||
model = deepcopy(model)
|
||||
model.fit(train)
|
||||
true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0))
|
||||
return model_name, true_prev, estim_prev
|
||||
|
||||
|
||||
def gen_data():
|
||||
|
||||
def base_classifier():
|
||||
return LogisticRegression()
|
||||
#return LinearSVC(class_weight='balanced')
|
||||
|
||||
|
||||
def models():
|
||||
yield 'CC', CC(base_classifier())
|
||||
yield 'ACC', ACC(base_classifier())
|
||||
yield 'HDy', DMy(base_classifier(), val_split=10, nbins=10, n_jobs=-1)
|
||||
yield 'HDx', DMx(nbins=10, n_jobs=-1)
|
||||
|
||||
# train, test = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=10).train_test
|
||||
method_names, true_prevs, estim_prevs, tr_prevs = [], [], [], []
|
||||
|
||||
for dataset_name in DATASETS:
|
||||
train, test = qp.datasets.fetch_UCIDataset(dataset_name).train_test
|
||||
print(dataset_name, train.X.shape)
|
||||
|
||||
outs = qp.util.parallel(
|
||||
fit_eval_task,
|
||||
((method_name, model, train, test) for method_name, model in models()),
|
||||
seed=0,
|
||||
n_jobs=-1
|
||||
)
|
||||
|
||||
for method_name, true_prev, estim_prev in outs:
|
||||
method_names.append(method_name)
|
||||
true_prevs.append(true_prev)
|
||||
estim_prevs.append(estim_prev)
|
||||
tr_prevs.append(train.prevalence())
|
||||
|
||||
return method_names, true_prevs, estim_prevs, tr_prevs
|
||||
|
||||
method_names, true_prevs, estim_prevs, tr_prevs = qp.util.pickled_resource('../quick_experiment/pickled_plot_data.pkl', gen_data)
|
||||
|
||||
def remove_dataset(dataset_order, num_methods=4):
|
||||
sel_names, sel_true, sel_estim, sel_tr = [],[],[],[]
|
||||
for i, (name, true, estim, tr) in enumerate(zip(method_names, true_prevs, estim_prevs, tr_prevs)):
|
||||
dataset_pos = i//num_methods
|
||||
if dataset_pos not in dataset_order:
|
||||
sel_names.append(name)
|
||||
sel_true.append(true)
|
||||
sel_estim.append(estim)
|
||||
sel_tr.append(tr)
|
||||
return np.asarray(sel_names), np.asarray(sel_true), np.asarray(sel_estim), np.asarray(sel_tr)
|
||||
|
||||
print(DATASETS)
|
||||
selected = 10
|
||||
for i in [selected]:
|
||||
print(i, DATASETS[i])
|
||||
all_ = set(range(len(DATASETS)))
|
||||
remove_index = sorted(all_ - {i})
|
||||
sel_names, sel_true, sel_estim, sel_tr = remove_dataset(dataset_order=remove_index, num_methods=4)
|
||||
|
||||
p=sel_tr[0][1]
|
||||
sel_names = ['CC$_{'+str(p)+'}$' if x=='CC' else x for x in sel_names]
|
||||
|
||||
# qp.plot.binary_diagonal(sel_names, sel_true, sel_estim, train_prev=sel_tr[0], show_std=False, savepath=f'./plots/bin_diag_{i}.png')
|
||||
qp.plot.error_by_drift(sel_names, sel_true, sel_estim, sel_tr, n_bins=10, savepath=f'./plots/err_drift_{i}.png', show_std=True, show_density=False, title="")
|
||||
# qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, savepath='./plots/bin_bias.png')
|
||||
# qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, nbins=3, savepath='./plots/bin_bias_bin.png')
|
|
@ -0,0 +1,62 @@
|
|||
|
||||
import math
|
||||
import numpy as np
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.model_selection import train_test_split, cross_val_predict
|
||||
from sklearn.neighbors import KernelDensity
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
from data import LabelledCollection
|
||||
|
||||
scale = 100
|
||||
|
||||
|
||||
import quapy as qp
|
||||
|
||||
negatives = np.random.normal(loc = 0.2, scale=0.2, size=20000)
|
||||
negatives = np.asarray([x for x in negatives if 0 <= x <= 1])
|
||||
|
||||
positives = np.random.normal(loc = 0.75, scale=0.05, size=20000)
|
||||
positives = np.asarray([x for x in positives if 0 <= x <= 1])
|
||||
|
||||
prev = 0.1
|
||||
test = np.concatenate([
|
||||
negatives[:int(len(negatives)*(1-prev))],
|
||||
positives[:int(len(positives)*(prev))],
|
||||
])
|
||||
|
||||
|
||||
nbins = 30
|
||||
|
||||
plt.rcParams.update({'font.size': 7})
|
||||
|
||||
fig = plt.figure()
|
||||
positions = np.asarray([2,1,0])
|
||||
colors = ['r', 'g', 'b']
|
||||
|
||||
|
||||
ax = fig.add_subplot(111, projection='3d')
|
||||
ax.set_box_aspect((3, 1, 0.8))
|
||||
|
||||
for post, c, z in zip([test, positives, negatives], colors, positions):
|
||||
|
||||
hist, bins = np.histogram(post, bins=np.linspace(0,1, nbins+1), density=True)
|
||||
xs = (bins[:-1] + bins[1:])/2
|
||||
|
||||
ax.bar(xs, hist, width=1 / nbins, zs=z, zdir='y', color=c, ec=c, alpha=0.6)
|
||||
|
||||
|
||||
ax.yaxis.set_ticks(positions)
|
||||
ax.yaxis.set_ticklabels([' '*20+'Test distribution', ' '*20+'Positive distribution', ' '*20+'Negative distribution'])
|
||||
# ax.xaxis.set_ticks([])
|
||||
# ax.xaxis.set_ticklabels([], minor=True)
|
||||
ax.zaxis.set_ticks([])
|
||||
ax.zaxis.set_ticklabels([], minor=True)
|
||||
|
||||
|
||||
#plt.figure(figsize=(10,6))
|
||||
#plt.show()
|
||||
plt.savefig('./histograms3d_CACM2023.pdf')
|
||||
|
||||
|
|
@ -0,0 +1,59 @@
|
|||
from sklearn.decomposition import TruncatedSVD
|
||||
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
|
||||
import quapy as qp
|
||||
from data import LabelledCollection
|
||||
from method.non_aggregative import DMx
|
||||
from protocol import APP
|
||||
from quapy.method.aggregative import CC, DMy, ACC
|
||||
from sklearn.svm import LinearSVC
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
|
||||
qp.environ['SAMPLE_SIZE'] = 500
|
||||
|
||||
def cls():
|
||||
return LogisticRegressionCV(n_jobs=-1,Cs=10)
|
||||
|
||||
def gen_methods():
|
||||
yield CC(cls()), 'CC$_{10' + '\%}$'
|
||||
yield ACC(cls()), 'ACC'
|
||||
yield DMy(cls(), val_split=10, nbins=10, n_jobs=-1), 'HDy'
|
||||
yield DMx(nbins=10, n_jobs=-1), 'HDx'
|
||||
|
||||
def gen_data():
|
||||
|
||||
train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
|
||||
|
||||
method_data = []
|
||||
training_prevalence = 0.1
|
||||
training_size = 5000
|
||||
# since the problem is binary, it suffices to specify the negative prevalence, since the positive is constrained
|
||||
train_sample = train.sampling(training_size, 1-training_prevalence, random_state=0)
|
||||
|
||||
for model, method_name in tqdm(gen_methods(), total=4):
|
||||
with qp.util.temp_seed(1):
|
||||
if method_name == 'HDx':
|
||||
X, y = train_sample.Xy
|
||||
svd = TruncatedSVD(n_components=5, random_state=0)
|
||||
Xred = svd.fit_transform(X)
|
||||
train_sample_dense = LabelledCollection(Xred, y)
|
||||
|
||||
X, y = test.Xy
|
||||
test_dense = LabelledCollection(svd.transform(X), y)
|
||||
|
||||
model.fit(train_sample_dense)
|
||||
true_prev, estim_prev = qp.evaluation.prediction(model, APP(test_dense, repeats=100, random_state=0))
|
||||
else:
|
||||
model.fit(train_sample)
|
||||
true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0))
|
||||
method_data.append((method_name, true_prev, estim_prev, train_sample.prevalence()))
|
||||
|
||||
return zip(*method_data)
|
||||
|
||||
|
||||
method_names, true_prevs, estim_prevs, tr_prevs = gen_data()
|
||||
|
||||
qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, savepath='./plots_cacm/bin_diag_4methods.pdf')
|
||||
qp.plot.error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=10, savepath='./plots_cacm/err_drift_4methods.pdf', title='', show_density=False, show_std=True)
|
|
@ -0,0 +1,40 @@
|
|||
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
|
||||
import quapy as qp
|
||||
from protocol import APP
|
||||
from quapy.method.aggregative import CC
|
||||
from sklearn.svm import LinearSVC
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
|
||||
qp.environ['SAMPLE_SIZE'] = 500
|
||||
|
||||
def gen_data():
|
||||
|
||||
train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
|
||||
|
||||
method_data = []
|
||||
for training_prevalence in tqdm(np.linspace(0.1, 0.9, 9), total=9):
|
||||
training_size = 5000
|
||||
# since the problem is binary, it suffices to specify the negative prevalence, since the positive is constrained
|
||||
train_sample = train.sampling(training_size, 1-training_prevalence)
|
||||
|
||||
# cls = GridSearchCV(LinearSVC(), param_grid={'C': np.logspace(-2,2,5), 'class_weight':[None, 'balanced']}, n_jobs=-1)
|
||||
# cls = GridSearchCV(LogisticRegression(), param_grid={'C': np.logspace(-2, 2, 5), 'class_weight': [None, 'balanced']}, n_jobs=-1)
|
||||
# cls.fit(*train_sample.Xy)
|
||||
|
||||
model = CC(LogisticRegressionCV(n_jobs=-1,Cs=10))
|
||||
|
||||
model.fit(train_sample)
|
||||
true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0))
|
||||
method_name = 'CC$_{'+f'{int(100*training_prevalence)}' + '\%}$'
|
||||
method_data.append((method_name, true_prev, estim_prev, train_sample.prevalence()))
|
||||
|
||||
return zip(*method_data)
|
||||
|
||||
|
||||
method_names, true_prevs, estim_prevs, tr_prevs = gen_data()
|
||||
|
||||
qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, savepath='./plots_cacm/bin_diag_cc.pdf')
|
||||
# qp.plot.error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=10, savepath='./plots_cacm/err_drift_cc.pdf', title='', show_density=False)
|
|
@ -72,7 +72,7 @@ def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=No
|
|||
train_prev = train_prev[pos_class]
|
||||
ax.scatter(train_prev, train_prev, c='c', label='tr-prev', linewidth=2, edgecolor='k', s=100, zorder=3)
|
||||
|
||||
ax.set(xlabel='true prevalence', ylabel='estimated prevalence', title=title)
|
||||
ax.set(xlabel='true frequency', ylabel='estimated frequency', title=title)
|
||||
ax.set_ylim(0, 1)
|
||||
ax.set_xlim(0, 1)
|
||||
|
||||
|
@ -216,9 +216,10 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
|||
show_density=True,
|
||||
show_legend=True,
|
||||
logscale=False,
|
||||
title=f'Quantification error as a function of distribution shift',
|
||||
title=f'Quantification error as a function of label shift',
|
||||
vlines=None,
|
||||
method_order=None,
|
||||
fontsize=18,
|
||||
savepath=None):
|
||||
"""
|
||||
Plots the error (along the x-axis, as measured in terms of `error_name`) as a function of the train-test shift
|
||||
|
@ -247,6 +248,8 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
|||
:param savepath: path where to save the plot. If not indicated (as default), the plot is shown.
|
||||
"""
|
||||
|
||||
plt.rcParams['font.size'] = fontsize
|
||||
|
||||
fig, ax = plt.subplots()
|
||||
ax.grid()
|
||||
|
||||
|
@ -261,7 +264,7 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
|||
if method_order is None:
|
||||
method_order = method_names
|
||||
|
||||
_set_colors(ax, n_methods=len(method_order))
|
||||
# _set_colors(ax, n_methods=len(method_order))
|
||||
|
||||
bins = np.linspace(0, 1, n_bins+1)
|
||||
binwidth = 1 / n_bins
|
||||
|
@ -291,6 +294,9 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
|||
ys = np.asarray(ys)
|
||||
ystds = np.asarray(ystds)
|
||||
|
||||
# if ys[-1]<ys[-2]:
|
||||
# ys[-1] = ys[-2]+(abs(ys[-2]-ys[-3]))/2
|
||||
|
||||
min_x_method, max_x_method, min_y_method, max_y_method = xs.min(), xs.max(), ys.min(), ys.max()
|
||||
min_x = min_x_method if min_x is None or min_x_method < min_x else min_x
|
||||
max_x = max_x_method if max_x is None or max_x_method > max_x else max_x
|
||||
|
@ -313,8 +319,8 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
|||
ax2.spines['right'].set_color('g')
|
||||
ax2.tick_params(axis='y', colors='g')
|
||||
|
||||
ax.set(xlabel=f'Distribution shift between training set and test sample',
|
||||
ylabel=f'{error_name.upper()} (true distribution, predicted distribution)',
|
||||
ax.set(xlabel=f'Amount of label shift',
|
||||
ylabel=f'Absolute error',
|
||||
title=title)
|
||||
box = ax.get_position()
|
||||
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
|
||||
|
@ -329,9 +335,10 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
|||
|
||||
|
||||
if show_legend:
|
||||
fig.legend(loc='lower center',
|
||||
bbox_to_anchor=(1, 0.5),
|
||||
ncol=(len(method_names)+1)//2)
|
||||
ax.legend(loc='center right', bbox_to_anchor=(1.31, 0.5))
|
||||
# fig.legend(loc='lower center',
|
||||
# bbox_to_anchor=(1, 0.5),
|
||||
# ncol=(len(method_names)+1)//2)
|
||||
|
||||
_save_or_show(savepath)
|
||||
|
||||
|
|
Loading…
Reference in New Issue