Compare commits
5 Commits
949bd3cca7
...
61399d66f6
Author | SHA1 | Date |
---|---|---|
Alejandro Moreo Fernandez | 61399d66f6 | |
Alejandro Moreo Fernandez | d060ffa591 | |
Alejandro Moreo Fernandez | 50d0ed2e84 | |
Alejandro Moreo Fernandez | 0f4008e18d | |
Alejandro Moreo Fernandez | 3243fd90f8 |
|
@ -8,7 +8,7 @@ from distribution_matching.method_dirichlety import DIRy
|
||||||
from sklearn.linear_model import LogisticRegression
|
from sklearn.linear_model import LogisticRegression
|
||||||
from method_kdey_closed_efficient import KDEyclosed_efficient
|
from method_kdey_closed_efficient import KDEyclosed_efficient
|
||||||
|
|
||||||
METHODS = ['ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DMhd3', 'KDEy-closed++', 'EMQ', 'KDEy-ML'] #, 'KDEy-DMhd2'] #, 'KDEy-DMhd2', 'DM-HD'] 'KDEy-DMjs', 'KDEy-DM', 'KDEy-ML+', 'KDEy-DMhd3+',
|
METHODS = ['ACC', 'PACC', 'HDy-OvA', 'DM-T', 'DM-HD', 'KDEy-DMhd3', 'DM-CS', 'KDEy-closed++', 'DIR', 'EMQ', 'KDEy-ML'] #['ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DMhd3', 'KDEy-closed++', 'EMQ', 'KDEy-ML'] #, 'KDEy-DMhd2'] #, 'KDEy-DMhd2', 'DM-HD'] 'KDEy-DMjs', 'KDEy-DM', 'KDEy-ML+', 'KDEy-DMhd3+', 'EMQ-C',
|
||||||
BIN_METHODS = [x.replace('-OvA', '') for x in METHODS]
|
BIN_METHODS = [x.replace('-OvA', '') for x in METHODS]
|
||||||
|
|
||||||
|
|
||||||
|
@ -34,7 +34,7 @@ def new_method(method, **lr_kwargs):
|
||||||
param_grid = hyper_LR
|
param_grid = hyper_LR
|
||||||
quantifier = PACC(lr)
|
quantifier = PACC(lr)
|
||||||
elif method == 'KDEy-ML':
|
elif method == 'KDEy-ML':
|
||||||
method_params = {'bandwidth': np.linspace(0.01, 0.3, 30)}
|
method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
|
||||||
param_grid = {**method_params, **hyper_LR}
|
param_grid = {**method_params, **hyper_LR}
|
||||||
quantifier = KDEy(lr, target='max_likelihood', val_split=10)
|
quantifier = KDEy(lr, target='max_likelihood', val_split=10)
|
||||||
elif method == 'KDEy-closed':
|
elif method == 'KDEy-closed':
|
||||||
|
@ -59,6 +59,13 @@ def new_method(method, **lr_kwargs):
|
||||||
elif method == 'EMQ':
|
elif method == 'EMQ':
|
||||||
param_grid = hyper_LR
|
param_grid = hyper_LR
|
||||||
quantifier = EMQ(lr)
|
quantifier = EMQ(lr)
|
||||||
|
elif method == 'EMQ-C':
|
||||||
|
method_params = {'exact_train_prev': [False], 'recalib': ['bcts']}
|
||||||
|
param_grid = {**method_params, **hyper_LR}
|
||||||
|
quantifier = EMQ(lr)
|
||||||
|
elif method == 'HDy':
|
||||||
|
param_grid = hyper_LR
|
||||||
|
quantifier = HDy(lr)
|
||||||
elif method == 'HDy-OvA':
|
elif method == 'HDy-OvA':
|
||||||
param_grid = {'binary_quantifier__' + key: val for key, val in hyper_LR.items()}
|
param_grid = {'binary_quantifier__' + key: val for key, val in hyper_LR.items()}
|
||||||
quantifier = OneVsAllAggregative(HDy(lr))
|
quantifier = OneVsAllAggregative(HDy(lr))
|
||||||
|
@ -70,6 +77,30 @@ def new_method(method, **lr_kwargs):
|
||||||
}
|
}
|
||||||
param_grid = {**method_params, **hyper_LR}
|
param_grid = {**method_params, **hyper_LR}
|
||||||
quantifier = DistributionMatching(lr)
|
quantifier = DistributionMatching(lr)
|
||||||
|
elif method == 'DM-T':
|
||||||
|
method_params = {
|
||||||
|
'nbins': [2,3,4,5,6,7,8,9,10,12,14,16,18,20,22,24,26,28,30,32,64],
|
||||||
|
'val_split': [10],
|
||||||
|
'divergence': ['topsoe']
|
||||||
|
}
|
||||||
|
param_grid = {**method_params, **hyper_LR}
|
||||||
|
quantifier = DistributionMatching(lr)
|
||||||
|
elif method == 'DM-HD':
|
||||||
|
method_params = {
|
||||||
|
'nbins': [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 64],
|
||||||
|
'val_split': [10],
|
||||||
|
'divergence': ['HD']
|
||||||
|
}
|
||||||
|
param_grid = {**method_params, **hyper_LR}
|
||||||
|
quantifier = DistributionMatching(lr)
|
||||||
|
elif method == 'DM-CS':
|
||||||
|
method_params = {
|
||||||
|
'nbins': [2,3,4,5,6,7,8,9,10,12,14,16,18,20,22,24,26,28,30,32,64],
|
||||||
|
'val_split': [10],
|
||||||
|
'divergence': ['CS']
|
||||||
|
}
|
||||||
|
param_grid = {**method_params, **hyper_LR}
|
||||||
|
quantifier = DistributionMatching(lr)
|
||||||
|
|
||||||
# experimental
|
# experimental
|
||||||
elif method in ['KDEy-DMkld']:
|
elif method in ['KDEy-DMkld']:
|
||||||
|
@ -95,7 +126,7 @@ def new_method(method, **lr_kwargs):
|
||||||
# can be stored. This means that the reference distribution is V and not T. Then I have found that an
|
# can be stored. This means that the reference distribution is V and not T. Then I have found that an
|
||||||
# f-divergence is defined as D(p||q) \int_{R^n}q(x)f(p(x)/q(x))dx = E_{x~q}[f(p(x)/q(x))], so if I am sampling
|
# f-divergence is defined as D(p||q) \int_{R^n}q(x)f(p(x)/q(x))dx = E_{x~q}[f(p(x)/q(x))], so if I am sampling
|
||||||
# V then I am computing D(T||V) (and not D(V||T) as I thought).
|
# V then I am computing D(T||V) (and not D(V||T) as I thought).
|
||||||
method_params = {'bandwidth': np.linspace(0.01, 0.3, 30)}
|
method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
|
||||||
param_grid = {**method_params, **hyper_LR}
|
param_grid = {**method_params, **hyper_LR}
|
||||||
quantifier = KDEy(lr, target='min_divergence', divergence='HD', montecarlo_trials=5000, val_split=10)
|
quantifier = KDEy(lr, target='min_divergence', divergence='HD', montecarlo_trials=5000, val_split=10)
|
||||||
elif method == 'DM-HD':
|
elif method == 'DM-HD':
|
||||||
|
|
|
@ -9,18 +9,42 @@ Plots results for MAE, MRAE, and KLD
|
||||||
The rest of hyperparameters were set to their default values
|
The rest of hyperparameters were set to their default values
|
||||||
"""
|
"""
|
||||||
|
|
||||||
df_tweet = pd.read_csv('../results/tweet/sensibility/KDEy-ML.csv', sep='\t')
|
|
||||||
df_lequa = pd.read_csv('../results/lequa/sensibility/KDEy-ML.csv', sep='\t')
|
|
||||||
df = pd.concat([df_tweet, df_lequa])
|
|
||||||
|
|
||||||
for err in ['MAE', 'MRAE', 'KLD']:
|
|
||||||
piv = df.pivot_table(index='Bandwidth', columns='Dataset', values=err)
|
log_mrae = True
|
||||||
|
|
||||||
|
for method, param, xlim, xticks in [
|
||||||
|
('KDEy-ML', 'Bandwidth', (0.01, 0.2), np.linspace(0.01, 0.2, 20)),
|
||||||
|
('DM-HD', 'nbins', (2,32), list(range(2,10)) + list(range(10,34,2)))
|
||||||
|
]:
|
||||||
|
|
||||||
|
for dataset in ['tweet', 'lequa', 'uciml']:
|
||||||
|
|
||||||
|
if dataset == 'tweet':
|
||||||
|
df = pd.read_csv(f'../results/tweet/sensibility/{method}.csv', sep='\t')
|
||||||
|
ylim = (0.03, 0.21)
|
||||||
|
elif dataset == 'lequa':
|
||||||
|
df = pd.read_csv(f'../results/lequa/T1B/sensibility/{method}.csv', sep='\t')
|
||||||
|
ylim = (0.0125, 0.03)
|
||||||
|
elif dataset == 'uciml':
|
||||||
|
ylim = (0, 0.23)
|
||||||
|
df = pd.read_csv(f'../results/ucimulti/sensibility/{method}.csv', sep='\t')
|
||||||
|
|
||||||
|
for err in ['MAE']: #, 'MRAE']:
|
||||||
|
piv = df.pivot_table(index=param, columns='Dataset', values=err)
|
||||||
g = sns.lineplot(data=piv, markers=True, dashes=False)
|
g = sns.lineplot(data=piv, markers=True, dashes=False)
|
||||||
g.set(xlim=(0.01, 0.2))
|
g.set(xlim=xlim)
|
||||||
g.legend(loc="center left", bbox_to_anchor=(1, 0.5))
|
g.legend(loc="center left", bbox_to_anchor=(1, 0.5))
|
||||||
|
|
||||||
|
if log_mrae and err=='MRAE':
|
||||||
|
plt.yscale('log')
|
||||||
|
g.set_ylabel('log('+err+')')
|
||||||
|
else:
|
||||||
g.set_ylabel(err)
|
g.set_ylabel(err)
|
||||||
g.set_xticks(np.linspace(0.01, 0.2, 20))
|
|
||||||
|
g.set_ylim(ylim)
|
||||||
|
g.set_xticks(xticks)
|
||||||
plt.xticks(rotation=90)
|
plt.xticks(rotation=90)
|
||||||
plt.grid()
|
plt.grid()
|
||||||
plt.savefig(f'./sensibility_{err}.pdf', bbox_inches='tight')
|
plt.savefig(f'./sensibility_{method}_{dataset}_{err}.pdf', bbox_inches='tight')
|
||||||
plt.clf()
|
plt.clf()
|
|
@ -1,10 +1,8 @@
|
||||||
import ternary
|
|
||||||
import math
|
import math
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.linear_model import LogisticRegression
|
from sklearn.linear_model import LogisticRegression
|
||||||
from sklearn.model_selection import train_test_split
|
from sklearn.model_selection import train_test_split
|
||||||
from sklearn.neighbors import KernelDensity
|
from sklearn.neighbors import KernelDensity
|
||||||
import plotly.figure_factory as ff
|
|
||||||
|
|
||||||
from data import LabelledCollection
|
from data import LabelledCollection
|
||||||
|
|
||||||
|
@ -15,6 +13,7 @@ scale = 10
|
||||||
# con plotly salen los contornos bien, pero es un poco un jaleo porque utiliza el navegador...
|
# con plotly salen los contornos bien, pero es un poco un jaleo porque utiliza el navegador...
|
||||||
|
|
||||||
def plot_simplex_(ax, density, title='', fontsize=9, points=None):
|
def plot_simplex_(ax, density, title='', fontsize=9, points=None):
|
||||||
|
import ternary
|
||||||
|
|
||||||
tax = ternary.TernaryAxesSubplot(ax=ax, scale=scale)
|
tax = ternary.TernaryAxesSubplot(ax=ax, scale=scale)
|
||||||
tax.heatmapf(density, boundary=True, style="triangular", colorbar=False, cmap='viridis') #cmap='magma')
|
tax.heatmapf(density, boundary=True, style="triangular", colorbar=False, cmap='viridis') #cmap='magma')
|
||||||
|
@ -34,6 +33,7 @@ def plot_simplex_(ax, density, title='', fontsize=9, points=None):
|
||||||
|
|
||||||
|
|
||||||
def plot_simplex(ax, coord, kde_scores, title='', fontsize=11, points=None, savepath=None):
|
def plot_simplex(ax, coord, kde_scores, title='', fontsize=11, points=None, savepath=None):
|
||||||
|
import plotly.figure_factory as ff
|
||||||
|
|
||||||
tax = ff.create_ternary_contour(coord.T, kde_scores, pole_labels=['y=1', 'y=2', 'y=3'],
|
tax = ff.create_ternary_contour(coord.T, kde_scores, pole_labels=['y=1', 'y=2', 'y=3'],
|
||||||
interp_mode='cartesian',
|
interp_mode='cartesian',
|
||||||
|
@ -49,6 +49,8 @@ def plot_simplex(ax, coord, kde_scores, title='', fontsize=11, points=None, save
|
||||||
|
|
||||||
from mpl_toolkits.axes_grid1 import make_axes_locatable
|
from mpl_toolkits.axes_grid1 import make_axes_locatable
|
||||||
def plot_3class_problem(post_c1, post_c2, post_c3, post_test, alpha, bandwidth):
|
def plot_3class_problem(post_c1, post_c2, post_c3, post_test, alpha, bandwidth):
|
||||||
|
import ternary
|
||||||
|
|
||||||
post_c1 = np.flip(post_c1, axis=1)
|
post_c1 = np.flip(post_c1, axis=1)
|
||||||
post_c2 = np.flip(post_c2, axis=1)
|
post_c2 = np.flip(post_c2, axis=1)
|
||||||
post_c3 = np.flip(post_c3, axis=1)
|
post_c3 = np.flip(post_c3, axis=1)
|
||||||
|
|
|
@ -1,56 +0,0 @@
|
||||||
import numpy as np
|
|
||||||
from sklearn.linear_model import LogisticRegression
|
|
||||||
import os
|
|
||||||
import pandas as pd
|
|
||||||
import quapy as qp
|
|
||||||
from method_kdey import KDEy
|
|
||||||
|
|
||||||
|
|
||||||
SEED=1
|
|
||||||
|
|
||||||
def task(bandwidth):
|
|
||||||
print('job-init', dataset, bandwidth)
|
|
||||||
train, val_gen, test_gen = qp.datasets.fetch_lequa2022(dataset)
|
|
||||||
|
|
||||||
with qp.util.temp_seed(SEED):
|
|
||||||
quantifier = KDEy(LogisticRegression(), target='max_likelihood', val_split=10, bandwidth=bandwidth)
|
|
||||||
quantifier.fit(train)
|
|
||||||
report = qp.evaluation.evaluation_report(
|
|
||||||
quantifier, protocol=test_gen, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
|
|
||||||
return report
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
|
|
||||||
qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE['T1B']
|
|
||||||
qp.environ['N_JOBS'] = -1
|
|
||||||
result_dir = f'results_lequa_sensibility'
|
|
||||||
|
|
||||||
os.makedirs(result_dir, exist_ok=True)
|
|
||||||
|
|
||||||
method = 'KDEy-MLE'
|
|
||||||
|
|
||||||
global_result_path = f'{result_dir}/{method}'
|
|
||||||
|
|
||||||
if not os.path.exists(global_result_path+'.csv'):
|
|
||||||
with open(global_result_path+'.csv', 'wt') as csv:
|
|
||||||
csv.write(f'Method\tDataset\tBandwidth\tMAE\tMRAE\tKLD\n')
|
|
||||||
|
|
||||||
dataset = 'T1B'
|
|
||||||
bandwidths = np.linspace(0.01, 0.2, 20)
|
|
||||||
|
|
||||||
reports = qp.util.parallel(task, bandwidths, n_jobs=-1)
|
|
||||||
with open(global_result_path + '.csv', 'at') as csv:
|
|
||||||
for bandwidth, report in zip(bandwidths, reports):
|
|
||||||
means = report.mean()
|
|
||||||
local_result_path = global_result_path + '_' + dataset + f'_{bandwidth:.3f}'
|
|
||||||
report.to_csv(f'{local_result_path}.dataframe')
|
|
||||||
csv.write(f'{method}\tLeQua-T1B\t{bandwidth}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
|
|
||||||
csv.flush()
|
|
||||||
|
|
||||||
df = pd.read_csv(global_result_path + '.csv', sep='\t')
|
|
||||||
|
|
||||||
pd.set_option('display.max_columns', None)
|
|
||||||
pd.set_option('display.max_rows', None)
|
|
||||||
pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"])
|
|
||||||
print(pv)
|
|
|
@ -2,24 +2,24 @@ import pickle
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import os
|
import os
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from distribution_matching.commons import METHODS, new_method, show_results
|
from distribution_matching.commons import METHODS, BIN_METHODS, new_method, show_results
|
||||||
|
|
||||||
import quapy as qp
|
import quapy as qp
|
||||||
from quapy.model_selection import GridSearchQ
|
from quapy.model_selection import GridSearchQ
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE['T1B']
|
for task in ['T1A', 'T1B']:
|
||||||
|
qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE[task]
|
||||||
qp.environ['N_JOBS'] = -1
|
qp.environ['N_JOBS'] = -1
|
||||||
for optim in ['mae', 'mrae']:
|
for optim in ['mae', 'mrae']:
|
||||||
|
|
||||||
result_dir = f'results/lequa/{optim}'
|
result_dir = f'results/lequa/{task}/{optim}'
|
||||||
|
|
||||||
os.makedirs(result_dir, exist_ok=True)
|
os.makedirs(result_dir, exist_ok=True)
|
||||||
|
|
||||||
for method in METHODS:
|
for method in (METHODS if task=='T1B' else BIN_METHODS):
|
||||||
|
|
||||||
print('Init method', method)
|
print('Init method', method)
|
||||||
|
|
||||||
|
@ -32,7 +32,7 @@ if __name__ == '__main__':
|
||||||
with open(result_path+'.csv', 'wt') as csv:
|
with open(result_path+'.csv', 'wt') as csv:
|
||||||
csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
|
csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
|
||||||
|
|
||||||
dataset = 'T1B'
|
dataset = task
|
||||||
train, val_gen, test_gen = qp.datasets.fetch_lequa2022(dataset)
|
train, val_gen, test_gen = qp.datasets.fetch_lequa2022(dataset)
|
||||||
print(f'init {dataset} #instances: {len(train)}')
|
print(f'init {dataset} #instances: {len(train)}')
|
||||||
param_grid, quantifier = new_method(method)
|
param_grid, quantifier = new_method(method)
|
||||||
|
@ -58,7 +58,7 @@ if __name__ == '__main__':
|
||||||
)
|
)
|
||||||
means = report.mean()
|
means = report.mean()
|
||||||
report.to_csv(result_path+'.dataframe')
|
report.to_csv(result_path+'.dataframe')
|
||||||
csv.write(f'{method}\tLeQua-T1B\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
|
csv.write(f'{method}\tLeQua-{task}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
|
||||||
csv.flush()
|
csv.flush()
|
||||||
print(means)
|
print(means)
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,56 @@
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
import os
|
||||||
|
import quapy as qp
|
||||||
|
from distribution_matching.commons import show_results
|
||||||
|
from method_kdey import KDEy
|
||||||
|
from quapy.method.aggregative import DistributionMatching
|
||||||
|
|
||||||
|
|
||||||
|
SEED=1
|
||||||
|
|
||||||
|
def task(val):
|
||||||
|
print('job-init', val)
|
||||||
|
train, val_gen, test_gen = qp.datasets.fetch_lequa2022('T1B')
|
||||||
|
|
||||||
|
with qp.util.temp_seed(SEED):
|
||||||
|
if method=='KDEy-ML':
|
||||||
|
quantifier = KDEy(LogisticRegression(), target='max_likelihood', val_split=10, bandwidth=val)
|
||||||
|
elif method == 'DM-HD':
|
||||||
|
quantifier = DistributionMatching(LogisticRegression(), val_split=10, nbins=val, divergence='HD')
|
||||||
|
|
||||||
|
quantifier.fit(train)
|
||||||
|
report = qp.evaluation.evaluation_report(
|
||||||
|
quantifier, protocol=test_gen, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
|
||||||
|
return report
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE['T1B']
|
||||||
|
qp.environ['N_JOBS'] = -1
|
||||||
|
result_dir = f'results/lequa/T1B/sensibility'
|
||||||
|
|
||||||
|
os.makedirs(result_dir, exist_ok=True)
|
||||||
|
|
||||||
|
for method, param, grid in [
|
||||||
|
('KDEy-ML', 'Bandwidth', np.linspace(0.01, 0.2, 20)),
|
||||||
|
('DM-HD', 'nbins', list(range(2, 10)) + list(range(10, 34, 2)))
|
||||||
|
]:
|
||||||
|
|
||||||
|
global_result_path = f'{result_dir}/{method}'
|
||||||
|
|
||||||
|
if not os.path.exists(global_result_path+'.csv'):
|
||||||
|
with open(global_result_path+'.csv', 'wt') as csv:
|
||||||
|
csv.write(f'Method\tDataset\t{param}\tMAE\tMRAE\tKLD\n')
|
||||||
|
|
||||||
|
reports = qp.util.parallel(task, grid, n_jobs=-1)
|
||||||
|
with open(global_result_path + '.csv', 'at') as csv:
|
||||||
|
for val, report in zip(grid, reports):
|
||||||
|
means = report.mean()
|
||||||
|
local_result_path = global_result_path + '_T1B' + (f'_{val:.3f}' if isinstance(val, float) else f'{val}')
|
||||||
|
report.to_csv(f'{local_result_path}.dataframe')
|
||||||
|
csv.write(f'{method}\tLeQua-T1B\t{val}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
|
||||||
|
csv.flush()
|
||||||
|
|
||||||
|
show_results(global_result_path)
|
|
@ -21,7 +21,7 @@ import dirichlet
|
||||||
|
|
||||||
class DIRy(AggregativeProbabilisticQuantifier):
|
class DIRy(AggregativeProbabilisticQuantifier):
|
||||||
|
|
||||||
MAXITER = 10000
|
MAXITER = 100000
|
||||||
|
|
||||||
def __init__(self, classifier: BaseEstimator, val_split=0.4, n_jobs=None, target='max_likelihood'):
|
def __init__(self, classifier: BaseEstimator, val_split=0.4, n_jobs=None, target='max_likelihood'):
|
||||||
self.classifier = classifier
|
self.classifier = classifier
|
||||||
|
@ -38,7 +38,12 @@ class DIRy(AggregativeProbabilisticQuantifier):
|
||||||
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
|
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
|
||||||
)
|
)
|
||||||
|
|
||||||
self.val_parameters = [dirichlet.mle(posteriors[y == cat], maxiter=DIRy.MAXITER) for cat in range(data.n_classes)]
|
self.val_parameters = []
|
||||||
|
for cat in range(data.n_classes):
|
||||||
|
dir_i = dirichlet.mle(posteriors[y == cat], maxiter=DIRy.MAXITER)
|
||||||
|
self.val_parameters.append(dir_i)
|
||||||
|
# print(cat)
|
||||||
|
# self.val_parameters = [dirichlet.mle(posteriors[y == cat], maxiter=DIRy.MAXITER) for cat in range(data.n_classes)]
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,82 @@
|
||||||
|
import numpy as np
|
||||||
|
from scipy.stats import norm, uniform, expon
|
||||||
|
from scipy.special import rel_entr
|
||||||
|
|
||||||
|
|
||||||
|
def kld_approx(p, q, resolution=100):
|
||||||
|
steps = np.linspace(-5, 5, resolution)
|
||||||
|
integral = 0
|
||||||
|
for step_i1, step_i2 in zip(steps[:-1], steps[1:]):
|
||||||
|
base = step_i2-step_i1
|
||||||
|
middle = (step_i1+step_i2)/2
|
||||||
|
pi = p.pdf(middle)
|
||||||
|
qi = q.pdf(middle)
|
||||||
|
integral += (base * pi*np.log(pi/qi))
|
||||||
|
return integral
|
||||||
|
|
||||||
|
|
||||||
|
def integrate(f, resolution=10000):
|
||||||
|
steps = np.linspace(-10, 10, resolution)
|
||||||
|
integral = 0
|
||||||
|
for step_i1, step_i2 in zip(steps[:-1], steps[1:]):
|
||||||
|
base = step_i2 - step_i1
|
||||||
|
middle = (step_i1 + step_i2) / 2
|
||||||
|
integral += (base * f(middle))
|
||||||
|
return integral
|
||||||
|
|
||||||
|
|
||||||
|
def kl_analytic(m1, s1, m2, s2):
|
||||||
|
return np.log(s2/s1)+ (s1*s1 + (m1-m2)**2) / (2*s2*s2) - 0.5
|
||||||
|
|
||||||
|
|
||||||
|
def montecarlo(p, q, trials=1000000):
|
||||||
|
xs = p.rvs(trials)
|
||||||
|
ps = p.pdf(xs)
|
||||||
|
qs = q.pdf(xs)
|
||||||
|
return np.mean(np.log(ps/qs))
|
||||||
|
|
||||||
|
|
||||||
|
def montecarlo2(p, q, trials=100000):
|
||||||
|
#r = norm(-3, scale=5)
|
||||||
|
r = uniform(-10, 20)
|
||||||
|
# r = expon(loc=-6)
|
||||||
|
#r = p
|
||||||
|
xs = r.rvs(trials)
|
||||||
|
rs = r.pdf(xs)
|
||||||
|
print(rs)
|
||||||
|
ps = p.pdf(xs)+0.0000001
|
||||||
|
qs = q.pdf(xs)+0.0000001
|
||||||
|
return np.mean((ps/rs)*np.log(ps/qs))
|
||||||
|
|
||||||
|
|
||||||
|
def wrong(p, q, trials=100000):
|
||||||
|
xs = np.random.uniform(-10,10, trials)
|
||||||
|
ps = p.pdf(xs)
|
||||||
|
qs = q.pdf(xs)
|
||||||
|
return np.mean(ps*np.log(ps/qs))
|
||||||
|
|
||||||
|
|
||||||
|
p = norm(loc=0, scale=1)
|
||||||
|
q = norm(loc=1, scale=1)
|
||||||
|
|
||||||
|
|
||||||
|
integral_approx = kld_approx(p, q)
|
||||||
|
analytic_solution = kl_analytic(p.kwds['loc'], p.kwds['scale'], q.kwds['loc'], q.kwds['scale'])
|
||||||
|
montecarlo_approx = montecarlo(p, q)
|
||||||
|
montecarlo_approx2 = montecarlo2(p, q)
|
||||||
|
wrong_approx = wrong(p, q)
|
||||||
|
|
||||||
|
print(f'{analytic_solution=:.10f}')
|
||||||
|
print()
|
||||||
|
print(f'{integral_approx=:.10f}')
|
||||||
|
print(f'integra error = {np.abs(analytic_solution-integral_approx):.10f}')
|
||||||
|
print()
|
||||||
|
print(f'{montecarlo_approx=:.10f}')
|
||||||
|
print(f'montecarlo error = {np.abs(analytic_solution-montecarlo_approx):.10f}')
|
||||||
|
print()
|
||||||
|
print(f'{montecarlo_approx2=:.10f}')
|
||||||
|
print(f'montecarlo2 error = {np.abs(analytic_solution-montecarlo_approx2):.10f}')
|
||||||
|
print()
|
||||||
|
print(f'{wrong_approx=:.10f}')
|
||||||
|
print(f'wrong error = {np.abs(analytic_solution-wrong_approx):.10f}')
|
||||||
|
|
|
@ -0,0 +1,113 @@
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.neighbors import KernelDensity
|
||||||
|
from scipy.stats import norm, multivariate_normal, Covariance
|
||||||
|
from sklearn.metrics.pairwise import rbf_kernel
|
||||||
|
from sklearn.neighbors import KDTree
|
||||||
|
|
||||||
|
# def CauchySchwarzDivGaussMix(pi_i, mu_i, Lambda_i, tau_i, nu_i, Omega_i):
|
||||||
|
# Lambda_i_inv =
|
||||||
|
|
||||||
|
|
||||||
|
# def z()
|
||||||
|
|
||||||
|
|
||||||
|
nD=2
|
||||||
|
mu = [[0.1,0.1], [0.2, 0.3], [1, 1]] # centers
|
||||||
|
bandwidth = 0.10 # bandwidth and scale (standard deviation)
|
||||||
|
standard_deviation = bandwidth
|
||||||
|
variance = standard_deviation**2
|
||||||
|
|
||||||
|
mus = np.asarray(mu).reshape(-1,nD)
|
||||||
|
kde = KernelDensity(bandwidth=bandwidth).fit(mus)
|
||||||
|
|
||||||
|
x = [0.5,0.2]
|
||||||
|
|
||||||
|
print('with scikit-learn KDE')
|
||||||
|
prob = np.exp(kde.score_samples(np.asarray(x).reshape(-1,nD)))
|
||||||
|
print(prob)
|
||||||
|
|
||||||
|
# univariate
|
||||||
|
# N = norm(loc=mu, scale=scale)
|
||||||
|
# prob = N.pdf(x)
|
||||||
|
# print(prob)
|
||||||
|
|
||||||
|
# multivariate
|
||||||
|
|
||||||
|
print('with scipy multivariate normal')
|
||||||
|
npoints = mus.shape[0]
|
||||||
|
probs = sum(multivariate_normal(mean=mu_i, cov=variance).pdf(x) for mu_i in mu)
|
||||||
|
probs /= npoints
|
||||||
|
print(probs)
|
||||||
|
|
||||||
|
print('with scikit learn rbf_kernel')
|
||||||
|
x = np.asarray(x).reshape(-1,nD)
|
||||||
|
gamma = 1/(2*variance)
|
||||||
|
gram = rbf_kernel(mus, x, gamma=gamma)
|
||||||
|
const = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD)))
|
||||||
|
gram *= const
|
||||||
|
print(gram)
|
||||||
|
print(np.sum(gram)/npoints)
|
||||||
|
|
||||||
|
print('with stackoverflow answer')
|
||||||
|
from scipy.spatial.distance import pdist, cdist, squareform
|
||||||
|
import scipy
|
||||||
|
# this is an NxD matrix, where N is number of items and D its dimensionalites
|
||||||
|
pairwise_sq_dists = cdist(mus, x, 'euclidean')
|
||||||
|
print(pairwise_sq_dists)
|
||||||
|
K = np.exp(-pairwise_sq_dists / variance)
|
||||||
|
print(K)
|
||||||
|
print(np.sum(K)/npoints)
|
||||||
|
|
||||||
|
print("trying with scipy multivariate on more than one instance")
|
||||||
|
probs = sum(multivariate_normal(mean=x.reshape(-nD), cov=variance).pdf(mus))
|
||||||
|
probs /= npoints
|
||||||
|
print(probs)
|
||||||
|
|
||||||
|
import sys
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
# N1 = multivariate_normal(mean=mu[0], cov=variance)
|
||||||
|
# prob1 = N1.pdf(x)
|
||||||
|
# N2 = multivariate_normal(mean=mu[1], cov=variance)
|
||||||
|
# prob2 = N2.pdf(x)
|
||||||
|
# print(prob1+prob2)
|
||||||
|
|
||||||
|
cov = Covariance.from_diagonal([variance, variance])
|
||||||
|
print(cov.covariance)
|
||||||
|
precision_matrix = np.asarray([[1/variance, 0],[0, 1/variance]])
|
||||||
|
print(Covariance.from_precision(precision_matrix).covariance)
|
||||||
|
print(np.linalg.inv(precision_matrix))
|
||||||
|
print(np.linalg.inv(cov.covariance))
|
||||||
|
|
||||||
|
print('-'*100)
|
||||||
|
|
||||||
|
nD=2
|
||||||
|
mu = np.asarray([[0.1,0.5]]) # centers
|
||||||
|
bandwidth = 0.10 # bandwidth and scale (standard deviation)
|
||||||
|
standard_deviation = bandwidth
|
||||||
|
variance = standard_deviation**2
|
||||||
|
|
||||||
|
mus = np.asarray([mu]).reshape(-1,nD)
|
||||||
|
kde = KernelDensity(bandwidth=bandwidth).fit(mus)
|
||||||
|
|
||||||
|
x = np.asarray([0.5,0.2])
|
||||||
|
|
||||||
|
prob = np.exp(kde.score_samples(np.asarray(x).reshape(-1,nD)))
|
||||||
|
print(prob)
|
||||||
|
|
||||||
|
probs = sum(multivariate_normal(mean=mu_i, cov=variance).pdf(x) for mu_i in mu) / len(mu)
|
||||||
|
probs = sum(multivariate_normal(mean=[0,0], cov=1).pdf((x-mu_i)/bandwidth) for mu_i in mu) / len(mu)
|
||||||
|
probs = sum(multivariate_normal(mean=[0,0], cov=variance).pdf(x-mu_i) for mu_i in mu) / len(mu)
|
||||||
|
print(probs)
|
||||||
|
# N1 = multivariate_normal(mean=mu[0], cov=variance)
|
||||||
|
# prob1 = N1.pdf(x)
|
||||||
|
# N2 = multivariate_normal(mean=mu[1], cov=variance)
|
||||||
|
# prob2 = N2.pdf(x)
|
||||||
|
# print(prob1+prob2)
|
||||||
|
|
||||||
|
h=0.1
|
||||||
|
D=4
|
||||||
|
|
||||||
|
print(np.sqrt((4**2) * (5**2)))
|
||||||
|
print(4*5)
|
||||||
|
|
|
@ -0,0 +1,153 @@
|
||||||
|
import numpy as np
|
||||||
|
from scipy.stats import multivariate_normal
|
||||||
|
from scipy import optimize
|
||||||
|
|
||||||
|
|
||||||
|
def cauchy_schwarz_divergence_kde(L:list, Xte:np.ndarray, bandwidth:float, alpha:np.ndarray):
|
||||||
|
"""
|
||||||
|
|
||||||
|
:param L: a list of np.ndarray (instances x dimensions) with the Li being the instances of class i
|
||||||
|
:param Xte: an np.ndarray (instances x dimensions)
|
||||||
|
:param bandwidth: the bandwidth of the kernel
|
||||||
|
:param alpha: the mixture parameter
|
||||||
|
:return: the Cauchy-Schwarz divergence between the validation KDE mixture distribution (with mixture paramerter
|
||||||
|
alpha) and the test KDE distribution
|
||||||
|
"""
|
||||||
|
|
||||||
|
n = len(L) # number of classes
|
||||||
|
K, D = Xte.shape # number of test instances, and number of dimensions
|
||||||
|
Kinv = 1/K
|
||||||
|
|
||||||
|
# the lengths of each block
|
||||||
|
l = np.asarray([len(Li) for Li in L])
|
||||||
|
|
||||||
|
# contains the a_i / l_i
|
||||||
|
alpha_r = alpha / l
|
||||||
|
alpha2_r_sum = np.sum(alpha * alpha_r) # contains the sum_i a_i**2 / l_i
|
||||||
|
|
||||||
|
h = bandwidth
|
||||||
|
|
||||||
|
# we will only use the bandwidth (h) between two gaussians with covariance matrix a "scalar matrix" h**2
|
||||||
|
cov_mix_scalar = 2*h*h # corresponds to a bandwidth of sqrt(2)*h
|
||||||
|
|
||||||
|
# constant
|
||||||
|
C = ((2*np.pi)**(-D/2))*h**(-D)
|
||||||
|
|
||||||
|
Kernel = multivariate_normal(mean=np.zeros(D), cov=cov_mix_scalar)
|
||||||
|
K0 = Kernel.pdf(np.zeros(D))
|
||||||
|
|
||||||
|
|
||||||
|
def compute_block_E():
|
||||||
|
kernel_block_E = []
|
||||||
|
for i,Li in enumerate(L):
|
||||||
|
acc = 0
|
||||||
|
for x_ji in Li: #optimize...
|
||||||
|
for x_k in Xte: #optimize...
|
||||||
|
acc += Kernel.pdf(x_ji - x_k) #optimize...
|
||||||
|
kernel_block_E.append(acc)
|
||||||
|
return np.asarray(kernel_block_E)
|
||||||
|
|
||||||
|
|
||||||
|
def compute_block_F_hash():
|
||||||
|
# this can be computed entirely at training time
|
||||||
|
Khash = {}
|
||||||
|
for a in range(n):
|
||||||
|
for b in range(l[a]):
|
||||||
|
for i in range(n):
|
||||||
|
for j in range(l[i]): # this for, and index j, can be supressed and store the sum across j
|
||||||
|
Khash[(a,b,i,j)] = Kernel.pdf(L[i][j]-L[a][b])
|
||||||
|
return Khash
|
||||||
|
|
||||||
|
|
||||||
|
def compute_block_Ktest():
|
||||||
|
# this can be optimized in several ways, starting by computing only the lower diagonal triangle... and remove
|
||||||
|
# then the K0 which is not needed after that
|
||||||
|
acc = 0
|
||||||
|
for x_i in Xte:
|
||||||
|
for x_j in Xte:
|
||||||
|
acc += Kernel.pdf(x_i-x_j)
|
||||||
|
return acc
|
||||||
|
|
||||||
|
|
||||||
|
def compute_block_F():
|
||||||
|
F = 0
|
||||||
|
for a in range(n):
|
||||||
|
tmp_b = 0
|
||||||
|
for b in range(l[a]):
|
||||||
|
tmp_i = 0
|
||||||
|
for i in range(n):
|
||||||
|
tmp_j = 0
|
||||||
|
for j in range(l[i]):
|
||||||
|
tmp_j += Fh[(a, b, i, j)]
|
||||||
|
tmp_i += (alpha_r[i] * tmp_j)
|
||||||
|
tmp_b += tmp_i
|
||||||
|
F += (alpha_r[a] * tmp_b)
|
||||||
|
return F
|
||||||
|
|
||||||
|
|
||||||
|
E = compute_block_E()
|
||||||
|
Fh = compute_block_F_hash()
|
||||||
|
# Ktest = compute_block_Ktest()
|
||||||
|
F = compute_block_F()
|
||||||
|
|
||||||
|
C1 = K*Kinv*Kinv*C
|
||||||
|
C2 = 2 * np.sum([Kernel.pdf(Xte[k]-Xte[k_p]) for k in range(K) for k_p in range(k)])
|
||||||
|
|
||||||
|
partA = -np.log(Kinv * (alpha_r @ E))
|
||||||
|
partB = 0.5*np.log(C*alpha2_r_sum + F - (K0*alpha2_r_sum))
|
||||||
|
# partC = 0.5*np.log(Kinv) + 0.5*np.log(C + Kinv*Ktest - K0)
|
||||||
|
partC = 0.5*np.log(C1+C2)
|
||||||
|
|
||||||
|
Dcs = partA + partB + partC
|
||||||
|
|
||||||
|
return Dcs
|
||||||
|
|
||||||
|
|
||||||
|
L = [
|
||||||
|
np.asarray([
|
||||||
|
[-1,-1,-1]
|
||||||
|
]),
|
||||||
|
np.asarray([
|
||||||
|
[0,0,0],
|
||||||
|
]),
|
||||||
|
np.asarray([
|
||||||
|
[0,0,0.1],
|
||||||
|
[1,1,1],
|
||||||
|
[3,3,1],
|
||||||
|
]),
|
||||||
|
np.asarray([
|
||||||
|
[1,0,0]
|
||||||
|
]),
|
||||||
|
np.asarray([
|
||||||
|
[0,1,0]
|
||||||
|
])
|
||||||
|
]
|
||||||
|
Xte = np.asarray(
|
||||||
|
[[0,0,0],
|
||||||
|
[0,0,0],
|
||||||
|
[1,0,0],
|
||||||
|
[0,1,0]]
|
||||||
|
)
|
||||||
|
bandwidth=0.01
|
||||||
|
alpha=np.asarray([0, 2/4, 0, 1/4, 1/4])
|
||||||
|
|
||||||
|
div = cauchy_schwarz_divergence_kde(L, Xte, bandwidth, alpha)
|
||||||
|
print(div)
|
||||||
|
|
||||||
|
def divergence(alpha):
|
||||||
|
return cauchy_schwarz_divergence_kde(L, Xte, bandwidth, alpha)
|
||||||
|
|
||||||
|
|
||||||
|
# the initial point is set as the uniform distribution
|
||||||
|
n_classes = len(L)
|
||||||
|
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
|
||||||
|
|
||||||
|
# solutions are bounded to those contained in the unit-simplex
|
||||||
|
bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1]
|
||||||
|
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
|
||||||
|
#print('searching for alpha')
|
||||||
|
r = optimize.minimize(divergence, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
|
||||||
|
sol = r.x
|
||||||
|
for x in sol:
|
||||||
|
print(f'{x:.4f}')
|
||||||
|
print(cauchy_schwarz_divergence_kde(L, Xte, bandwidth, sol))
|
|
@ -0,0 +1,17 @@
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
dataset = 'lequa/T1A'
|
||||||
|
for metric in ['mae', 'mrae']:
|
||||||
|
method1 = 'KDEy-closed++'
|
||||||
|
# method2 = 'KDEy-DMhd3+'
|
||||||
|
# method1 = 'KDEy-ML'
|
||||||
|
# method2 = 'KDEy-ML+'
|
||||||
|
|
||||||
|
path = f'../results/{dataset}/{metric}/{method1}.hyper.pkl'
|
||||||
|
obj = pickle.load(open(path, 'rb'))
|
||||||
|
print(method1, obj)
|
||||||
|
|
||||||
|
# path = f'../results/{dataset}/{metric}/{method2}.hyper.pkl'
|
||||||
|
# obj = pickle.load(open(path, 'rb'))
|
||||||
|
# print(method2, obj)
|
||||||
|
|
|
@ -0,0 +1,27 @@
|
||||||
|
from time import time
|
||||||
|
from sklearn.metrics.pairwise import rbf_kernel
|
||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def rbf(X, Y):
|
||||||
|
return X @ Y.T
|
||||||
|
|
||||||
|
|
||||||
|
@torch.compile
|
||||||
|
def rbf_comp(X, Y):
|
||||||
|
return X @ Y.T
|
||||||
|
|
||||||
|
|
||||||
|
def measure_time(X, Y, func):
|
||||||
|
tinit = time()
|
||||||
|
func(X, Y)
|
||||||
|
tend = time()
|
||||||
|
print(f'took {tend-tinit:.3}s')
|
||||||
|
|
||||||
|
|
||||||
|
X = np.random.rand(1000, 100)
|
||||||
|
Y = np.random.rand(1000, 100)
|
||||||
|
|
||||||
|
measure_time(X, Y, rbf)
|
||||||
|
measure_time(X, Y, rbf_comp)
|
|
@ -0,0 +1,20 @@
|
||||||
|
import pickle
|
||||||
|
import os
|
||||||
|
|
||||||
|
import quapy as qp
|
||||||
|
from quapy.model_selection import GridSearchQ
|
||||||
|
from quapy.protocol import UPP
|
||||||
|
|
||||||
|
|
||||||
|
toprint = []
|
||||||
|
for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
|
||||||
|
|
||||||
|
data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
|
||||||
|
|
||||||
|
# model selection
|
||||||
|
train, test = data.train_test
|
||||||
|
toprint.append(f'{dataset}\t{len(train)}\t{len(test)}\t{data.n_classes}')
|
||||||
|
|
||||||
|
print()
|
||||||
|
for pr in toprint:
|
||||||
|
print(pr)
|
|
@ -0,0 +1,6 @@
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
report = pd.read_csv('../results_lequa_mrae/KDEy-MLE.dataframe')
|
||||||
|
|
||||||
|
means = report.mean()
|
||||||
|
print(f'KDEy-MLE\tLeQua-T1B\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
|
|
@ -0,0 +1,155 @@
|
||||||
|
import numpy as np
|
||||||
|
from scipy.stats import norm
|
||||||
|
|
||||||
|
|
||||||
|
EPS=1e-8
|
||||||
|
LARGE_NUM=1000000
|
||||||
|
TRIALS=LARGE_NUM
|
||||||
|
|
||||||
|
# calculamos: D(p||q)
|
||||||
|
# se asume:
|
||||||
|
# q = distribución de referencia (en nuestro caso: el mixture de KDEs en training)
|
||||||
|
# p = distribución (en nuestro caso: KDE del test)
|
||||||
|
# N = TRIALS el número de trials para montecarlo
|
||||||
|
# n = numero de clases, en este ejemplo n=3
|
||||||
|
|
||||||
|
|
||||||
|
# ejemplo de f-generator function: Squared Hellinger Distance
|
||||||
|
def hd2(u):
|
||||||
|
v = (np.sqrt(u)-1)
|
||||||
|
return v*v
|
||||||
|
|
||||||
|
|
||||||
|
# esta funcion estima la divergencia entre dos mixtures (univariates)
|
||||||
|
def integrate(p, q, f=hd2, resolution=TRIALS, epsilon=EPS):
|
||||||
|
xs = np.linspace(-50, 50, resolution)
|
||||||
|
ps = p.pdf(xs)+epsilon
|
||||||
|
qs = q.pdf(xs)+epsilon
|
||||||
|
rs = ps/qs
|
||||||
|
base = xs[1]-xs[0]
|
||||||
|
return np.sum(base * f(rs) * qs)
|
||||||
|
|
||||||
|
|
||||||
|
# esta es la aproximación de montecarlo según está en el artículo
|
||||||
|
# es decir, sampleando la q, y haciendo la media (1/N). En el artículo
|
||||||
|
# en realidad se "presamplean" N por cada clase y luego se eligen los que
|
||||||
|
# corresponda a cada clase. Aquí directamente sampleamos solo los
|
||||||
|
# que correspondan.
|
||||||
|
def montecarlo(p, q, f=hd2, trials=TRIALS, epsilon=EPS):
|
||||||
|
xs = q.rvs(trials)
|
||||||
|
ps = p.pdf(xs)+epsilon
|
||||||
|
qs = q.pdf(xs)+epsilon
|
||||||
|
N = trials
|
||||||
|
return (1/N)*np.sum(f(ps/qs))
|
||||||
|
|
||||||
|
|
||||||
|
# esta es la aproximación de montecarlo asumiendo que vayamos con todos los N*n
|
||||||
|
# puntos (es decir, si hubieramos sampleado de una "q" en la que todas las clases
|
||||||
|
# eran igualmente probables, 1/3) y luego reescalamos los pesos con "importance
|
||||||
|
# weighting" <-- esto no está en el artículo, pero me gusta más que lo que hay
|
||||||
|
# porque desaparece el paso de "seleccionar los que corresponden por clase"
|
||||||
|
def montecarlo_importancesampling(p, q, r, f=hd2, trials=TRIALS, epsilon=EPS):
|
||||||
|
xs = r.rvs(trials) # <- distribución de referencia (aquí: class weight uniforme)
|
||||||
|
rs = r.pdf(xs)
|
||||||
|
ps = p.pdf(xs)+epsilon
|
||||||
|
qs = q.pdf(xs)+epsilon
|
||||||
|
N = trials
|
||||||
|
importance_weight = qs/rs
|
||||||
|
return (1/N)*np.sum(f(ps/qs)*(importance_weight))
|
||||||
|
|
||||||
|
|
||||||
|
# he intentado implementar la variante que propne Juanjo pero creo que no la he
|
||||||
|
# entendido bien. Tal vez haya que reescalar el peso por 1/3 o algo así, pero no
|
||||||
|
# doy con la tecla...
|
||||||
|
def montecarlo_classweight(p, q, f=hd2, trials=TRIALS, epsilon=EPS):
|
||||||
|
xs_1 = q.rvs_fromclass(0, trials)
|
||||||
|
xs_2 = q.rvs_fromclass(1, trials)
|
||||||
|
xs_3 = q.rvs_fromclass(2, trials)
|
||||||
|
xs = np.concatenate([xs_1, xs_2, xs_3])
|
||||||
|
weights = np.asarray([[alpha_i]*trials for alpha_i in q.alphas]).flatten()
|
||||||
|
ps = p.pdf(xs)+epsilon
|
||||||
|
qs = q.pdf(xs)+epsilon
|
||||||
|
N = trials
|
||||||
|
n = q.n
|
||||||
|
return (1/(N*n))*np.sum(weights*f(ps/qs))
|
||||||
|
|
||||||
|
|
||||||
|
class Q:
|
||||||
|
"""
|
||||||
|
Esta clase es un mixture de gausianas.
|
||||||
|
|
||||||
|
:param locs: medias, una por clase
|
||||||
|
:param scales: stds, una por clase
|
||||||
|
:param alphas: peso, uno por clase
|
||||||
|
"""
|
||||||
|
def __init__(self, locs, scales, alphas):
|
||||||
|
self.qs = []
|
||||||
|
for loc, scale in zip(locs, scales):
|
||||||
|
self.qs.append(norm(loc=loc, scale=scale))
|
||||||
|
assert np.isclose(np.sum(alphas), 1), 'alphas do not sum up to 1!'
|
||||||
|
self.alphas = np.asarray(alphas) / np.sum(alphas)
|
||||||
|
|
||||||
|
def pdf(self, xs):
|
||||||
|
q_xs = np.vstack([
|
||||||
|
q_i.pdf(xs) * alpha_i for q_i, alpha_i in zip(self.qs, self.alphas)
|
||||||
|
])
|
||||||
|
v = q_xs.sum(axis=0)
|
||||||
|
if len(v)==1:
|
||||||
|
v = v[0]
|
||||||
|
return v
|
||||||
|
|
||||||
|
@property
|
||||||
|
def n(self):
|
||||||
|
return len(self.alphas)
|
||||||
|
|
||||||
|
def rvs_fromclass(self, inclass, trials):
|
||||||
|
return self.qs[inclass].rvs(trials)
|
||||||
|
|
||||||
|
def rvs(self, trials):
|
||||||
|
variates = []
|
||||||
|
added = 0
|
||||||
|
for i, (q_i, alpha_i) in enumerate(zip(self.qs, self.alphas)):
|
||||||
|
trials_i = int(trials*alpha_i) if (i < self.n-1) else trials-added
|
||||||
|
variates_i = q_i.rvs(trials_i)
|
||||||
|
variates.append(variates_i)
|
||||||
|
added += len(variates_i)
|
||||||
|
|
||||||
|
return np.concatenate(variates)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def locs(self):
|
||||||
|
return np.asarray([x.kwds['loc'] for x in self.qs])
|
||||||
|
|
||||||
|
@property
|
||||||
|
def scales(self):
|
||||||
|
return np.asarray([x.kwds['scale'] for x in self.qs])
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def change_priors(cls, q, alphas):
|
||||||
|
assert len(alphas)==len(q.alphas)
|
||||||
|
return Q(locs=q.locs, scales=q.scales, alphas=alphas)
|
||||||
|
|
||||||
|
# distribucion de test
|
||||||
|
p = Q(locs=[1, 1.5], scales=[1,1], alphas=[0.8, 0.2])
|
||||||
|
|
||||||
|
# distribucion de training (mixture por clase)
|
||||||
|
q = Q(locs=[0, 0.5, 1.2], scales=[1,1,1.5], alphas=[0.1, 0.2, 0.7])
|
||||||
|
|
||||||
|
# distribucion de referencia (mixture copia de q, con pesos de clase uniformes)
|
||||||
|
r = Q.change_priors(q, alphas=[1/3, 1/3, 1/3])
|
||||||
|
|
||||||
|
|
||||||
|
integral_approx = integrate(p, q)
|
||||||
|
montecarlo_approx = montecarlo(p, q)
|
||||||
|
montecarlo_approx2 = montecarlo_importancesampling(p, q, r)
|
||||||
|
montecarlo_approx3 = montecarlo_classweight(p, q)
|
||||||
|
|
||||||
|
print(f'{integral_approx=:.10f}')
|
||||||
|
print()
|
||||||
|
print(f'{montecarlo_approx=:.10f}, error={np.abs(integral_approx-montecarlo_approx):.10f}')
|
||||||
|
print()
|
||||||
|
print(f'{montecarlo_approx2=:.10f}, error={np.abs(integral_approx-montecarlo_approx2):.10f}')
|
||||||
|
print()
|
||||||
|
print(f'{montecarlo_approx3=:.10f}, error={np.abs(integral_approx-montecarlo_approx3):.10f}')
|
||||||
|
|
||||||
|
|
|
@ -1,59 +0,0 @@
|
||||||
import pickle
|
|
||||||
import numpy as np
|
|
||||||
from sklearn.linear_model import LogisticRegression
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
import quapy as qp
|
|
||||||
from quapy.method.aggregative import EMQ, DistributionMatching, PACC, ACC, CC, PCC, HDy, OneVsAllAggregative
|
|
||||||
from method_kdey import KDEy
|
|
||||||
from method_dirichlety import DIRy
|
|
||||||
from quapy.model_selection import GridSearchQ
|
|
||||||
from quapy.protocol import UPP
|
|
||||||
|
|
||||||
SEED=1
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
|
|
||||||
qp.environ['SAMPLE_SIZE'] = 100
|
|
||||||
qp.environ['N_JOBS'] = -1
|
|
||||||
n_bags_val = 250
|
|
||||||
n_bags_test = 1000
|
|
||||||
result_dir = f'results_tweet_sensibility'
|
|
||||||
|
|
||||||
os.makedirs(result_dir, exist_ok=True)
|
|
||||||
|
|
||||||
method = 'KDEy-MLE'
|
|
||||||
|
|
||||||
global_result_path = f'{result_dir}/{method}'
|
|
||||||
|
|
||||||
if not os.path.exists(global_result_path+'.csv'):
|
|
||||||
with open(global_result_path+'.csv', 'wt') as csv:
|
|
||||||
csv.write(f'Method\tDataset\tBandwidth\tMAE\tMRAE\tKLD\n')
|
|
||||||
|
|
||||||
with open(global_result_path+'.csv', 'at') as csv:
|
|
||||||
for bandwidth in np.linspace(0.01, 0.2, 20):
|
|
||||||
for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
|
|
||||||
print('init', dataset)
|
|
||||||
|
|
||||||
local_result_path = global_result_path + '_' + dataset + f'_{bandwidth:.3f}'
|
|
||||||
|
|
||||||
with qp.util.temp_seed(SEED):
|
|
||||||
|
|
||||||
data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=False)
|
|
||||||
quantifier = KDEy(LogisticRegression(), target='max_likelihood', val_split=10, bandwidth=bandwidth)
|
|
||||||
quantifier.fit(data.training)
|
|
||||||
protocol = UPP(data.test, repeats=n_bags_test)
|
|
||||||
report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
|
|
||||||
report.to_csv(f'{local_result_path}.dataframe')
|
|
||||||
means = report.mean()
|
|
||||||
csv.write(f'{method}\t{data.name}\t{bandwidth}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
|
|
||||||
csv.flush()
|
|
||||||
|
|
||||||
df = pd.read_csv(global_result_path+'.csv', sep='\t')
|
|
||||||
|
|
||||||
pd.set_option('display.max_columns', None)
|
|
||||||
pd.set_option('display.max_rows', None)
|
|
||||||
pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"])
|
|
||||||
print(pv)
|
|
|
@ -24,6 +24,7 @@ if __name__ == '__main__':
|
||||||
for method in METHODS:
|
for method in METHODS:
|
||||||
|
|
||||||
print('Init method', method)
|
print('Init method', method)
|
||||||
|
if method == 'EMQ-C': continue
|
||||||
|
|
||||||
global_result_path = f'{result_dir}/{method}'
|
global_result_path = f'{result_dir}/{method}'
|
||||||
|
|
||||||
|
@ -36,7 +37,7 @@ if __name__ == '__main__':
|
||||||
# this variable controls that the mod sel has already been done, and skip this otherwise
|
# this variable controls that the mod sel has already been done, and skip this otherwise
|
||||||
semeval_trained = False
|
semeval_trained = False
|
||||||
|
|
||||||
for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST[::-1]:
|
for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
|
||||||
print('init', dataset)
|
print('init', dataset)
|
||||||
|
|
||||||
local_result_path = global_result_path + '_' + dataset
|
local_result_path = global_result_path + '_' + dataset
|
||||||
|
|
|
@ -69,8 +69,8 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
quantifier = modsel.best_model()
|
quantifier = modsel.best_model()
|
||||||
except:
|
except:
|
||||||
print('something went wrong... reporting CC')
|
print('something went wrong... trying to fit the default model')
|
||||||
quantifier = qp.method.aggregative.CC(LR()).fit(train)
|
quantifier.fit(train)
|
||||||
|
|
||||||
protocol = UPP(test, repeats=n_bags_test)
|
protocol = UPP(test, repeats=n_bags_test)
|
||||||
report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'],
|
report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'],
|
||||||
|
|
|
@ -1,5 +1,8 @@
|
||||||
import pickle
|
import pickle
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
|
||||||
from distribution_matching.commons import METHODS, new_method, show_results
|
from distribution_matching.commons import METHODS, new_method, show_results
|
||||||
|
|
||||||
import quapy as qp
|
import quapy as qp
|
||||||
|
@ -22,9 +25,6 @@ if __name__ == '__main__':
|
||||||
os.makedirs(result_dir, exist_ok=True)
|
os.makedirs(result_dir, exist_ok=True)
|
||||||
|
|
||||||
for method in METHODS:
|
for method in METHODS:
|
||||||
if method == 'HDy-OvA': continue
|
|
||||||
if method == 'DIR': continue
|
|
||||||
if method != 'KDEy-ML': continue
|
|
||||||
|
|
||||||
print('Init method', method)
|
print('Init method', method)
|
||||||
|
|
||||||
|
@ -71,12 +71,14 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
quantifier = modsel.best_model()
|
quantifier = modsel.best_model()
|
||||||
except:
|
except:
|
||||||
print('something went wrong... reporting CC')
|
print('something went wrong... trying to fit the default model')
|
||||||
quantifier = qp.method.aggregative.CC(LR()).fit(train)
|
quantifier.fit(train)
|
||||||
|
# quantifier = qp.method.aggregative.CC(LogisticRegression()).fit(train)
|
||||||
|
|
||||||
|
|
||||||
protocol = UPP(test, repeats=n_bags_test)
|
protocol = UPP(test, repeats=n_bags_test)
|
||||||
report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'],
|
report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'],
|
||||||
verbose=True)
|
verbose=True, n_jobs=-1)
|
||||||
report.to_csv(f'{local_result_path}.dataframe')
|
report.to_csv(f'{local_result_path}.dataframe')
|
||||||
means = report.mean()
|
means = report.mean()
|
||||||
csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
|
csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
|
||||||
|
|
|
@ -79,7 +79,7 @@ if __name__ == '__main__':
|
||||||
repeats = 10
|
repeats = 10
|
||||||
error = 'mae'
|
error = 'mae'
|
||||||
|
|
||||||
div = 'HD'
|
div = 'topsoe'
|
||||||
|
|
||||||
# generates tuples (dataset, method, method_name)
|
# generates tuples (dataset, method, method_name)
|
||||||
# (the dataset is needed for methods that process the dataset differently)
|
# (the dataset is needed for methods that process the dataset differently)
|
||||||
|
|
|
@ -9,8 +9,6 @@ Change Log 0.1.8
|
||||||
- qp.evaluation now runs in parallel <improve, remove or fix the ongoing error, put at the qp. level instead of
|
- qp.evaluation now runs in parallel <improve, remove or fix the ongoing error, put at the qp. level instead of
|
||||||
qp.evaluation because I don't like the qp.evaluation.evaluate thing>
|
qp.evaluation because I don't like the qp.evaluation.evaluate thing>
|
||||||
|
|
||||||
- <fix> remove dependencies with LabelledCollection in the library.
|
|
||||||
|
|
||||||
|
|
||||||
Change Log 0.1.7
|
Change Log 0.1.7
|
||||||
----------------
|
----------------
|
||||||
|
|
|
@ -59,7 +59,7 @@ class RecalibratedProbabilisticClassifierBase(BaseEstimator, RecalibratedProbabi
|
||||||
elif isinstance(k, float):
|
elif isinstance(k, float):
|
||||||
if not (0 < k < 1):
|
if not (0 < k < 1):
|
||||||
raise ValueError('wrong value for val_split: the proportion of validation documents must be in (0,1)')
|
raise ValueError('wrong value for val_split: the proportion of validation documents must be in (0,1)')
|
||||||
return self.fit_cv(X, y)
|
return self.fit_tr_val(X, y)
|
||||||
|
|
||||||
def fit_cv(self, X, y):
|
def fit_cv(self, X, y):
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -90,11 +90,32 @@ def TopsoeDistance(P, Q, epsilon=1e-20):
|
||||||
|
|
||||||
:param P: real-valued array-like of shape `(k,)` representing a discrete distribution
|
:param P: real-valued array-like of shape `(k,)` representing a discrete distribution
|
||||||
:param Q: real-valued array-like of shape `(k,)` representing a discrete distribution
|
:param Q: real-valued array-like of shape `(k,)` representing a discrete distribution
|
||||||
|
:param epsilon: small value to smooth the distributions for numerical stability
|
||||||
:return: float
|
:return: float
|
||||||
"""
|
"""
|
||||||
return np.sum(P*np.log((2*P+epsilon)/(P+Q+epsilon)) + Q*np.log((2*Q+epsilon)/(P+Q+epsilon)))
|
return np.sum(P*np.log((2*P+epsilon)/(P+Q+epsilon)) + Q*np.log((2*Q+epsilon)/(P+Q+epsilon)))
|
||||||
|
|
||||||
|
|
||||||
|
def CauchySchwarz(P, Q, epsilon=1e-20):
|
||||||
|
"""
|
||||||
|
Cauchy-Schwarz divergence between two (discretized) distributions `P` and `Q`.
|
||||||
|
The Cauchy-Schwarz divergence for two discrete distributions of `k` bins is defined as:
|
||||||
|
|
||||||
|
.. math::
|
||||||
|
CS(P,Q) = \\frac{ \\sum_{i=1}^k p_i q_i }{
|
||||||
|
\\left( \\sum_{i=1}^k p^2_i \\right) \\left( \\sum_{i=1}^k q^2_i \\right)
|
||||||
|
}
|
||||||
|
|
||||||
|
:param P: real-valued array-like of shape `(k,)` representing a discrete distribution
|
||||||
|
:param Q: real-valued array-like of shape `(k,)` representing a discrete distribution
|
||||||
|
:param epsilon: small value to smooth the distributions for numerical stability
|
||||||
|
:return: float
|
||||||
|
"""
|
||||||
|
P += epsilon
|
||||||
|
Q += epsilon
|
||||||
|
return - np.log(sum(P * Q) / np.sqrt(sum(P ** 2) * sum(Q ** 2)))
|
||||||
|
|
||||||
|
|
||||||
def uniform_prevalence_sampling(n_classes, size=1):
|
def uniform_prevalence_sampling(n_classes, size=1):
|
||||||
"""
|
"""
|
||||||
Implements the `Kraemer algorithm <http://www.cs.cmu.edu/~nasmith/papers/smith+tromble.tr04.pdf>`_
|
Implements the `Kraemer algorithm <http://www.cs.cmu.edu/~nasmith/papers/smith+tromble.tr04.pdf>`_
|
||||||
|
@ -276,3 +297,4 @@ def check_prevalence_vector(p, raise_exception=False, toleranze=1e-08):
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -604,10 +604,13 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
|
||||||
|
|
||||||
def _get_divergence(divergence: Union[str, Callable]):
|
def _get_divergence(divergence: Union[str, Callable]):
|
||||||
if isinstance(divergence, str):
|
if isinstance(divergence, str):
|
||||||
if divergence=='HD':
|
divergence = divergence.lower()
|
||||||
|
if divergence=='hd':
|
||||||
return F.HellingerDistance
|
return F.HellingerDistance
|
||||||
elif divergence=='topsoe':
|
elif divergence=='topsoe':
|
||||||
return F.TopsoeDistance
|
return F.TopsoeDistance
|
||||||
|
elif divergence.lower()=='cs':
|
||||||
|
return F.CauchySchwarz
|
||||||
elif divergence.lower()=='l2':
|
elif divergence.lower()=='l2':
|
||||||
return lambda a,b: np.linalg.norm(a-b)
|
return lambda a,b: np.linalg.norm(a-b)
|
||||||
elif divergence.lower()=='l1':
|
elif divergence.lower()=='l1':
|
||||||
|
|
Loading…
Reference in New Issue